From 409001948d9f221c94a61c3ee96de112755fc04d Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 22 Oct 2008 05:53:45 +0000
Subject: powerpc: Update page-in counter for CMM

A new field has been added to the VPA as a method for the client OS to
communicate to firmware the number of page-ins it is performing when
running collaborative memory overcommit.  The hypervisor will use this
information to better determine if a partition is experiencing memory
pressure and needs more memory allocated to it.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/fault.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 565b7a237c8..b18bc0f023c 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -30,6 +30,7 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 
+#include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -318,9 +319,16 @@ good_area:
 			goto do_sigbus;
 		BUG();
 	}
-	if (ret & VM_FAULT_MAJOR)
+	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-	else
+#ifdef CONFIG_PPC_SMLPAR
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			preempt_disable();
+			get_lppaca()->page_ins++;
+			preempt_enable();
+		}
+#endif
+	} else
 		current->min_flt++;
 	up_read(&mm->mmap_sem);
 	return 0;
-- 
cgit v1.2.3


From 7d4320f3d5ace5758111f2beac931376737f80f5 Mon Sep 17 00:00:00 2001
From: Jon Tollefson <kniht@linux.vnet.ibm.com>
Date: Thu, 30 Oct 2008 12:03:57 +0000
Subject: powerpc: Hugetlb pgtable cache access cleanup

Andrew Morton suggested that using a macro that makes an array
reference look like a function call makes it harder to understand the
code.

This therefore removes the huge_pgtable_cache(psize) macro and
replaces its uses with pgtable_cache[HUGE_PGTABLE_INDEX(psize)].

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/hugetlbpage.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a117024ab8c..c2231358adb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -53,8 +53,7 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 /* Subtract one from array size because we don't need a cache for 4K since
  * is not a huge page size */
-#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
-							+ psize-1])
+#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
 #define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
 
 static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
@@ -113,7 +112,7 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
+	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
 				      GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
@@ -121,7 +120,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(huge_pgtable_cache(psize), new);
+		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -760,13 +759,14 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			huge_pgtable_cache(psize) = kmem_cache_create(
-						HUGEPTE_CACHE_NAME(psize),
-						HUGEPTE_TABLE_SIZE(psize),
-						HUGEPTE_TABLE_SIZE(psize),
-						0,
-						NULL);
-			if (!huge_pgtable_cache(psize))
+			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
+				kmem_cache_create(
+					HUGEPTE_CACHE_NAME(psize),
+					HUGEPTE_TABLE_SIZE(psize),
+					HUGEPTE_TABLE_SIZE(psize),
+					0,
+					NULL);
+			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
 				panic("hugetlbpage_init(): could not create %s"\
 				      "\n", HUGEPTE_CACHE_NAME(psize));
 		}
-- 
cgit v1.2.3


From a6326e98a28d8a57f693369c82559543c6950f09 Mon Sep 17 00:00:00 2001
From: Robert Jennings <rcj@linux.vnet.ibm.com>
Date: Fri, 14 Nov 2008 12:07:34 +0000
Subject: powerpc: Correct page-in counter for CMM with 64k pages

Linux will report the number of page-ins so that the hypervisor can
better determine partition memory pressure.  The hardware page size
and the OS page size can be different.  In the case where the hardware
page size is 4k and the OS is running with 64k pages the code in
commit 409001948d9f221c94a61c3ee96de112755fc04d ("powerpc: Update
page-in counter for CMM") would under-report the number of pages.

This corrects the reporting to the hypervisor by incrementing the
page_in count by 1 << PAGE_FACTOR each time.

Reported-by: Andrew Theurer <habanero@linux.vnet.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b18bc0f023c..7df0409107a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -324,7 +324,7 @@ good_area:
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
-			get_lppaca()->page_ins++;
+			get_lppaca()->page_ins += (1 << PAGE_FACTOR);
 			preempt_enable();
 		}
 #endif
-- 
cgit v1.2.3


From f4f3a1261ad70988ad45614ebc87e553143a332b Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Wed, 19 Nov 2008 05:53:04 +0000
Subject: powerpc: hash_page_sync should only be used on SMP & STD_MMU_32

Clean up the ifdefs so we only use hash_page_sync if we have
CONFIG_SMP && CONFIG_PPC_STD_MMU_32.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/pgtable_32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c31d6d26f0b..44fbc81c9b2 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -48,7 +48,7 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[];
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
 extern void hash_page_sync(void);
 #endif
 
@@ -127,7 +127,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
 	hash_page_sync();
 #endif
 	free_page((unsigned long)pte);
@@ -135,7 +135,7 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
 	hash_page_sync();
 #endif
 	pgtable_page_dtor(ptepage);
-- 
cgit v1.2.3


From 0186f47e703fb7aa14b54459d642ef5374b3a685 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Wed, 19 Nov 2008 12:50:04 +0000
Subject: powerpc: Use RCU based pte freeing mechanism for all powerpc

Refactor the RCU based pte free code that was used on ppc64 to be used
on all powerpc.

Additionally refactor pte_free() & pte_free_kernel() into common code
between ppc32 & ppc64.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/Makefile      |   2 +-
 arch/powerpc/mm/hash_low_32.S |  30 -----------
 arch/powerpc/mm/pgtable.c     | 117 ++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/pgtable_32.c  |  21 --------
 arch/powerpc/mm/tlb_64.c      |  86 -------------------------------
 5 files changed, 118 insertions(+), 138 deletions(-)
 create mode 100644 arch/powerpc/mm/pgtable.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index e7392b45a5e..86e657bcfa7 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,7 +6,7 @@ ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS	+= -mno-minimal-toc
 endif
 
-obj-y				:= fault.o mem.o \
+obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o \
 				   mmu_context_$(CONFIG_WORD_SIZE).o
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 7bffb70b9fe..c5536b8b37a 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -35,36 +35,6 @@ mmu_hash_lock:
 	.space	4
 #endif /* CONFIG_SMP */
 
-/*
- * Sync CPUs with hash_page taking & releasing the hash
- * table lock
- */
-#ifdef CONFIG_SMP
-	.text
-_GLOBAL(hash_page_sync)
-	mfmsr   r10
-	rlwinm  r0,r10,0,17,15          /* clear bit 16 (MSR_EE) */
-	mtmsr   r0
-	lis	r8,mmu_hash_lock@h
-	ori	r8,r8,mmu_hash_lock@l
-	lis	r0,0x0fff
-	b	10f
-11:	lwz	r6,0(r8)
-	cmpwi	0,r6,0
-	bne	11b
-10:	lwarx	r6,0,r8
-	cmpwi	0,r6,0
-	bne-	11b
-	stwcx.	r0,0,r8
-	bne-	10b
-	isync
-	eieio
-	li	r0,0
-	stw	r0,0(r8)
-	mtmsr	r10
-	blr
-#endif /* CONFIG_SMP */
-
 /*
  * Load a PTE into the hash table, if possible.
  * The address is in r4, and r3 contains an access flag:
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
new file mode 100644
index 00000000000..6d94116fdea
--- /dev/null
+++ b/arch/powerpc/mm/pgtable.c
@@ -0,0 +1,117 @@
+/*
+ * This file contains common routines for dealing with free of page tables
+ *
+ *  Derived from arch/powerpc/mm/tlb_64.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+static unsigned long pte_freelist_forced_free;
+
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+{
+	/* This is safe since tlb_gather_mmu has disabled preemption */
+        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+		pgtable_free(pgf);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+		if (*batchp == NULL) {
+			pgtable_free_now(pgf);
+			return;
+		}
+		(*batchp)->index = 0;
+	}
+	(*batchp)->tables[(*batchp)->index++] = pgf;
+	if ((*batchp)->index == PTE_FREELIST_SIZE) {
+		pte_free_submit(*batchp);
+		*batchp = NULL;
+	}
+}
+
+void pte_free_finish(void)
+{
+	/* This is safe since tlb_gather_mmu has disabled preemption */
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (*batchp == NULL)
+		return;
+	pte_free_submit(*batchp);
+	*batchp = NULL;
+}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 44fbc81c9b2..c7b755cba26 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -48,10 +48,6 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[];
 
-#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
-extern void hash_page_sync(void);
-#endif
-
 #ifdef HAVE_BATS
 extern phys_addr_t v_mapped_by_bats(unsigned long va);
 extern unsigned long p_mapped_by_bats(phys_addr_t pa);
@@ -125,23 +121,6 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return ptepage;
 }
 
-void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
-	hash_page_sync();
-#endif
-	free_page((unsigned long)pte);
-}
-
-void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
-	hash_page_sync();
-#endif
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
-}
-
 void __iomem *
 ioremap(phys_addr_t addr, unsigned long size)
 {
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index be7dd422c0f..c931bc7d107 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -37,81 +37,6 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
  * arch/powerpc/include/asm/tlb.h file -- tgall
  */
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-static unsigned long pte_freelist_forced_free;
-
-struct pte_freelist_batch
-{
-	struct rcu_head	rcu;
-	unsigned int	index;
-	pgtable_free_t	tables[0];
-};
-
-#define PTE_FREELIST_SIZE \
-	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-	  / sizeof(pgtable_free_t))
-
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-static void pgtable_free_now(pgtable_free_t pgf)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 1);
-
-	pgtable_free(pgf);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pgtable_free(batch->tables[i]);
-
-	free_page((unsigned long)batch);
-}
-
-static void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
-{
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
-	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-	if (atomic_read(&tlb->mm->mm_users) < 2 ||
-	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pgtable_free(pgf);
-		return;
-	}
-
-	if (*batchp == NULL) {
-		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
-		if (*batchp == NULL) {
-			pgtable_free_now(pgf);
-			return;
-		}
-		(*batchp)->index = 0;
-	}
-	(*batchp)->tables[(*batchp)->index++] = pgf;
-	if ((*batchp)->index == PTE_FREELIST_SIZE) {
-		pte_free_submit(*batchp);
-		*batchp = NULL;
-	}
-}
 
 /*
  * A linux PTE was changed and the corresponding hash table entry
@@ -229,17 +154,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	batch->index = 0;
 }
 
-void pte_free_finish(void)
-{
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-	if (*batchp == NULL)
-		return;
-	pte_free_submit(*batchp);
-	*batchp = NULL;
-}
-
 /**
  * __flush_hash_table_range - Flush all HPTEs for a given address range
  *                            from the hash table (and the TLB). But keeps
-- 
cgit v1.2.3


From e41e811a79a4e328005be2744c3076ebde455088 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 14 Dec 2008 19:44:39 +0000
Subject: powerpc/mm: Rename tlb_32.c and tlb_64.c to tlb_hash32.c and
 tlb_hash64.c

This renames the files to clarify the fact that they are used by
the hash based family of CPUs (the 603 being an exception in that
family but is still handled by that code).

This paves the way for the new tlb_nohash.c coming via a subsequent
commit.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/Makefile     |   2 +-
 arch/powerpc/mm/tlb_32.c     | 190 --------------------------------------
 arch/powerpc/mm/tlb_64.c     | 211 -------------------------------------------
 arch/powerpc/mm/tlb_hash32.c | 190 ++++++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/tlb_hash64.c | 211 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 402 insertions(+), 402 deletions(-)
 delete mode 100644 arch/powerpc/mm/tlb_32.c
 delete mode 100644 arch/powerpc/mm/tlb_64.c
 create mode 100644 arch/powerpc/mm/tlb_hash32.c
 create mode 100644 arch/powerpc/mm/tlb_hash64.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 86e657bcfa7..148de35c9ee 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   gup.o mmap.o $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_$(CONFIG_WORD_SIZE).o
+				   tlb_hash$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c
deleted file mode 100644
index f9a47fee392..00000000000
--- a/arch/powerpc/mm/tlb_32.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * This file contains the routines for TLB flushing.
- * On machines where the MMU uses a hash table to store virtual to
- * physical translations, these routines flush entries from the
- * hash table also.
- *  -- paulus
- *
- *  Derived from arch/ppc/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-
-#include "mmu_decl.h"
-
-/*
- * Called when unmapping pages to flush entries from the TLB/hash table.
- */
-void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
-{
-	unsigned long ptephys;
-
-	if (Hash != 0) {
-		ptephys = __pa(ptep) & PAGE_MASK;
-		flush_hash_pages(mm->context.id, addr, ptephys, 1);
-	}
-}
-EXPORT_SYMBOL(flush_hash_entry);
-
-/*
- * Called by ptep_set_access_flags, must flush on CPUs for which the
- * DSI handler can't just "fixup" the TLB on a write fault
- */
-void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
-{
-	if (Hash != 0)
-		return;
-	_tlbie(addr);
-}
-
-/*
- * Called at the end of a mmu_gather operation to make sure the
- * TLB flush is completely done.
- */
-void tlb_flush(struct mmu_gather *tlb)
-{
-	if (Hash == 0) {
-		/*
-		 * 603 needs to flush the whole TLB here since
-		 * it doesn't use a hash table.
-		 */
-		_tlbia();
-	}
-}
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- * since the hardware hash table functions as an extension of the
- * tlb as far as the linux tables are concerned, flush it too.
- *    -- Cort
- */
-
-/*
- * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
- * the cache operations on the bus.  Hence we need to use an IPI
- * to get the other CPU(s) to invalidate their TLBs.
- */
-#ifdef CONFIG_SMP_750
-#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
-#else
-#define FINISH_FLUSH	do { } while (0)
-#endif
-
-static void flush_range(struct mm_struct *mm, unsigned long start,
-			unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long pmd_end;
-	int count;
-	unsigned int ctx = mm->context.id;
-
-	if (Hash == 0) {
-		_tlbia();
-		return;
-	}
-	start &= PAGE_MASK;
-	if (start >= end)
-		return;
-	end = (end - 1) | ~PAGE_MASK;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
-	for (;;) {
-		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
-		if (pmd_end > end)
-			pmd_end = end;
-		if (!pmd_none(*pmd)) {
-			count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
-			flush_hash_pages(ctx, start, pmd_val(*pmd), count);
-		}
-		if (pmd_end == end)
-			break;
-		start = pmd_end + 1;
-		++pmd;
-	}
-}
-
-/*
- * Flush kernel TLB entries in the given range
- */
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-	flush_range(&init_mm, start, end);
-	FINISH_FLUSH;
-}
-
-/*
- * Flush all the (user) entries for the address space described by mm.
- */
-void flush_tlb_mm(struct mm_struct *mm)
-{
-	struct vm_area_struct *mp;
-
-	if (Hash == 0) {
-		_tlbia();
-		return;
-	}
-
-	/*
-	 * It is safe to go down the mm's list of vmas when called
-	 * from dup_mmap, holding mmap_sem.  It would also be safe from
-	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
-	 * but it seems dup_mmap is the only SMP case which gets here.
-	 */
-	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
-		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
-	FINISH_FLUSH;
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-	struct mm_struct *mm;
-	pmd_t *pmd;
-
-	if (Hash == 0) {
-		_tlbie(vmaddr);
-		return;
-	}
-	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
-	if (!pmd_none(*pmd))
-		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
-	FINISH_FLUSH;
-}
-
-/*
- * For each address in the range, find the pte for the address
- * and check _PAGE_HASHPTE bit; if it is set, find and destroy
- * the corresponding HPTE.
- */
-void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-		     unsigned long end)
-{
-	flush_range(vma->vm_mm, start, end);
-	FINISH_FLUSH;
-}
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
deleted file mode 100644
index c931bc7d107..00000000000
--- a/arch/powerpc/mm/tlb_64.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * This file contains the routines for flushing entries from the
- * TLB and MMU hash table.
- *
- *  Derived from arch/ppc64/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  Dave Engebretsen <engebret@us.ibm.com>
- *      Rework for PPC64 port.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/bug.h>
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/* This is declared as we are using the more or less generic
- * arch/powerpc/include/asm/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-/*
- * A linux PTE was changed and the corresponding hash table entry
- * neesd to be flushed. This function will either perform the flush
- * immediately or will batch it up if the current CPU has an active
- * batch on it.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, unsigned long pte, int huge)
-{
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid, vaddr;
-	unsigned int psize;
-	int ssize;
-	real_pte_t rpte;
-	int i;
-
-	i = batch->index;
-
-	/* We mask the address for the base page size. Huge pages will
-	 * have applied their own masking already
-	 */
-	addr &= PAGE_MASK;
-
-	/* Get page size (maybe move back to caller).
-	 *
-	 * NOTE: when using special 64K mappings in 4K environment like
-	 * for SPEs, we obtain the page size from the slice, which thus
-	 * must still exist (and thus the VMA not reused) at the time
-	 * of this call
-	 */
-	if (huge) {
-#ifdef CONFIG_HUGETLB_PAGE
-		psize = get_slice_psize(mm, addr);;
-#else
-		BUG();
-		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
-#endif
-	} else
-		psize = pte_pagesize_index(mm, addr, pte);
-
-	/* Build full vaddr */
-	if (!is_kernel_addr(addr)) {
-		ssize = user_segment_size(addr);
-		vsid = get_vsid(mm->context.id, addr, ssize);
-		WARN_ON(vsid == 0);
-	} else {
-		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-		ssize = mmu_kernel_ssize;
-	}
-	vaddr = hpt_va(addr, vsid, ssize);
-	rpte = __real_pte(__pte(pte), ptep);
-
-	/*
-	 * Check if we have an active batch on this CPU. If not, just
-	 * flush now and return. For now, we don global invalidates
-	 * in that case, might be worth testing the mm cpu mask though
-	 * and decide to use local invalidates instead...
-	 */
-	if (!batch->active) {
-		flush_hash_page(vaddr, rpte, psize, ssize, 0);
-		return;
-	}
-
-	/*
-	 * This can happen when we are in the middle of a TLB batch and
-	 * we encounter memory pressure (eg copy_page_range when it tries
-	 * to allocate a new pte). If we have to reclaim memory and end
-	 * up scanning and resetting referenced bits then our batch context
-	 * will change mid stream.
-	 *
-	 * We also need to ensure only one page size is present in a given
-	 * batch
-	 */
-	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
-		       batch->ssize != ssize)) {
-		__flush_tlb_pending(batch);
-		i = 0;
-	}
-	if (i == 0) {
-		batch->mm = mm;
-		batch->psize = psize;
-		batch->ssize = ssize;
-	}
-	batch->pte[i] = rpte;
-	batch->vaddr[i] = vaddr;
-	batch->index = ++i;
-	if (i >= PPC64_TLB_BATCH_NR)
-		__flush_tlb_pending(batch);
-}
-
-/*
- * This function is called when terminating an mmu batch or when a batch
- * is full. It will perform the flush of all the entries currently stored
- * in a batch.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
-	cpumask_t tmp;
-	int i, local = 0;
-
-	i = batch->index;
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
-		local = 1;
-	if (i == 1)
-		flush_hash_page(batch->vaddr[0], batch->pte[0],
-				batch->psize, batch->ssize, local);
-	else
-		flush_hash_range(i, local);
-	batch->index = 0;
-}
-
-/**
- * __flush_hash_table_range - Flush all HPTEs for a given address range
- *                            from the hash table (and the TLB). But keeps
- *                            the linux PTEs intact.
- *
- * @mm		: mm_struct of the target address space (generally init_mm)
- * @start	: starting address
- * @end         : ending address (not included in the flush)
- *
- * This function is mostly to be used by some IO hotplug code in order
- * to remove all hash entries from a given address range used to map IO
- * space on a removed PCI-PCI bidge without tearing down the full mapping
- * since 64K pages may overlap with other bridges when using 64K pages
- * with 4K HW pages on IO space.
- *
- * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
- * and is implemented for small size rather than speed.
- */
-#ifdef CONFIG_HOTPLUG
-
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-			      unsigned long end)
-{
-	unsigned long flags;
-
-	start = _ALIGN_DOWN(start, PAGE_SIZE);
-	end = _ALIGN_UP(end, PAGE_SIZE);
-
-	BUG_ON(!mm->pgd);
-
-	/* Note: Normally, we should only ever use a batch within a
-	 * PTE locked section. This violates the rule, but will work
-	 * since we don't actually modify the PTEs, we just flush the
-	 * hash while leaving the PTEs intact (including their reference
-	 * to being hashed). This is not the most performance oriented
-	 * way to do things but is fine for our needs here.
-	 */
-	local_irq_save(flags);
-	arch_enter_lazy_mmu_mode();
-	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte(mm->pgd, start);
-		unsigned long pte;
-
-		if (ptep == NULL)
-			continue;
-		pte = pte_val(*ptep);
-		if (!(pte & _PAGE_HASHPTE))
-			continue;
-		hpte_need_flush(mm, start, ptep, pte, 0);
-	}
-	arch_leave_lazy_mmu_mode();
-	local_irq_restore(flags);
-}
-
-#endif /* CONFIG_HOTPLUG */
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
new file mode 100644
index 00000000000..f9a47fee392
--- /dev/null
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -0,0 +1,190 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU uses a hash table to store virtual to
+ * physical translations, these routines flush entries from the
+ * hash table also.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+/*
+ * Called when unmapping pages to flush entries from the TLB/hash table.
+ */
+void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
+{
+	unsigned long ptephys;
+
+	if (Hash != 0) {
+		ptephys = __pa(ptep) & PAGE_MASK;
+		flush_hash_pages(mm->context.id, addr, ptephys, 1);
+	}
+}
+EXPORT_SYMBOL(flush_hash_entry);
+
+/*
+ * Called by ptep_set_access_flags, must flush on CPUs for which the
+ * DSI handler can't just "fixup" the TLB on a write fault
+ */
+void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (Hash != 0)
+		return;
+	_tlbie(addr);
+}
+
+/*
+ * Called at the end of a mmu_gather operation to make sure the
+ * TLB flush is completely done.
+ */
+void tlb_flush(struct mmu_gather *tlb)
+{
+	if (Hash == 0) {
+		/*
+		 * 603 needs to flush the whole TLB here since
+		 * it doesn't use a hash table.
+		 */
+		_tlbia();
+	}
+}
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * since the hardware hash table functions as an extension of the
+ * tlb as far as the linux tables are concerned, flush it too.
+ *    -- Cort
+ */
+
+/*
+ * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
+ * the cache operations on the bus.  Hence we need to use an IPI
+ * to get the other CPU(s) to invalidate their TLBs.
+ */
+#ifdef CONFIG_SMP_750
+#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
+#else
+#define FINISH_FLUSH	do { } while (0)
+#endif
+
+static void flush_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long pmd_end;
+	int count;
+	unsigned int ctx = mm->context.id;
+
+	if (Hash == 0) {
+		_tlbia();
+		return;
+	}
+	start &= PAGE_MASK;
+	if (start >= end)
+		return;
+	end = (end - 1) | ~PAGE_MASK;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
+	for (;;) {
+		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
+		if (pmd_end > end)
+			pmd_end = end;
+		if (!pmd_none(*pmd)) {
+			count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
+			flush_hash_pages(ctx, start, pmd_val(*pmd), count);
+		}
+		if (pmd_end == end)
+			break;
+		start = pmd_end + 1;
+		++pmd;
+	}
+}
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	flush_range(&init_mm, start, end);
+	FINISH_FLUSH;
+}
+
+/*
+ * Flush all the (user) entries for the address space described by mm.
+ */
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *mp;
+
+	if (Hash == 0) {
+		_tlbia();
+		return;
+	}
+
+	/*
+	 * It is safe to go down the mm's list of vmas when called
+	 * from dup_mmap, holding mmap_sem.  It would also be safe from
+	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
+	 * but it seems dup_mmap is the only SMP case which gets here.
+	 */
+	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
+	FINISH_FLUSH;
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	struct mm_struct *mm;
+	pmd_t *pmd;
+
+	if (Hash == 0) {
+		_tlbie(vmaddr);
+		return;
+	}
+	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
+	if (!pmd_none(*pmd))
+		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
+	FINISH_FLUSH;
+}
+
+/*
+ * For each address in the range, find the pte for the address
+ * and check _PAGE_HASHPTE bit; if it is set, find and destroy
+ * the corresponding HPTE.
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	flush_range(vma->vm_mm, start, end);
+	FINISH_FLUSH;
+}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
new file mode 100644
index 00000000000..c931bc7d107
--- /dev/null
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -0,0 +1,211 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ *  Derived from arch/ppc64/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/bug.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/* This is declared as we are using the more or less generic
+ * arch/powerpc/include/asm/tlb.h file -- tgall
+ */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+/*
+ * A linux PTE was changed and the corresponding hash table entry
+ * neesd to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, unsigned long pte, int huge)
+{
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	unsigned long vsid, vaddr;
+	unsigned int psize;
+	int ssize;
+	real_pte_t rpte;
+	int i;
+
+	i = batch->index;
+
+	/* We mask the address for the base page size. Huge pages will
+	 * have applied their own masking already
+	 */
+	addr &= PAGE_MASK;
+
+	/* Get page size (maybe move back to caller).
+	 *
+	 * NOTE: when using special 64K mappings in 4K environment like
+	 * for SPEs, we obtain the page size from the slice, which thus
+	 * must still exist (and thus the VMA not reused) at the time
+	 * of this call
+	 */
+	if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+		psize = get_slice_psize(mm, addr);;
+#else
+		BUG();
+		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
+#endif
+	} else
+		psize = pte_pagesize_index(mm, addr, pte);
+
+	/* Build full vaddr */
+	if (!is_kernel_addr(addr)) {
+		ssize = user_segment_size(addr);
+		vsid = get_vsid(mm->context.id, addr, ssize);
+		WARN_ON(vsid == 0);
+	} else {
+		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
+	vaddr = hpt_va(addr, vsid, ssize);
+	rpte = __real_pte(__pte(pte), ptep);
+
+	/*
+	 * Check if we have an active batch on this CPU. If not, just
+	 * flush now and return. For now, we don global invalidates
+	 * in that case, might be worth testing the mm cpu mask though
+	 * and decide to use local invalidates instead...
+	 */
+	if (!batch->active) {
+		flush_hash_page(vaddr, rpte, psize, ssize, 0);
+		return;
+	}
+
+	/*
+	 * This can happen when we are in the middle of a TLB batch and
+	 * we encounter memory pressure (eg copy_page_range when it tries
+	 * to allocate a new pte). If we have to reclaim memory and end
+	 * up scanning and resetting referenced bits then our batch context
+	 * will change mid stream.
+	 *
+	 * We also need to ensure only one page size is present in a given
+	 * batch
+	 */
+	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
+		       batch->ssize != ssize)) {
+		__flush_tlb_pending(batch);
+		i = 0;
+	}
+	if (i == 0) {
+		batch->mm = mm;
+		batch->psize = psize;
+		batch->ssize = ssize;
+	}
+	batch->pte[i] = rpte;
+	batch->vaddr[i] = vaddr;
+	batch->index = ++i;
+	if (i >= PPC64_TLB_BATCH_NR)
+		__flush_tlb_pending(batch);
+}
+
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+	cpumask_t tmp;
+	int i, local = 0;
+
+	i = batch->index;
+	tmp = cpumask_of_cpu(smp_processor_id());
+	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+		local = 1;
+	if (i == 1)
+		flush_hash_page(batch->vaddr[0], batch->pte[0],
+				batch->psize, batch->ssize, local);
+	else
+		flush_hash_range(i, local);
+	batch->index = 0;
+}
+
+/**
+ * __flush_hash_table_range - Flush all HPTEs for a given address range
+ *                            from the hash table (and the TLB). But keeps
+ *                            the linux PTEs intact.
+ *
+ * @mm		: mm_struct of the target address space (generally init_mm)
+ * @start	: starting address
+ * @end         : ending address (not included in the flush)
+ *
+ * This function is mostly to be used by some IO hotplug code in order
+ * to remove all hash entries from a given address range used to map IO
+ * space on a removed PCI-PCI bidge without tearing down the full mapping
+ * since 64K pages may overlap with other bridges when using 64K pages
+ * with 4K HW pages on IO space.
+ *
+ * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
+ * and is implemented for small size rather than speed.
+ */
+#ifdef CONFIG_HOTPLUG
+
+void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+			      unsigned long end)
+{
+	unsigned long flags;
+
+	start = _ALIGN_DOWN(start, PAGE_SIZE);
+	end = _ALIGN_UP(end, PAGE_SIZE);
+
+	BUG_ON(!mm->pgd);
+
+	/* Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+	 */
+	local_irq_save(flags);
+	arch_enter_lazy_mmu_mode();
+	for (; start < end; start += PAGE_SIZE) {
+		pte_t *ptep = find_linux_pte(mm->pgd, start);
+		unsigned long pte;
+
+		if (ptep == NULL)
+			continue;
+		pte = pte_val(*ptep);
+		if (!(pte & _PAGE_HASHPTE))
+			continue;
+		hpte_need_flush(mm, start, ptep, pte, 0);
+	}
+	arch_leave_lazy_mmu_mode();
+	local_irq_restore(flags);
+}
+
+#endif /* CONFIG_HOTPLUG */
-- 
cgit v1.2.3


From f63837f0581fe580168ae1a7d178ded935411747 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 14 Dec 2008 19:44:51 +0000
Subject: powerpc/mm: Remove flush_HPTE()

The function flush_HPTE() is used in only one place, the implementation
of DEBUG_PAGEALLOC on ppc32.

It's actually a dup of flush_tlb_page() though it's -slightly- more
efficient on hash based processors.  We remove it and replace it by
a direct call to the hash flush code on those processors and to
flush_tlb_page() for everybody else.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/mmu_decl.h   | 17 -----------------
 arch/powerpc/mm/pgtable_32.c |  6 +++++-
 2 files changed, 5 insertions(+), 18 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index fab3cfad409..b4344fd30f2 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -58,17 +58,14 @@ extern phys_addr_t lowmem_end_addr;
  * architectures.  -- Dan
  */
 #if defined(CONFIG_8xx)
-#define flush_HPTE(X, va, pg)	_tlbie(va, 0 /* 8xx doesn't care about PID */)
 #define MMU_init_hw()		do { } while(0)
 #define mmu_mapin_ram()		(0UL)
 
 #elif defined(CONFIG_4xx)
-#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 
 #elif defined(CONFIG_FSL_BOOKE)
-#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 extern void adjust_total_lowmem(void);
@@ -77,18 +74,4 @@ extern void adjust_total_lowmem(void);
 /* anything 32-bit except 4xx or 8xx */
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
-
-/* Be careful....this needs to be updated if we ever encounter 603 SMPs,
- * which includes all new 82xx processors.  We need tlbie/tlbsync here
- * in that case (I think). -- Dan.
- */
-static inline void flush_HPTE(unsigned context, unsigned long va,
-			      unsigned long pdval)
-{
-	if ((Hash != 0) &&
-	    cpu_has_feature(CPU_FTR_HPTE_TABLE))
-		flush_hash_pages(0, va, pdval, 1);
-	else
-		_tlbie(va);
-}
 #endif
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c7b755cba26..34147244013 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -342,7 +342,11 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
 		return -EINVAL;
 	set_pte_at(&init_mm, address, kpte, mk_pte(page, prot));
 	wmb();
-	flush_HPTE(0, address, pmd_val(*kpmd));
+#ifdef CONFIG_PPC_STD_MMU
+	flush_hash_pages(0, address, pmd_val(*kpmd), 1);
+#else
+	flush_tlb_page(NULL, address);
+#endif
 	pte_unmap(kpte);
 
 	return 0;
-- 
cgit v1.2.3


From 5e696617c425eb97bd943d781f3941fb1e8f0e5b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:24 +0000
Subject: powerpc/mm: Split mmu_context handling

This splits the mmu_context handling between 32-bit hash based
processors, 64-bit hash based processors and everybody else.  This is
preliminary work for adding SMP support for BookE processors.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/Makefile             |   7 +-
 arch/powerpc/mm/mmu_context_32.c     |  84 ------------------
 arch/powerpc/mm/mmu_context_64.c     |  70 ---------------
 arch/powerpc/mm/mmu_context_hash32.c | 103 ++++++++++++++++++++++
 arch/powerpc/mm/mmu_context_hash64.c |  78 +++++++++++++++++
 arch/powerpc/mm/mmu_context_nohash.c | 162 +++++++++++++++++++++++++++++++++++
 6 files changed, 347 insertions(+), 157 deletions(-)
 delete mode 100644 arch/powerpc/mm/mmu_context_32.c
 delete mode 100644 arch/powerpc/mm/mmu_context_64.c
 create mode 100644 arch/powerpc/mm/mmu_context_hash32.c
 create mode 100644 arch/powerpc/mm/mmu_context_hash64.c
 create mode 100644 arch/powerpc/mm/mmu_context_nohash.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 148de35c9ee..923bd3fa7d6 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -8,15 +8,16 @@ endif
 
 obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
-				   pgtable_$(CONFIG_WORD_SIZE).o \
-				   mmu_context_$(CONFIG_WORD_SIZE).o
+				   pgtable_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
 				   gup.o mmap.o $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_hash$(CONFIG_WORD_SIZE).o
+				   tlb_hash$(CONFIG_WORD_SIZE).o \
+				   mmu_context_hash$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c
deleted file mode 100644
index cc32ba41d90..00000000000
--- a/arch/powerpc/mm/mmu_context_32.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file contains the routines for handling the MMU on those
- * PowerPC implementations where the MMU substantially follows the
- * architecture specification.  This includes the 6xx, 7xx, 7xxx,
- * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
- *  -- paulus
- *
- *  Derived from arch/ppc/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-
-unsigned long next_mmu_context;
-unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-#ifdef FEW_CONTEXTS
-atomic_t nr_free_contexts;
-struct mm_struct *context_mm[LAST_CONTEXT+1];
-void steal_context(void);
-#endif /* FEW_CONTEXTS */
-
-/*
- * Initialize the context management stuff.
- */
-void __init
-mmu_context_init(void)
-{
-	/*
-	 * Some processors have too few contexts to reserve one for
-	 * init_mm, and require using context 0 for a normal task.
-	 * Other processors reserve the use of context zero for the kernel.
-	 * This code assumes FIRST_CONTEXT < 32.
-	 */
-	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_mmu_context = FIRST_CONTEXT;
-#ifdef FEW_CONTEXTS
-	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
-#endif /* FEW_CONTEXTS */
-}
-
-#ifdef FEW_CONTEXTS
-/*
- * Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP.  If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
- * This isn't an LRU system, it just frees up each context in
- * turn (sort-of pseudo-random replacement :).  This would be the
- * place to implement an LRU scheme if anyone was motivated to do it.
- *  -- paulus
- */
-void
-steal_context(void)
-{
-	struct mm_struct *mm;
-
-	/* free up context `next_mmu_context' */
-	/* if we shouldn't free context 0, don't... */
-	if (next_mmu_context < FIRST_CONTEXT)
-		next_mmu_context = FIRST_CONTEXT;
-	mm = context_mm[next_mmu_context];
-	flush_tlb_mm(mm);
-	destroy_context(mm);
-}
-#endif /* FEW_CONTEXTS */
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
deleted file mode 100644
index 1db38ba1f54..00000000000
--- a/arch/powerpc/mm/mmu_context_64.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- *  MMU context allocation for 64-bit kernels.
- *
- *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-
-#include <asm/mmu_context.h>
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDR(mmu_context_idr);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	int index;
-	int err;
-
-again:
-	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
-		return -ENOMEM;
-
-	spin_lock(&mmu_context_lock);
-	err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
-	spin_unlock(&mmu_context_lock);
-
-	if (err == -EAGAIN)
-		goto again;
-	else if (err)
-		return err;
-
-	if (index > MAX_CONTEXT) {
-		spin_lock(&mmu_context_lock);
-		idr_remove(&mmu_context_idr, index);
-		spin_unlock(&mmu_context_lock);
-		return -ENOMEM;
-	}
-
-	/* The old code would re-promote on fork, we don't do that
-	 * when using slices as it could cause problem promoting slices
-	 * that have been forced down to 4K
-	 */
-	if (slice_mm_new_context(mm))
-		slice_set_user_psize(mm, mmu_virtual_psize);
-	mm->context.id = index;
-
-	return 0;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
-	spin_lock(&mmu_context_lock);
-	idr_remove(&mmu_context_idr, mm->context.id);
-	spin_unlock(&mmu_context_lock);
-
-	mm->context.id = NO_CONTEXT;
-}
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
new file mode 100644
index 00000000000..0dfba2bf7f3
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -0,0 +1,103 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification.  This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+/*
+ * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
+ * (virtual segment identifiers) for each context.  Although the
+ * hardware supports 24-bit VSIDs, and thus >1 million contexts,
+ * we only use 32,768 of them.  That is ample, since there can be
+ * at most around 30,000 tasks in the system anyway, and it means
+ * that we can use a bitmap to indicate which contexts are in use.
+ * Using a bitmap means that we entirely avoid all of the problems
+ * that we used to have when the context number overflowed,
+ * particularly on SMP systems.
+ *  -- paulus.
+ */
+#define NO_CONTEXT      	((unsigned long) -1)
+#define LAST_CONTEXT    	32767
+#define FIRST_CONTEXT    	1
+
+/*
+ * This function defines the mapping from contexts to VSIDs (virtual
+ * segment IDs).  We use a skew on both the context and the high 4 bits
+ * of the 32-bit virtual address (the "effective segment ID") in order
+ * to spread out the entries in the MMU hash table.  Note, if this
+ * function is changed then arch/ppc/mm/hashtable.S will have to be
+ * changed to correspond.
+ *
+ *
+ * CTX_TO_VSID(ctx, va)	(((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
+ *				 & 0xffffff)
+ */
+
+static unsigned long next_mmu_context;
+static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+	unsigned long ctx = next_mmu_context;
+
+	while (test_and_set_bit(ctx, context_map)) {
+		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
+		if (ctx > LAST_CONTEXT)
+			ctx = 0;
+	}
+	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
+	mm->context.id = ctx;
+
+	return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	preempt_disable();
+	if (mm->context.id != NO_CONTEXT) {
+		clear_bit(mm->context.id, context_map);
+		mm->context.id = NO_CONTEXT;
+	}
+	preempt_enable();
+}
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/* Reserve context 0 for kernel use */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_mmu_context = FIRST_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
new file mode 100644
index 00000000000..dbeb86ac90c
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -0,0 +1,78 @@
+/*
+ *  MMU context allocation for 64-bit kernels.
+ *
+ *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+
+#include <asm/mmu_context.h>
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDR(mmu_context_idr);
+
+/*
+ * The proto-VSID space has 2^35 - 1 segments available for user mappings.
+ * Each segment contains 2^28 bytes.  Each context maps 2^44 bytes,
+ * so we can support 2^19-1 contexts (19 == 35 + 28 - 44).
+ */
+#define NO_CONTEXT	0
+#define MAX_CONTEXT	((1UL << 19) - 1)
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	int index;
+	int err;
+
+again:
+	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(&mmu_context_lock);
+	err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
+	spin_unlock(&mmu_context_lock);
+
+	if (err == -EAGAIN)
+		goto again;
+	else if (err)
+		return err;
+
+	if (index > MAX_CONTEXT) {
+		spin_lock(&mmu_context_lock);
+		idr_remove(&mmu_context_idr, index);
+		spin_unlock(&mmu_context_lock);
+		return -ENOMEM;
+	}
+
+	/* The old code would re-promote on fork, we don't do that
+	 * when using slices as it could cause problem promoting slices
+	 * that have been forced down to 4K
+	 */
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+	mm->context.id = index;
+
+	return 0;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+	spin_lock(&mmu_context_lock);
+	idr_remove(&mmu_context_idr, mm->context.id);
+	spin_unlock(&mmu_context_lock);
+
+	mm->context.id = NO_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
new file mode 100644
index 00000000000..00e02150abe
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -0,0 +1,162 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU is not using the hash
+ * table, such as 8xx, 4xx, BookE's etc...
+ *
+ * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                IBM Corp.
+ *
+ *  Derived from previous arch/powerpc/mm/mmu_context.c
+ *  and arch/powerpc/include/asm/mmu_context.h
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+/*
+ *   The MPC8xx has only 16 contexts.  We rotate through them on each
+ * task switch.  A better way would be to keep track of tasks that
+ * own contexts, and implement an LRU usage.  That way very active
+ * tasks don't always have to pay the TLB reload overhead.  The
+ * kernel pages are mapped shared, so the kernel can run on behalf
+ * of any task that makes a kernel entry.  Shared does not mean they
+ * are not protected, just that the ASID comparison is not performed.
+ *      -- Dan
+ *
+ * The IBM4xx has 256 contexts, so we can just rotate through these
+ * as a way of "switching" contexts.  If the TID of the TLB is zero,
+ * the PID/TID comparison is disabled, so we can use a TID of zero
+ * to represent all kernel pages as shared among all contexts.
+ * 	-- Dan
+ */
+
+#ifdef CONFIG_8xx
+#define NO_CONTEXT      	16
+#define LAST_CONTEXT    	15
+#define FIRST_CONTEXT    	0
+
+#elif defined(CONFIG_4xx)
+#define NO_CONTEXT      	256
+#define LAST_CONTEXT    	255
+#define FIRST_CONTEXT    	1
+
+#elif defined(CONFIG_E200) || defined(CONFIG_E500)
+#define NO_CONTEXT      	256
+#define LAST_CONTEXT    	255
+#define FIRST_CONTEXT    	1
+
+#else
+#error Unsupported processor type
+#endif
+
+static unsigned long next_mmu_context;
+static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+static atomic_t nr_free_contexts;
+static struct mm_struct *context_mm[LAST_CONTEXT+1];
+static void steal_context(void);
+
+/* Steal a context from a task that has one at the moment.
+ * This is only used on 8xx and 4xx and we presently assume that
+ * they don't do SMP.  If they do then this will have to check
+ * whether the MM we steal is in use.
+ * We also assume that this is only used on systems that don't
+ * use an MMU hash table - this is true for 8xx and 4xx.
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :).  This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ *  -- paulus
+ */
+static void steal_context(void)
+{
+	struct mm_struct *mm;
+
+	/* free up context `next_mmu_context' */
+	/* if we shouldn't free context 0, don't... */
+	if (next_mmu_context < FIRST_CONTEXT)
+		next_mmu_context = FIRST_CONTEXT;
+	mm = context_mm[next_mmu_context];
+	flush_tlb_mm(mm);
+	destroy_context(mm);
+}
+
+
+/*
+ * Get a new mmu context for the address space described by `mm'.
+ */
+static inline void get_mmu_context(struct mm_struct *mm)
+{
+	unsigned long ctx;
+
+	if (mm->context.id != NO_CONTEXT)
+		return;
+
+	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
+		steal_context();
+
+	ctx = next_mmu_context;
+	while (test_and_set_bit(ctx, context_map)) {
+		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
+		if (ctx > LAST_CONTEXT)
+			ctx = 0;
+	}
+	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
+	mm->context.id = ctx;
+	context_mm[ctx] = mm;
+}
+
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+	get_mmu_context(next);
+
+	set_context(next->context.id, next->pgd);
+}
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+	mm->context.id = NO_CONTEXT;
+	return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	preempt_disable();
+	if (mm->context.id != NO_CONTEXT) {
+		clear_bit(mm->context.id, context_map);
+		mm->context.id = NO_CONTEXT;
+		atomic_inc(&nr_free_contexts);
+	}
+	preempt_enable();
+}
+
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/*
+	 * Some processors have too few contexts to reserve one for
+	 * init_mm, and require using context 0 for a normal task.
+	 * Other processors reserve the use of context zero for the kernel.
+	 * This code assumes FIRST_CONTEXT < 32.
+	 */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_mmu_context = FIRST_CONTEXT;
+	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+}
+
-- 
cgit v1.2.3


From 2ca8cf738907180e7fbda90f25f32b86feda609f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:29 +0000
Subject: powerpc/mm: Rework context management for CPUs with no hash table

This reworks the context management code used by 4xx,8xx and
freescale BookE.  It adds support for SMP by implementing a
concept of stale context map to lazily flush the TLB on
processors where a context may have been invalidated.  This
also contains the ground work for generalizing such lazy TLB
flushing by just picking up a new PID and marking the old one
stale.  This will be implemented later.

This is a first implementation that uses a global spinlock.

Ideally, we should try to get at least the fast path (context ID
already assigned) lockless or limited to a per context lock,
but for now this will do.

I tried to keep the UP case reasonably simple to avoid adding
too much overhead to 8xx which does a lot of context stealing
since it effectively has only 16 PIDs available.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/mmu_context_nohash.c | 268 +++++++++++++++++++++++++++++------
 1 file changed, 221 insertions(+), 47 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 00e02150abe..8b5de52de0a 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -14,13 +14,28 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  *
+ * TODO:
+ *
+ *   - The global context lock will not scale very well
+ *   - The maps should be dynamically allocated to allow for processors
+ *     that support more PID bits at runtime
+ *   - Implement flush_tlb_mm() by making the context stale and picking
+ *     a new one
+ *   - More aggressively clear stale map bits and maybe find some way to
+ *     also clear mm->cpu_vm_mask bits when processes are migrated
  */
 
+#undef DEBUG
+#define DEBUG_STEAL_ONLY
+#undef DEBUG_MAP_CONSISTENCY
+
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
+#include <linux/spinlock.h>
 
 /*
  *   The MPC8xx has only 16 contexts.  We rotate through them on each
@@ -40,17 +55,14 @@
  */
 
 #ifdef CONFIG_8xx
-#define NO_CONTEXT      	16
 #define LAST_CONTEXT    	15
 #define FIRST_CONTEXT    	0
 
 #elif defined(CONFIG_4xx)
-#define NO_CONTEXT      	256
 #define LAST_CONTEXT    	255
 #define FIRST_CONTEXT    	1
 
 #elif defined(CONFIG_E200) || defined(CONFIG_E500)
-#define NO_CONTEXT      	256
 #define LAST_CONTEXT    	255
 #define FIRST_CONTEXT    	1
 
@@ -58,66 +70,208 @@
 #error Unsupported processor type
 #endif
 
-static unsigned long next_mmu_context;
+static unsigned int next_context, nr_free_contexts;
 static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-static atomic_t nr_free_contexts;
+static unsigned long stale_map[NR_CPUS][LAST_CONTEXT / BITS_PER_LONG + 1];
 static struct mm_struct *context_mm[LAST_CONTEXT+1];
-static void steal_context(void);
+static spinlock_t context_lock = SPIN_LOCK_UNLOCKED;
 
 /* Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP.  If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
+ *
+ * This is used when we are running out of available PID numbers
+ * on the processors.
+ *
  * This isn't an LRU system, it just frees up each context in
  * turn (sort-of pseudo-random replacement :).  This would be the
  * place to implement an LRU scheme if anyone was motivated to do it.
  *  -- paulus
+ *
+ * For context stealing, we use a slightly different approach for
+ * SMP and UP. Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ *  -- benh
  */
-static void steal_context(void)
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
 {
 	struct mm_struct *mm;
+	unsigned int cpu, max;
 
-	/* free up context `next_mmu_context' */
-	/* if we shouldn't free context 0, don't... */
-	if (next_mmu_context < FIRST_CONTEXT)
-		next_mmu_context = FIRST_CONTEXT;
-	mm = context_mm[next_mmu_context];
-	flush_tlb_mm(mm);
-	destroy_context(mm);
-}
+ again:
+	max = LAST_CONTEXT - FIRST_CONTEXT;
 
+	/* Attempt to free next_context first and then loop until we manage */
+	while (max--) {
+		/* Pick up the victim mm */
+		mm = context_mm[id];
 
-/*
- * Get a new mmu context for the address space described by `mm'.
+		/* We have a candidate victim, check if it's active, on SMP
+		 * we cannot steal active contexts
+		 */
+		if (mm->context.active) {
+			id++;
+			if (id > LAST_CONTEXT)
+				id = FIRST_CONTEXT;
+			continue;
+		}
+		pr_debug("[%d] steal context %d from mm @%p\n",
+			 smp_processor_id(), id, mm);
+
+		/* Mark this mm has having no context anymore */
+		mm->context.id = MMU_NO_CONTEXT;
+
+		/* Mark it stale on all CPUs that used this mm */
+		for_each_cpu_mask_nr(cpu, mm->cpu_vm_mask)
+			__set_bit(id, stale_map[cpu]);
+		return id;
+	}
+
+	/* This will happen if you have more CPUs than available contexts,
+	 * all we can do here is wait a bit and try again
+	 */
+	spin_unlock(&context_lock);
+	cpu_relax();
+	spin_lock(&context_lock);
+	goto again;
+}
+#endif  /* CONFIG_SMP */
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
  */
-static inline void get_mmu_context(struct mm_struct *mm)
+static unsigned int steal_context_up(unsigned int id)
 {
-	unsigned long ctx;
+	struct mm_struct *mm;
+	int cpu = smp_processor_id();
 
-	if (mm->context.id != NO_CONTEXT)
-		return;
+	/* Pick up the victim mm */
+	mm = context_mm[id];
+
+	pr_debug("[%d] steal context %d from mm @%p\n", cpu, id, mm);
 
-	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
-		steal_context();
+	/* Mark this mm has having no context anymore */
+	mm->context.id = MMU_NO_CONTEXT;
 
-	ctx = next_mmu_context;
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
+	/* Flush the TLB for that context */
+	local_flush_tlb_mm(mm);
+
+	/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+	__clear_bit(id, stale_map[cpu]);
+
+	return id;
+}
+
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+	unsigned int id, nrf, nact;
+
+	nrf = nact = 0;
+	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+		int used = test_bit(id, context_map);
+		if (!used)
+			nrf++;
+		if (used != (context_mm[id] != NULL))
+			pr_err("MMU: Context %d is %s and MM is %p !\n",
+			       id, used ? "used" : "free", context_mm[id]);
+		if (context_mm[id] != NULL)
+			nact += context_mm[id]->context.active;
 	}
-	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
-	mm->context.id = ctx;
-	context_mm[ctx] = mm;
+	if (nrf != nr_free_contexts) {
+		pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
+		       nr_free_contexts, nrf);
+		nr_free_contexts = nrf;
+	}
+	if (nact > num_online_cpus())
+		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
+		       nact, num_online_cpus());
 }
+#else
+static void context_check_map(void) { }
+#endif
 
 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 {
-	get_mmu_context(next);
+	unsigned int id, cpu = smp_processor_id();
+	unsigned long *map;
 
-	set_context(next->context.id, next->pgd);
+	/* No lockless fast path .. yet */
+	spin_lock(&context_lock);
+
+#ifndef DEBUG_STEAL_ONLY
+	pr_debug("[%d] activating context for mm @%p, active=%d, id=%d\n",
+		 cpu, next, next->context.active, next->context.id);
+#endif
+
+#ifdef CONFIG_SMP
+	/* Mark us active and the previous one not anymore */
+	next->context.active++;
+	if (prev) {
+		WARN_ON(prev->context.active < 1);
+		prev->context.active--;
+	}
+#endif /* CONFIG_SMP */
+
+	/* If we already have a valid assigned context, skip all that */
+	id = next->context.id;
+	if (likely(id != MMU_NO_CONTEXT))
+		goto ctxt_ok;
+
+	/* We really don't have a context, let's try to acquire one */
+	id = next_context;
+	if (id > LAST_CONTEXT)
+		id = FIRST_CONTEXT;
+	map = context_map;
+
+	/* No more free contexts, let's try to steal one */
+	if (nr_free_contexts == 0) {
+#ifdef CONFIG_SMP
+		if (num_online_cpus() > 1) {
+			id = steal_context_smp(id);
+			goto stolen;
+		}
+#endif /* CONFIG_SMP */
+		id = steal_context_up(id);
+		goto stolen;
+	}
+	nr_free_contexts--;
+
+	/* We know there's at least one free context, try to find it */
+	while (__test_and_set_bit(id, map)) {
+		id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
+		if (id > LAST_CONTEXT)
+			id = FIRST_CONTEXT;
+	}
+ stolen:
+	next_context = id + 1;
+	context_mm[id] = next;
+	next->context.id = id;
+
+#ifndef DEBUG_STEAL_ONLY
+	pr_debug("[%d] picked up new id %d, nrf is now %d\n",
+		 cpu, id, nr_free_contexts);
+#endif
+
+	context_check_map();
+ ctxt_ok:
+
+	/* If that context got marked stale on this CPU, then flush the
+	 * local TLB for it and unmark it before we use it
+	 */
+	if (test_bit(id, stale_map[cpu])) {
+		pr_debug("[%d] flushing stale context %d for mm @%p !\n",
+			 cpu, id, next);
+		local_flush_tlb_mm(next);
+
+		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+		__clear_bit(id, stale_map[cpu]);
+	}
+
+	/* Flick the MMU and release lock */
+	set_context(id, next->pgd);
+	spin_unlock(&context_lock);
 }
 
 /*
@@ -125,7 +279,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
  */
 int init_new_context(struct task_struct *t, struct mm_struct *mm)
 {
-	mm->context.id = NO_CONTEXT;
+	mm->context.id = MMU_NO_CONTEXT;
+	mm->context.active = 0;
+
 	return 0;
 }
 
@@ -134,13 +290,25 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
  */
 void destroy_context(struct mm_struct *mm)
 {
-	preempt_disable();
-	if (mm->context.id != NO_CONTEXT) {
-		clear_bit(mm->context.id, context_map);
-		mm->context.id = NO_CONTEXT;
-		atomic_inc(&nr_free_contexts);
+	unsigned int id;
+
+	if (mm->context.id == MMU_NO_CONTEXT)
+		return;
+
+	WARN_ON(mm->context.active != 0);
+
+	spin_lock(&context_lock);
+	id = mm->context.id;
+	if (id != MMU_NO_CONTEXT) {
+		__clear_bit(id, context_map);
+		mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+		mm->context.active = 0;
+		context_mm[id] = NULL;
+#endif
+		nr_free_contexts++;
 	}
-	preempt_enable();
+	spin_unlock(&context_lock);
 }
 
 
@@ -149,6 +317,12 @@ void destroy_context(struct mm_struct *mm)
  */
 void __init mmu_context_init(void)
 {
+	/* Mark init_mm as being active on all possible CPUs since
+	 * we'll get called with prev == init_mm the first time
+	 * we schedule on a given CPU
+	 */
+	init_mm.context.active = NR_CPUS;
+
 	/*
 	 * Some processors have too few contexts to reserve one for
 	 * init_mm, and require using context 0 for a normal task.
@@ -156,7 +330,7 @@ void __init mmu_context_init(void)
 	 * This code assumes FIRST_CONTEXT < 32.
 	 */
 	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_mmu_context = FIRST_CONTEXT;
-	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+	next_context = FIRST_CONTEXT;
+	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
 }
 
-- 
cgit v1.2.3


From 7c03d653cd257793dc40520c94e229b5fd0578e7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:32 +0000
Subject: powerpc/mm: Introduce MMU features

We're soon running out of CPU features and I need to add some new
ones for various MMU related bits, so this patch separates the MMU
features from the CPU features.  I moved over the 32-bit MMU related
ones, added base features for MMU type families, but didn't move
over any 64-bit only feature yet.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/ppc_mmu_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 6aa12081377..9d97db7b7cf 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -192,7 +192,7 @@ void __init MMU_init_hw(void)
 	extern unsigned int hash_page[];
 	extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
 
-	if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) {
+	if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
 		/*
 		 * Put a blr (procedure return) instruction at the
 		 * start of hash_page, since we can still get DSI
-- 
cgit v1.2.3


From f048aace29e007f2b642097e2da8231e0e9cce2d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:38 +0000
Subject: powerpc/mm: Add SMP support to no-hash TLB handling

This commit moves the whole no-hash TLB handling out of line into a
new tlb_nohash.c file, and implements some basic SMP support using
IPIs and/or broadcast tlbivax instructions.

Note that I'm using local invalidations for D->I cache coherency.

At worst, if another processor is trying to execute the same and
has the old entry in its TLB, it will just take a fault and re-do
the TLB flush locally (it won't re-do the cache flush in any case).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/Makefile     |   2 +-
 arch/powerpc/mm/fault.c      |   2 +-
 arch/powerpc/mm/mem.c        |   2 +-
 arch/powerpc/mm/tlb_hash32.c |   4 +
 arch/powerpc/mm/tlb_nohash.c | 209 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 216 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/mm/tlb_nohash.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 923bd3fa7d6..af987df8d5a 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -9,7 +9,7 @@ endif
 obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7df0409107a..87f1f955dea 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -284,7 +284,7 @@ good_area:
 				}
 				pte_update(ptep, 0, _PAGE_HWEXEC |
 					   _PAGE_ACCESSED);
-				_tlbie(address, mm->context.id);
+				local_flush_tlb_page(vma, address);
 				pte_unmap_unlock(ptep, ptl);
 				up_read(&mm->mmap_sem);
 				return 0;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index b9e1a1da6e5..8fee696fb79 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -488,7 +488,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		 * we invalidate the TLB here, thus avoiding dcbst
 		 * misbehaviour.
 		 */
-		_tlbie(address, 0 /* 8xx doesn't care about PID */);
+		_tlbil_va(address, 0 /* 8xx doesn't care about PID */);
 #endif
 		/* The _PAGE_USER test should really be _PAGE_EXEC, but
 		 * older glibc versions execute some code from no-exec
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index f9a47fee392..65190587a36 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -137,6 +137,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	flush_range(&init_mm, start, end);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_kernel_range);
 
 /*
  * Flush all the (user) entries for the address space described by mm.
@@ -160,6 +161,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_mm);
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
@@ -176,6 +178,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_page);
 
 /*
  * For each address in the range, find the pte for the address
@@ -188,3 +191,4 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	flush_range(vma->vm_mm, start, end);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_range);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
new file mode 100644
index 00000000000..803a64c02b0
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -0,0 +1,209 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (ie, SW loaded TLBs or Book3E compilant processors,
+ * this does -not- include 603 however which shares the implementation with
+ * hash based processors)
+ *
+ *  -- BenH
+ *
+ * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                IBM Corp.
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_pid(pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = vma ? vma->vm_mm->context.id : 0;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_va(vmaddr, pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_SPINLOCK(tlbivax_lock);
+
+struct tlb_flush_param {
+	unsigned long addr;
+	unsigned int pid;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_va(p->addr, p->pid);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invaliating some target that isn't currently running here
+ *   and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	cpumask_t cpu_mask;
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto no_context;
+	cpu_mask = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+	if (!cpus_empty(cpu_mask)) {
+		struct tlb_flush_param p = { .pid = pid };
+		smp_call_function_mask(cpu_mask, do_flush_tlb_mm_ipi, &p, 1);
+	}
+	_tlbil_pid(pid);
+ no_context:
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	cpumask_t cpu_mask;
+	unsigned int pid;
+
+	preempt_disable();
+	pid = vma ? vma->vm_mm->context.id : 0;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto bail;
+	cpu_mask = vma->vm_mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+	if (!cpus_empty(cpu_mask)) {
+		/* If broadcast tlbivax is supported, use it */
+		if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
+			int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
+			if (lock)
+				spin_lock(&tlbivax_lock);
+			_tlbivax_bcast(vmaddr, pid);
+			if (lock)
+				spin_unlock(&tlbivax_lock);
+			goto bail;
+		} else {
+			struct tlb_flush_param p = { .pid = pid, .addr = vmaddr };
+			smp_call_function_mask(cpu_mask,
+					       do_flush_tlb_page_ipi, &p, 1);
+		}
+	}
+	_tlbil_va(vmaddr, pid);
+ bail:
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_SMP
+	preempt_disable();
+	smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
+	_tlbil_pid(0);
+	preempt_enable();
+#endif
+	_tlbil_pid(0);
+}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementation can stack multiple tlbivax before a tlbsync but
+ * for now, we keep it that way
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+
+{
+	flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
-- 
cgit v1.2.3


From 2a4aca1144394653269720ffbb5a325a77abd5fa Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:42 +0000
Subject: powerpc/mm: Split low level tlb invalidate for nohash processors

Currently, the various forms of low level TLB invalidations are all
implemented in misc_32.S for 32-bit processors, in a fairly scary
mess of #ifdef's and with interesting duplication such as a whole
bunch of code for FSL _tlbie and _tlbia which are no longer used.

This moves things around such that _tlbie is now defined in
hash_low_32.S and is only used by the 32-bit hash code, and all
nohash CPUs use the various _tlbil_* forms that are now moved to
a new file, tlb_nohash_low.S.

I moved all the definitions for that stuff out of
include/asm/tlbflush.h as they are really internal mm stuff, into
mm/mmu_decl.h

The code should have no functional changes.  I kept some variants
inline for trivial forms on things like 40x and 8xx.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/Makefile         |   3 +-
 arch/powerpc/mm/hash_low_32.S    |  76 ++++++++++++++++++
 arch/powerpc/mm/mmu_decl.h       |  48 ++++++++++++
 arch/powerpc/mm/tlb_nohash_low.S | 165 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 291 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/mm/tlb_nohash_low.S

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index af987df8d5a..953cc4a1cde 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -9,7 +9,8 @@ endif
 obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
+				   tlb_nohash_low.o
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index c5536b8b37a..c8eac22a8f0 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -633,3 +633,79 @@ _GLOBAL(flush_hash_patch_B)
 	SYNC_601
 	isync
 	blr
+
+/*
+ * Flush an entry from the TLB
+ */
+_GLOBAL(_tlbie)
+#ifdef CONFIG_SMP
+	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,11
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+	lis	r9,mmu_hash_lock@h
+	ori	r9,r9,mmu_hash_lock@l
+	tophys(r9,r9)
+10:	lwarx	r7,0,r9
+	cmpwi	0,r7,0
+	bne-	10b
+	stwcx.	r8,0,r9
+	bne-	10b
+	eieio
+	tlbie	r3
+	sync
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	mtmsr	r10
+	SYNC_601
+	isync
+#else /* CONFIG_SMP */
+	tlbie	r3
+	sync
+#endif /* CONFIG_SMP */
+	blr
+
+/*
+ * Flush the entire TLB. 603/603e only
+ */
+_GLOBAL(_tlbia)
+#if defined(CONFIG_SMP)
+	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,10
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+	lis	r9,mmu_hash_lock@h
+	ori	r9,r9,mmu_hash_lock@l
+	tophys(r9,r9)
+10:	lwarx	r7,0,r9
+	cmpwi	0,r7,0
+	bne-	10b
+	stwcx.	r8,0,r9
+	bne-	10b
+	sync
+	tlbia
+	sync
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	mtmsr	r10
+	SYNC_601
+	isync
+#else /* CONFIG_SMP */
+	sync
+	tlbia
+	sync
+#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index b4344fd30f2..4314b39b6fa 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -22,10 +22,58 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_PPC_MMU_NOHASH
+
+/*
+ * On 40x and 8xx, we directly inline tlbia and tlbivax
+ */
+#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+static inline void _tlbil_all(void)
+{
+	asm volatile ("sync; tlbia; isync" : : : "memory")
+}
+static inline void _tlbil_pid(unsigned int pid)
+{
+	asm volatile ("sync; tlbia; isync" : : : "memory")
+}
+#else /* CONFIG_40x || CONFIG_8xx */
+extern void _tlbil_all(void);
+extern void _tlbil_pid(unsigned int pid);
+#endif /* !(CONFIG_40x || CONFIG_8xx) */
+
+/*
+ * On 8xx, we directly inline tlbie, on others, it's extern
+ */
+#ifdef CONFIG_8xx
+static inline void _tlbil_va(unsigned long address, unsigned int pid)
+{
+	asm volatile ("tlbie %0; sync" : : "r" (address) : "memory")
+}
+#else /* CONFIG_8xx */
+extern void _tlbil_va(unsigned long address, unsigned int pid);
+#endif /* CONIFG_8xx */
+
+/*
+ * As of today, we don't support tlbivax broadcast on any
+ * implementation. When that becomes the case, this will be
+ * an extern.
+ */
+static inline void _tlbivax_bcast(unsigned long address, unsigned int pid)
+{
+	BUG();
+}
+
+#else /* CONFIG_PPC_MMU_NOHASH */
+
 extern void hash_preload(struct mm_struct *mm, unsigned long ea,
 			 unsigned long access, unsigned long trap);
 
 
+extern void _tlbie(unsigned long address);
+extern void _tlbia(void);
+
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
 #ifdef CONFIG_PPC32
 extern void mapin_ram(void);
 extern int map_page(unsigned long va, phys_addr_t pa, int flags);
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
new file mode 100644
index 00000000000..763c59fe007
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -0,0 +1,165 @@
+/*
+ * This file contains low-level functions for performing various
+ * types of TLB invalidations on various processors with no hash
+ * table.
+ *
+ * This file implements the following functions for all no-hash
+ * processors. Some aren't implemented for some variants. Some
+ * are inline in tlbflush.h
+ *
+ *	- tlbil_va
+ *	- tlbil_pid
+ *	- tlbil_all
+ *	- tlbivax_bcast (not yet)
+ *
+ * Code mostly moved over from misc_32.S
+ *
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+
+#if defined(CONFIG_40x)
+
+/*
+ * 40x implementation needs only tlbil_va
+ */
+_GLOBAL(_tlbil_va)
+	/* We run the search with interrupts disabled because we have to change
+	 * the PID and I don't want to preempt when that happens.
+	 */
+	mfmsr	r5
+	mfspr	r6,SPRN_PID
+	wrteei	0
+	mtspr	SPRN_PID,r4
+	tlbsx.	r3, 0, r3
+	mtspr	SPRN_PID,r6
+	wrtee	r5
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
+	 * clear. Since 25 is the V bit in the TLB_TAG, loading this value
+	 * will invalidate the TLB entry. */
+	tlbwe	r3, r3, TLB_TAG
+	isync
+1:	blr
+
+#elif defined(CONFIG_8xx)
+
+/*
+ * Nothing to do for 8xx, everything is inline
+ */
+
+#elif defined(CONFIG_44x)
+
+/*
+ * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
+ * of the TLB for everything else.
+ */
+_GLOBAL(_tlbil_va)
+	mfspr	r5,SPRN_MMUCR
+	rlwimi	r5,r4,0,24,31			/* Set TID */
+
+	/* We have to run the search with interrupts disabled, even critical
+	 * and debug interrupts (in fact the only critical exceptions we have
+	 * are debug and machine check).  Otherwise  an interrupt which causes
+	 * a TLB miss can clobber the MMUCR between the mtspr and the tlbsx. */
+	mfmsr	r4
+	lis	r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@ha
+	addi	r6,r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
+	andc	r6,r4,r6
+	mtmsr	r6
+	mtspr	SPRN_MMUCR,r5
+	tlbsx.	r3, 0, r3
+	mtmsr	r4
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64,
+	 * which means bit 22, is clear.  Since 22 is
+	 * the V bit in the TLB_PAGEID, loading this
+	 * value will invalidate the TLB entry.
+	 */
+	tlbwe	r3, r3, PPC44x_TLB_PAGEID
+	isync
+1:	blr
+
+_GLOBAL(_tlbil_all)
+_GLOBAL(_tlbil_pid)
+	li	r3,0
+	sync
+
+	/* Load high watermark */
+	lis	r4,tlb_44x_hwater@ha
+	lwz	r5,tlb_44x_hwater@l(r4)
+
+1:	tlbwe	r3,r3,PPC44x_TLB_PAGEID
+	addi	r3,r3,1
+	cmpw	0,r3,r5
+	ble	1b
+
+	isync
+	blr
+
+#elif defined(CONFIG_FSL_BOOKE)
+/*
+ * FSL BookE implementations. Currently _pid and _all are the
+ * same. This will change when tlbilx is actually supported and
+ * performs invalidate-by-PID. This change will be driven by
+ * mmu_features conditional
+ */
+
+/*
+ * Flush MMU TLB on the local processor
+ */
+_GLOBAL(_tlbil_pid)
+_GLOBAL(_tlbil_all)
+#define MMUCSR0_TLBFI	(MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
+			 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+	msync
+	isync
+	blr
+
+/*
+ * Flush MMU TLB for a particular address, but only on the local processor
+ * (no broadcast)
+ */
+_GLOBAL(_tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	slwi	r4,r4,16
+	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+	tlbsx	0,r3
+	mfspr	r4,SPRN_MAS1		/* check valid */
+	andis.	r3,r4,MAS1_VALID@h
+	beq	1f
+	rlwinm	r4,r4,0,1,31
+	mtspr	SPRN_MAS1,r4
+	tlbwe
+	msync
+	isync
+1:	wrtee	r10
+	blr
+#elif
+#error Unsupported processor type !
+#endif
-- 
cgit v1.2.3


From 760ec0e02d8a13d0ed60d99f47879d4aa8ef1910 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:46 +0000
Subject: powerpc/44x: No need to mask MSR:CE, ME or DE in _tlbil_va on 440

The handlers for Critical, Machine Check or Debug interrupts
will save and restore MMUCR nowadays, thus we only need to
disable normal interrupts when invalidating TLB entries.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/tlb_nohash_low.S | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 763c59fe007..f900a39e6ec 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -75,18 +75,19 @@ _GLOBAL(_tlbil_va)
 	mfspr	r5,SPRN_MMUCR
 	rlwimi	r5,r4,0,24,31			/* Set TID */
 
-	/* We have to run the search with interrupts disabled, even critical
-	 * and debug interrupts (in fact the only critical exceptions we have
-	 * are debug and machine check).  Otherwise  an interrupt which causes
-	 * a TLB miss can clobber the MMUCR between the mtspr and the tlbsx. */
+	/* We have to run the search with interrupts disabled, otherwise
+	 * an interrupt which causes a TLB miss can clobber the MMUCR
+	 * between the mtspr and the tlbsx.
+	 *
+	 * Critical and Machine Check interrupts take care of saving
+	 * and restoring MMUCR, so only normal interrupts have to be
+	 * taken care of.
+	 */
 	mfmsr	r4
-	lis	r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@ha
-	addi	r6,r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
-	andc	r6,r4,r6
-	mtmsr	r6
+	wrteei	0
 	mtspr	SPRN_MMUCR,r5
 	tlbsx.	r3, 0, r3
-	mtmsr	r4
+	wrtee	r4
 	bne	1f
 	sync
 	/* There are only 64 TLB entries, so r3 < 64,
-- 
cgit v1.2.3


From 77520351805cc19ba37394ae33f862ef6d3c2a23 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:48 +0000
Subject: powerpc/mm: Runtime allocation of mmu context maps for nohash CPUs

This makes the MMU context code used for CPUs with no hash table
(except 603) dynamically allocate the various maps used to track
the state of contexts.

Only the main free map and CPU 0 stale map are allocated at boot
time.  Other CPU maps are allocated when those CPUs are brought up
and freed if they are unplugged.

This also moves the initialization of the MMU context management
slightly later during the boot process, which should be fine as
it's really only needed when userland if first started anyways.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/init_32.c            |   4 -
 arch/powerpc/mm/mmu_context_nohash.c | 161 ++++++++++++++++++++++++-----------
 2 files changed, 111 insertions(+), 54 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 388ceda632f..578294c3b1c 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -35,7 +35,6 @@
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
-#include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/smp.h>
@@ -180,9 +179,6 @@ void __init MMU_init(void)
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:setio", 0x302);
 
-	/* Initialize the context management stuff */
-	mmu_context_init();
-
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:exit", 0x211);
 
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 8b5de52de0a..52a0cfc38b6 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -28,54 +28,30 @@
 #undef DEBUG
 #define DEBUG_STEAL_ONLY
 #undef DEBUG_MAP_CONSISTENCY
+/*#define DEBUG_CLAMP_LAST_CONTEXT   15 */
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
 
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
-#include <linux/spinlock.h>
-
-/*
- *   The MPC8xx has only 16 contexts.  We rotate through them on each
- * task switch.  A better way would be to keep track of tasks that
- * own contexts, and implement an LRU usage.  That way very active
- * tasks don't always have to pay the TLB reload overhead.  The
- * kernel pages are mapped shared, so the kernel can run on behalf
- * of any task that makes a kernel entry.  Shared does not mean they
- * are not protected, just that the ASID comparison is not performed.
- *      -- Dan
- *
- * The IBM4xx has 256 contexts, so we can just rotate through these
- * as a way of "switching" contexts.  If the TID of the TLB is zero,
- * the PID/TID comparison is disabled, so we can use a TID of zero
- * to represent all kernel pages as shared among all contexts.
- * 	-- Dan
- */
-
-#ifdef CONFIG_8xx
-#define LAST_CONTEXT    	15
-#define FIRST_CONTEXT    	0
-
-#elif defined(CONFIG_4xx)
-#define LAST_CONTEXT    	255
-#define FIRST_CONTEXT    	1
-
-#elif defined(CONFIG_E200) || defined(CONFIG_E500)
-#define LAST_CONTEXT    	255
-#define FIRST_CONTEXT    	1
-
-#else
-#error Unsupported processor type
-#endif
 
+static unsigned int first_context, last_context;
 static unsigned int next_context, nr_free_contexts;
-static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-static unsigned long stale_map[NR_CPUS][LAST_CONTEXT / BITS_PER_LONG + 1];
-static struct mm_struct *context_mm[LAST_CONTEXT+1];
+static unsigned long *context_map;
+static unsigned long *stale_map[NR_CPUS];
+static struct mm_struct **context_mm;
 static spinlock_t context_lock = SPIN_LOCK_UNLOCKED;
 
+#define CTX_MAP_SIZE	\
+	(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+
+
 /* Steal a context from a task that has one at the moment.
  *
  * This is used when we are running out of available PID numbers
@@ -98,7 +74,7 @@ static unsigned int steal_context_smp(unsigned int id)
 	unsigned int cpu, max;
 
  again:
-	max = LAST_CONTEXT - FIRST_CONTEXT;
+	max = last_context - first_context;
 
 	/* Attempt to free next_context first and then loop until we manage */
 	while (max--) {
@@ -110,8 +86,8 @@ static unsigned int steal_context_smp(unsigned int id)
 		 */
 		if (mm->context.active) {
 			id++;
-			if (id > LAST_CONTEXT)
-				id = FIRST_CONTEXT;
+			if (id > last_context)
+				id = first_context;
 			continue;
 		}
 		pr_debug("[%d] steal context %d from mm @%p\n",
@@ -169,7 +145,7 @@ static void context_check_map(void)
 	unsigned int id, nrf, nact;
 
 	nrf = nact = 0;
-	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+	for (id = first_context; id <= last_context; id++) {
 		int used = test_bit(id, context_map);
 		if (!used)
 			nrf++;
@@ -187,6 +163,8 @@ static void context_check_map(void)
 	if (nact > num_online_cpus())
 		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
 		       nact, num_online_cpus());
+	if (first_context > 0 && !test_bit(0, context_map))
+		pr_err("MMU: Context 0 has been freed !!!\n");
 }
 #else
 static void context_check_map(void) { }
@@ -209,6 +187,10 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	/* Mark us active and the previous one not anymore */
 	next->context.active++;
 	if (prev) {
+#ifndef DEBUG_STEAL_ONLY
+		pr_debug(" old context %p active was: %d\n",
+			 prev, prev->context.active);
+#endif
 		WARN_ON(prev->context.active < 1);
 		prev->context.active--;
 	}
@@ -221,8 +203,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 
 	/* We really don't have a context, let's try to acquire one */
 	id = next_context;
-	if (id > LAST_CONTEXT)
-		id = FIRST_CONTEXT;
+	if (id > last_context)
+		id = first_context;
 	map = context_map;
 
 	/* No more free contexts, let's try to steal one */
@@ -240,9 +222,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 
 	/* We know there's at least one free context, try to find it */
 	while (__test_and_set_bit(id, map)) {
-		id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
-		if (id > LAST_CONTEXT)
-			id = FIRST_CONTEXT;
+		id = find_next_zero_bit(map, last_context+1, id);
+		if (id > last_context)
+			id = first_context;
 	}
  stolen:
 	next_context = id + 1;
@@ -311,6 +293,42 @@ void destroy_context(struct mm_struct *mm)
 	spin_unlock(&context_lock);
 }
 
+#ifdef CONFIG_SMP
+
+static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
+					    unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+
+	/* We don't touch CPU 0 map, it's allocated at aboot and kept
+	 * around forever
+	 */
+	if (cpu == 0)
+		return NOTIFY_OK;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		pr_debug("MMU: Allocating stale context map for CPU %d\n", cpu);
+		stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		pr_debug("MMU: Freeing stale context map for CPU %d\n", cpu);
+		kfree(stale_map[cpu]);
+		stale_map[cpu] = NULL;
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+	.notifier_call	= mmu_context_cpu_notify,
+};
+
+#endif /* CONFIG_SMP */
 
 /*
  * Initialize the context management stuff.
@@ -323,14 +341,57 @@ void __init mmu_context_init(void)
 	 */
 	init_mm.context.active = NR_CPUS;
 
+	/*
+	 *   The MPC8xx has only 16 contexts.  We rotate through them on each
+	 * task switch.  A better way would be to keep track of tasks that
+	 * own contexts, and implement an LRU usage.  That way very active
+	 * tasks don't always have to pay the TLB reload overhead.  The
+	 * kernel pages are mapped shared, so the kernel can run on behalf
+	 * of any task that makes a kernel entry.  Shared does not mean they
+	 * are not protected, just that the ASID comparison is not performed.
+	 *      -- Dan
+	 *
+	 * The IBM4xx has 256 contexts, so we can just rotate through these
+	 * as a way of "switching" contexts.  If the TID of the TLB is zero,
+	 * the PID/TID comparison is disabled, so we can use a TID of zero
+	 * to represent all kernel pages as shared among all contexts.
+	 * 	-- Dan
+	 */
+	if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
+		first_context = 0;
+		last_context = 15;
+	} else {
+		first_context = 1;
+		last_context = 255;
+	}
+
+#ifdef DEBUG_CLAMP_LAST_CONTEXT
+	last_context = DEBUG_CLAMP_LAST_CONTEXT;
+#endif
+	/*
+	 * Allocate the maps used by context management
+	 */
+	context_map = alloc_bootmem(CTX_MAP_SIZE);
+	context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
+	stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
+
+#ifdef CONFIG_SMP
+	register_cpu_notifier(&mmu_context_cpu_nb);
+#endif
+
+	printk(KERN_INFO
+	       "MMU: Allocated %d bytes of context maps for %d contexts\n",
+	       2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
+	       last_context - first_context + 1);
+
 	/*
 	 * Some processors have too few contexts to reserve one for
 	 * init_mm, and require using context 0 for a normal task.
 	 * Other processors reserve the use of context zero for the kernel.
-	 * This code assumes FIRST_CONTEXT < 32.
+	 * This code assumes first_context < 32.
 	 */
-	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_context = FIRST_CONTEXT;
-	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
+	context_map[0] = (1 << first_context) - 1;
+	next_context = first_context;
+	nr_free_contexts = last_context - first_context + 1;
 }
 
-- 
cgit v1.2.3


From 64b3d0e8122b422e879b23d42f9e0e8efbbf9744 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:51 +0000
Subject: powerpc/mm: Rework usage of _PAGE_COHERENT/NO_CACHE/GUARDED

Currently, we never set _PAGE_COHERENT in the PTEs, we just OR it in
in the hash code based on some CPU feature bit.  We also manipulate
_PAGE_NO_CACHE and _PAGE_GUARDED by hand in all sorts of places.

This changes the logic so that instead, the PTE now contains
_PAGE_COHERENT for all normal RAM pages thay have I = 0 on platforms
that need it.  The hash code clears it if the feature bit is not set.

It also adds some clean accessors to setup various valid combinations
of access flags and change various bits of code to use them instead.

This should help having the PTE actually containing the bit
combinations that we really want.

I also removed _PAGE_GUARDED from _PAGE_BASE on 44x and instead
set it explicitely from the TLB miss.  I will ultimately remove it
completely as it appears that it might not be needed after all
but in the meantime, having it in the TLB miss makes things a
lot easier.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/hash_low_32.S | 4 ++--
 arch/powerpc/mm/mem.c         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index c8eac22a8f0..28845604a10 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -323,8 +323,8 @@ _GLOBAL(create_hpte)
 	ori	r8,r8,0xe14		/* clear out reserved bits and M */
 	andc	r8,r5,r8		/* PP = user? (rw&dirty? 2: 3): 0 */
 BEGIN_FTR_SECTION
-	ori	r8,r8,_PAGE_COHERENT	/* set M (coherence required) */
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+	rlwinm	r8,r8,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
 #ifdef CONFIG_PTE_64BIT
 	/* Put the XPN bits into the PTE */
 	rlwimi	r8,r10,8,20,22
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8fee696fb79..53b06ebb3f2 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -102,8 +102,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 		return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
 
 	if (!page_is_ram(pfn))
-		vma_prot = __pgprot(pgprot_val(vma_prot)
-				    | _PAGE_GUARDED | _PAGE_NO_CACHE);
+		vma_prot = pgprot_noncached(vma_prot);
+
 	return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
-- 
cgit v1.2.3


From a14953597b771f793ce32529d7b8b04fdedca3ef Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 21 Dec 2008 02:54:25 -0700
Subject: powerpc: Fix missing 'blr' in _tlbia()

Rework to MMU code dropped a much missed 'blr' instruction.

Brown-Paper-Bag-Worn-By: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/mm/hash_low_32.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 28845604a10..67850ec9feb 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -709,3 +709,4 @@ _GLOBAL(_tlbia)
 	tlbia
 	sync
 #endif /* CONFIG_SMP */
+	blr
-- 
cgit v1.2.3


From 01695a9687e5a8d78589605037cc7828a5b67ac9 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Wed, 17 Dec 2008 10:09:10 +0000
Subject: powerpc/32: Allow __ioremap on RAM addresses for kdump kernel

While for debugging it is good to catch bogus users of ioremap, though
for kdump support it is more convenient to use __ioremap for
copy_oldmem_page() (exactly as we do for PPC64 currently).

Note that copy_oldmem_page() calls __ioremap with flags set to '0',
so it should be safe with the regard to the caches.

The other option is to use kmap_atomic_pfn()[1], but it will not work
for kernels compiled without HIGHMEM.

That is, on a board with 256MB RAM and crashkernel=64M@32M case, the
!HIGHMEM capturing kernel maps 0-96M range, which does not include all
the memory needed to capture the dump. And, obviously, accessing
anything upper than 96M will cause faults.

[1] http://ozlabs.org/pipermail/linuxppc-dev/2007-November/046747.html

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/pgtable_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 34147244013..cd5609759d4 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -173,6 +173,7 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 	if (p < 16*1024*1024)
 		p += _ISA_MEM_BASE;
 
+#ifndef CONFIG_CRASH_DUMP
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using.
 	 * mem_init() sets high_memory so only do the check after that.
@@ -182,6 +183,7 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 		       (unsigned long long)p, __builtin_return_address(0));
 		return NULL;
 	}
+#endif
 
 	if (size == 0)
 		return NULL;
-- 
cgit v1.2.3


From ccdcef72c249c289898b164eada89a61855b9287 Mon Sep 17 00:00:00 2001
From: Dale Farnsworth <dale@farnsworth.org>
Date: Wed, 17 Dec 2008 10:09:13 +0000
Subject: powerpc/32: Add the ability for a classic ppc kernel to be loaded at
 32M

Add the ability for a classic ppc kernel to be loaded at an address
of 32MB.  This done by fixing a few places that assume we are loaded
at address 0, and by changing several uses of KERNELBASE to use
PAGE_OFFSET, instead.

Signed-off-by: Dale Farnsworth <dale@farnsworth.org>
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/init_32.c    | 2 +-
 arch/powerpc/mm/pgtable_32.c | 4 ++--
 arch/powerpc/mm/ppc_mmu_32.c | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 578294c3b1c..666a5e8a5be 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -48,7 +48,7 @@
 
 #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
 /* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */
-#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE))
+#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
 #error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
 #endif
 #endif
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index cd5609759d4..8cba46fc9e3 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -269,7 +269,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
 }
 
 /*
- * Map in a big chunk of physical memory starting at KERNELBASE.
+ * Map in a big chunk of physical memory starting at PAGE_OFFSET.
  */
 void __init mapin_ram(void)
 {
@@ -278,7 +278,7 @@ void __init mapin_ram(void)
 	int ktext;
 
 	s = mmu_mapin_ram();
-	v = KERNELBASE + s;
+	v = PAGE_OFFSET + s;
 	p = memstart_addr + s;
 	for (; s < total_lowmem; s += PAGE_SIZE) {
 		ktext = ((char *) v >= _stext && (char *) v < etext);
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 9d97db7b7cf..45d925360b8 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -95,16 +95,16 @@ unsigned long __init mmu_mapin_ram(void)
 			break;
 	}
 
-	setbat(2, KERNELBASE, 0, bl, _PAGE_RAM);
-	done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1;
+	setbat(2, PAGE_OFFSET, 0, bl, _PAGE_RAM);
+	done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
 	if ((done < tot) && !bat_addrs[3].limit) {
 		/* use BAT3 to cover a bit more */
 		tot -= done;
 		for (bl = 128<<10; bl < max_size; bl <<= 1)
 			if (bl * 2 > tot)
 				break;
-		setbat(3, KERNELBASE+done, done, bl, _PAGE_RAM);
-		done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1;
+		setbat(3, PAGE_OFFSET+done, done, bl, _PAGE_RAM);
+		done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
 	}
 
 	return done;
-- 
cgit v1.2.3


From ca9153a3a2a7556d091dfe080e42b0e67881fff6 Mon Sep 17 00:00:00 2001
From: Ilya Yanok <yanok@emcraft.com>
Date: Thu, 11 Dec 2008 04:55:41 +0300
Subject: powerpc/44x: Support 16K/64K base page sizes on 44x

This adds support for 16k and 64k page sizes on PowerPC 44x processors.

The PGDIR table is much smaller than a page when using 16k or 64k
pages (512 and 32 bytes respectively) so we allocate the PGDIR with
kzalloc() instead of __get_free_pages().

One PTE table covers rather a large memory area when using 16k or 64k
pages (32MB or 512MB respectively), so we can easily put FIXMAP and
PKMAP in the area covered by one PTE table.

Signed-off-by: Yuri Tikhonov <yur@emcraft.com>
Signed-off-by: Vladimir Panfilov <pvr@emcraft.com>
Signed-off-by: Ilya Yanok <yanok@emcraft.com>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/pgtable_32.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 8cba46fc9e3..38ff35f2142 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -68,24 +68,29 @@ extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
 #define p_mapped_by_tlbcam(x)	(0UL)
 #endif /* HAVE_TLBCAM */
 
-#ifdef CONFIG_PTE_64BIT
-/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */
-#define PGDIR_ORDER	1
-#else
-#define PGDIR_ORDER	0
-#endif
+#define PGDIR_ORDER	(32 + PGD_T_LOG2 - PGDIR_SHIFT)
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *ret;
 
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
+	/* pgdir take page or two with 4K pages and a page fraction otherwise */
+#ifndef CONFIG_PPC_4K_PAGES
+	ret = (pgd_t *)kzalloc(1 << PGDIR_ORDER, GFP_KERNEL);
+#else
+	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+			PGDIR_ORDER - PAGE_SHIFT);
+#endif
 	return ret;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	free_pages((unsigned long)pgd, PGDIR_ORDER);
+#ifndef CONFIG_PPC_4K_PAGES
+	kfree((void *)pgd);
+#else
+	free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
+#endif
 }
 
 __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -385,7 +390,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
 static int fixmaps;
-unsigned long FIXADDR_TOP = 0xfffff000;
+unsigned long FIXADDR_TOP = (-PAGE_SIZE);
 EXPORT_SYMBOL(FIXADDR_TOP);
 
 void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
-- 
cgit v1.2.3