[POWERPC] Make tlb flush batch use lazy MMU mode

The current tlb flush code on powerpc 64 bits has a subtle race since we lost the page table lock due to the possible faulting in of new PTEs after a previous one has been removed but before the corresponding hash entry has been evicted, which can leads to all sort of fatal problems. This patch reworks the batch code completely. It doesn't use the mmu_gather stuff anymore. Instead, we use the lazy mmu hooks that were added by the paravirt code. They have the nice property that the enter/leave lazy mmu mode pair is always fully contained by the PTE lock for a given range of PTEs. Thus we can guarantee that all batches are flushed on a given CPU before it drops that lock. We also generalize batching for any PTE update that require a flush. Batching is now enabled on a CPU by arch_enter_lazy_mmu_mode() and disabled by arch_leave_lazy_mmu_mode(). The code epects that this is always contained within a PTE lock section so no preemption can happen and no PTE insertion in that range from another CPU. When batching is enabled on a CPU, every PTE updates that need a hash flush will use the batch for that flush. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2007-04-10 17:09:37 +1000
committer: Paul Mackerras <paulus@samba.org> 2007-04-13 04:09:38 +1000
commit: a741e67969577163a4cfc78d7fd2753219087ef1 (patch)
tree: bac4162aaf15367e896429afa60465e201c9204c /arch/powerpc
parent: e4ee3891db35aa9a069bb403c2a66a8fbfa274d6 (diff)
4 files changed, 49 insertions, 43 deletions
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 949092dccf4..e509aae2feb 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -305,9 +305,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		set_dabr(new->thread.dabr);
 		__get_cpu_var(current_dabr) = new->thread.dabr;
 	}
-
-	flush_tlb_pending();
-#endif
+#endif /* CONFIG_PPC64 */
 
 	new_thread = &new->thread;
 	old_thread = &current->thread;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 924d692bc8f..d8e503b2e1a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -428,10 +428,6 @@ void generic_mach_cpu_die(void)
 	smp_wmb();
 	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
 		cpu_relax();
-
-#ifdef CONFIG_PPC64
-	flush_tlb_pending();
-#endif
 	cpu_set(cpu, cpu_online_map);
 	local_irq_enable();
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f6ffaaa7a5b..8508f973d9c 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -316,12 +316,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 {
 	if (pte_present(*ptep)) {
 		/* We open-code pte_clear because we need to pass the right
-		 * argument to hpte_update (huge / !huge)
+		 * argument to hpte_need_flush (huge / !huge). Might not be
+		 * necessary anymore if we make hpte_need_flush() get the
+		 * page size from the slices
 		 */
-		unsigned long old = pte_update(ptep, ~0UL);
-		if (old & _PAGE_HASHPTE)
-			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
-		flush_tlb_pending();
+		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
 	}
 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
@@ -329,12 +328,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
-	unsigned long old = pte_update(ptep, ~0UL);
-
-	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
-	*ptep = __pte(0);
-
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 	return __pte(old);
 }
 
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index b58baa65c4a..fd8d08c325e 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -120,17 +120,20 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 }
 
 /*
- * Update the MMU hash table to correspond with a change to
- * a Linux PTE.  If wrprot is true, it is permissible to
- * change the existing HPTE to read-only rather than removing it
- * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
+ * A linux PTE was changed and the corresponding hash table entry
+ * neesd to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
  */
-void hpte_update(struct mm_struct *mm, unsigned long addr,
-		 pte_t *ptep, unsigned long pte, int huge)
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, unsigned long pte, int huge)
 {
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid;
+	unsigned long vsid, vaddr;
 	unsigned int psize;
+	real_pte_t rpte;
 	int i;
 
 	i = batch->index;
@@ -151,6 +154,26 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
 	} else
 		psize = pte_pagesize_index(pte);
 
+	/* Build full vaddr */
+	if (!is_kernel_addr(addr)) {
+		vsid = get_vsid(mm->context.id, addr);
+		WARN_ON(vsid == 0);
+	} else
+		vsid = get_kernel_vsid(addr);
+	vaddr = (vsid << 28 ) | (addr & 0x0fffffff);
+	rpte = __real_pte(__pte(pte), ptep);
+
+	/*
+	 * Check if we have an active batch on this CPU. If not, just
+	 * flush now and return. For now, we don global invalidates
+	 * in that case, might be worth testing the mm cpu mask though
+	 * and decide to use local invalidates instead...
+	 */
+	if (!batch->active) {
+		flush_hash_page(vaddr, rpte, psize, 0);
+		return;
+	}
+
 	/*
 	 * This can happen when we are in the middle of a TLB batch and
 	 * we encounter memory pressure (eg copy_page_range when it tries
@@ -162,47 +185,42 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
 	 * batch
 	 */
 	if (i != 0 && (mm != batch->mm || batch->psize != psize)) {
-		flush_tlb_pending();
+		__flush_tlb_pending(batch);
 		i = 0;
 	}
 	if (i == 0) {
 		batch->mm = mm;
 		batch->psize = psize;
 	}
-	if (!is_kernel_addr(addr)) {
-		vsid = get_vsid(mm->context.id, addr);
-		WARN_ON(vsid == 0);
-	} else
-		vsid = get_kernel_vsid(addr);
-	batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
-	batch->pte[i] = __real_pte(__pte(pte), ptep);
+	batch->pte[i] = rpte;
+	batch->vaddr[i] = vaddr;
 	batch->index = ++i;
 	if (i >= PPC64_TLB_BATCH_NR)
-		flush_tlb_pending();
+		__flush_tlb_pending(batch);
 }
 
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
 void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 {
-	int i;
-	int cpu;
 	cpumask_t tmp;
-	int local = 0;
+	int i, local = 0;
 
-	BUG_ON(in_interrupt());
-
-	cpu = get_cpu();
 	i = batch->index;
-	tmp = cpumask_of_cpu(cpu);
+	tmp = cpumask_of_cpu(smp_processor_id());
 	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
 		local = 1;
-
 	if (i == 1)
 		flush_hash_page(batch->vaddr[0], batch->pte[0],
 				batch->psize, local);
 	else
 		flush_hash_range(i, local);
 	batch->index = 0;
-	put_cpu();
 }
 
 void pte_free_finish(void)
author	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2007-04-10 17:09:37 +1000
committer	Paul Mackerras <paulus@samba.org>	2007-04-13 04:09:38 +1000
commit	a741e67969577163a4cfc78d7fd2753219087ef1 (patch)
tree	bac4162aaf15367e896429afa60465e201c9204c /arch/powerpc
parent	e4ee3891db35aa9a069bb403c2a66a8fbfa274d6 (diff)