12 files changed, 400 insertions, 251 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 953cc4a1cde..6d2838fc879 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,7 +6,7 @@ ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS	+= -mno-minimal-toc
 endif
 
-obj-y				:= fault.o mem.o pgtable.o \
+obj-y				:= fault.o mem.o pgtable.o gup.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
@@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
-				   gup.o mmap.o $(hash-y)
+				   mmap.o $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
 				   tlb_hash$(CONFIG_WORD_SIZE).o \
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 91c7b8636b8..76993941cac 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -253,45 +253,33 @@ good_area:
 #endif /* CONFIG_8xx */
 
 	if (is_exec) {
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-		/* protection fault */
+#ifdef CONFIG_PPC_STD_MMU
+		/* Protection fault on exec go straight to failure on
+		 * Hash based MMUs as they either don't support per-page
+		 * execute permission, or if they do, it's handled already
+		 * at the hash level. This test would probably have to
+		 * be removed if we change the way this works to make hash
+		 * processors use the same I/D cache coherency mechanism
+		 * as embedded.
+		 */
 		if (error_code & DSISR_PROTFAULT)
 			goto bad_area;
+#endif /* CONFIG_PPC_STD_MMU */
+
 		/*
 		 * Allow execution from readable areas if the MMU does not
 		 * provide separate controls over reading and executing.
+		 *
+		 * Note: That code used to not be enabled for 4xx/BookE.
+		 * It is now as I/D cache coherency for these is done at
+		 * set_pte_at() time and I see no reason why the test
+		 * below wouldn't be valid on those processors. This -may-
+		 * break programs compiled with a really old ABI though.
 		 */
 		if (!(vma->vm_flags & VM_EXEC) &&
 		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
 		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
 			goto bad_area;
-#else
-		pte_t *ptep;
-		pmd_t *pmdp;
-
-		/* Since 4xx/Book-E supports per-page execute permission,
-		 * we lazily flush dcache to icache. */
-		ptep = NULL;
-		if (get_pteptr(mm, address, &ptep, &pmdp)) {
-			spinlock_t *ptl = pte_lockptr(mm, pmdp);
-			spin_lock(ptl);
-			if (pte_present(*ptep)) {
-				struct page *page = pte_page(*ptep);
-
-				if (!test_bit(PG_arch_1, &page->flags)) {
-					flush_dcache_icache_page(page);
-					set_bit(PG_arch_1, &page->flags);
-				}
-				pte_update(ptep, 0, _PAGE_HWEXEC |
-					   _PAGE_ACCESSED);
-				local_flush_tlb_page(vma, address);
-				pte_unmap_unlock(ptep, ptl);
-				up_read(&mm->mmap_sem);
-				return 0;
-			}
-			pte_unmap_unlock(ptep, ptl);
-		}
-#endif
 	/* a write */
 	} else if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index ea6e41e39d9..985b6c361ab 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -56,10 +56,14 @@
 
 extern void loadcam_entry(unsigned int index);
 unsigned int tlbcam_index;
-static unsigned long __cam0, __cam1, __cam2;
+static unsigned long cam[CONFIG_LOWMEM_CAM_NUM];
 
 #define NUM_TLBCAMS	(16)
 
+#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
+#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
+#endif
+
 struct tlbcam TLBCAM[NUM_TLBCAMS];
 
 struct tlbcamrange {
@@ -107,7 +111,7 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 	unsigned int tsize, lz;
 
 	asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size));
-	tsize = (21 - lz) / 2;
+	tsize = 21 - lz;
 
 #ifdef CONFIG_SMP
 	if ((flags & _PAGE_NO_CACHE) == 0)
@@ -152,19 +156,19 @@ void invalidate_tlbcam_entry(int index)
 	loadcam_entry(index);
 }
 
-void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1,
-		unsigned long cam2)
+unsigned long __init mmu_mapin_ram(void)
 {
-	settlbcam(0, PAGE_OFFSET, memstart_addr, cam0, _PAGE_KERNEL, 0);
-	tlbcam_index++;
-	if (cam1) {
-		tlbcam_index++;
-		settlbcam(1, PAGE_OFFSET+cam0, memstart_addr+cam0, cam1, _PAGE_KERNEL, 0);
-	}
-	if (cam2) {
+	unsigned long virt = PAGE_OFFSET;
+	phys_addr_t phys = memstart_addr;
+
+	while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) {
+		settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], _PAGE_KERNEL, 0);
+		virt += cam[tlbcam_index];
+		phys += cam[tlbcam_index];
 		tlbcam_index++;
-		settlbcam(2, PAGE_OFFSET+cam0+cam1, memstart_addr+cam0+cam1, cam2, _PAGE_KERNEL, 0);
 	}
+
+	return virt - PAGE_OFFSET;
 }
 
 /*
@@ -175,51 +179,46 @@ void __init MMU_init_hw(void)
 	flush_instruction_cache();
 }
 
-unsigned long __init mmu_mapin_ram(void)
-{
-	cam_mapin_ram(__cam0, __cam1, __cam2);
-
-	return __cam0 + __cam1 + __cam2;
-}
-
-
 void __init
 adjust_total_lowmem(void)
 {
-	phys_addr_t max_lowmem_size = __max_low_memory;
-	phys_addr_t cam_max_size = 0x10000000;
 	phys_addr_t ram;
+	unsigned int max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xff;
+	char buf[ARRAY_SIZE(cam) * 5 + 1], *p = buf;
+	int i;
+	unsigned long virt = PAGE_OFFSET & 0xffffffffUL;
+	unsigned long phys = memstart_addr & 0xffffffffUL;
 
-	/* adjust CAM size to max_lowmem_size */
-	if (max_lowmem_size < cam_max_size)
-		cam_max_size = max_lowmem_size;
+	/* Convert (4^max) kB to (2^max) bytes */
+	max_cam = max_cam * 2 + 10;
 
-	/* adjust lowmem size to max_lowmem_size */
-	ram = min(max_lowmem_size, total_lowmem);
+	/* adjust lowmem size to __max_low_memory */
+	ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
 
 	/* Calculate CAM values */
-	__cam0 = 1UL << 2 * (__ilog2(ram) / 2);
-	if (__cam0 > cam_max_size)
-		__cam0 = cam_max_size;
-	ram -= __cam0;
-	if (ram) {
-		__cam1 = 1UL << 2 * (__ilog2(ram) / 2);
-		if (__cam1 > cam_max_size)
-			__cam1 = cam_max_size;
-		ram -= __cam1;
-	}
-	if (ram) {
-		__cam2 = 1UL << 2 * (__ilog2(ram) / 2);
-		if (__cam2 > cam_max_size)
-			__cam2 = cam_max_size;
-		ram -= __cam2;
+	__max_low_memory = 0;
+	for (i = 0; ram && i < ARRAY_SIZE(cam); i++) {
+		unsigned int camsize = __ilog2(ram) & ~1U;
+		unsigned int align = __ffs(virt | phys) & ~1U;
+
+		if (camsize > align)
+			camsize = align;
+		if (camsize > max_cam)
+			camsize = max_cam;
+
+		cam[i] = 1UL << camsize;
+		ram -= cam[i];
+		__max_low_memory += cam[i];
+		virt += cam[i];
+		phys += cam[i];
+
+		p += sprintf(p, "%lu/", cam[i] >> 20);
 	}
+	for (; i < ARRAY_SIZE(cam); i++)
+		p += sprintf(p, "0/");
+	p[-1] = '\0';
 
-	printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb,"
-			" CAM2=%ldMb residual: %ldMb\n",
-			__cam0 >> 20, __cam1 >> 20, __cam2 >> 20,
-			(long int)((total_lowmem - __cam0 - __cam1 - __cam2)
-				   >> 20));
-	__max_low_memory = __cam0 + __cam1 + __cam2;
+	pr_info("Memory CAM mapping: %s Mb, residual: %dMb\n", buf,
+	        (unsigned int)((total_lowmem - __max_low_memory) >> 20));
 	__initial_memory_limit_addr = memstart_addr + __max_low_memory;
 }
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 28a114db3ba..bc400c78c97 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -14,6 +14,8 @@
 #include <linux/rwsem.h>
 #include <asm/pgtable.h>
 
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
@@ -151,8 +153,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	unsigned long addr, len, end;
 	unsigned long next;
 	pgd_t *pgdp;
-	int psize, nr = 0;
+	int nr = 0;
+#ifdef CONFIG_PPC64
 	unsigned int shift;
+	int psize;
+#endif
 
 	pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
 
@@ -205,8 +210,13 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 */
 	local_irq_disable();
 
+#ifdef CONFIG_PPC64
+	/* Those bits are related to hugetlbfs implementation and only exist
+	 * on 64-bit for now
+	 */
 	psize = get_slice_psize(mm, addr);
 	shift = mmu_psize_defs[psize].shift;
+#endif /* CONFIG_PPC64 */
 
 #ifdef CONFIG_HUGETLB_PAGE
 	if (unlikely(mmu_huge_psizes[psize])) {
@@ -236,7 +246,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		do {
 			pgd_t pgd = *pgdp;
 
+#ifdef CONFIG_PPC64
 			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
+#endif
 			pr_debug("  %016lx: normal pgd %p\n", addr,
 				 (void *)pgd_val(pgd));
 			next = pgd_addr_end(addr, end);
@@ -279,3 +291,5 @@ slow_irqon:
 		return ret;
 	}
 }
+
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8d5b4758c13..f5bc1b213f2 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -516,7 +516,7 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
 
 static unsigned long __init htab_get_table_size(void)
 {
-	unsigned long mem_size, rnd_mem_size, pteg_count;
+	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
 
 	/* If hash size isn't already provided by the platform, we try to
 	 * retrieve it from the device-tree. If it's not there neither, we
@@ -534,7 +534,8 @@ static unsigned long __init htab_get_table_size(void)
 		rnd_mem_size <<= 1;
 
 	/* # pages / 2 */
-	pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
+	psize = mmu_psize_defs[mmu_virtual_psize].shift;
+	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
 
 	return pteg_count << 7;
 }
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index f00f09a77f1..f668fa9ba80 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -472,40 +472,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 {
 #ifdef CONFIG_PPC_STD_MMU
 	unsigned long access = 0, trap;
-#endif
-	unsigned long pfn = pte_pfn(pte);
-
-	/* handle i-cache coherency */
-	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
-	    !cpu_has_feature(CPU_FTR_NOEXECUTE) &&
-	    pfn_valid(pfn)) {
-		struct page *page = pfn_to_page(pfn);
-#ifdef CONFIG_8xx
-		/* On 8xx, cache control instructions (particularly
-		 * "dcbst" from flush_dcache_icache) fault as write
-		 * operation if there is an unpopulated TLB entry
-		 * for the address in question. To workaround that,
-		 * we invalidate the TLB here, thus avoiding dcbst
-		 * misbehaviour.
-		 */
-		_tlbil_va(address, 0 /* 8xx doesn't care about PID */);
-#endif
-		/* The _PAGE_USER test should really be _PAGE_EXEC, but
-		 * older glibc versions execute some code from no-exec
-		 * pages, which for now we are supporting.  If exec-only
-		 * pages are ever implemented, this will have to change.
-		 */
-		if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER)
-		    && !test_bit(PG_arch_1, &page->flags)) {
-			if (vma->vm_mm == current->active_mm) {
-				__flush_dcache_icache((void *) address);
-			} else
-				flush_dcache_icache_page(page);
-			set_bit(PG_arch_1, &page->flags);
-		}
-	}
 
-#ifdef CONFIG_PPC_STD_MMU
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
 	if (!pte_young(pte) || address >= TASK_SIZE)
 		return;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 86010fc7d3b..0d957a4c70f 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -24,36 +24,26 @@
 
 #include <linux/personality.h>
 #include <linux/mm.h>
+#include <linux/random.h>
 #include <linux/sched.h>
 
 /*
  * Top of mmap area (just below the process stack).
  *
- * Leave an at least ~128 MB hole.
+ * Leave at least a ~128 MB hole on 32bit applications.
+ *
+ * On 64bit applications we randomise the stack by 1GB so we need to
+ * space our mmap start address by a further 1GB, otherwise there is a
+ * chance the mmap area will end up closer to the stack than our ulimit
+ * requires.
  */
-#define MIN_GAP (128*1024*1024)
+#define MIN_GAP32 (128*1024*1024)
+#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
+#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
 #define MAX_GAP (TASK_SIZE/6*5)
 
-static inline unsigned long mmap_base(void)
-{
-	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return TASK_SIZE - (gap & PAGE_MASK);
-}
-
 static inline int mmap_is_legacy(void)
 {
-	/*
-	 * Force standard allocation for 64 bit programs.
-	 */
-	if (!test_thread_flag(TIF_32BIT))
-		return 1;
-
 	if (current->personality & ADDR_COMPAT_LAYOUT)
 		return 1;
 
@@ -64,6 +54,40 @@ static inline int mmap_is_legacy(void)
 }
 
 /*
+ * Since get_random_int() returns the same value within a 1 jiffy window,
+ * we will almost always get the same randomisation for the stack and mmap
+ * region. This will mean the relative distance between stack and mmap will
+ * be the same.
+ *
+ * To avoid this we can shift the randomness by 1 bit.
+ */
+static unsigned long mmap_rnd(void)
+{
+	unsigned long rnd = 0;
+
+	if (current->flags & PF_RANDOMIZE) {
+		/* 8MB for 32bit, 1GB for 64bit */
+		if (is_32bit_task())
+			rnd = (long)(get_random_int() % (1<<(22-PAGE_SHIFT)));
+		else
+			rnd = (long)(get_random_int() % (1<<(29-PAGE_SHIFT)));
+	}
+	return (rnd << PAGE_SHIFT) * 2;
+}
+
+static inline unsigned long mmap_base(void)
+{
+	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+}
+
+/*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
  */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5ac08b8ab65..9047145095a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -158,35 +158,6 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
-{
-	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
-	struct device_node *cpu_node = NULL;
-	const unsigned int *interrupt_server, *reg;
-	int len;
-
-	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
-		/* Try interrupt server first */
-		interrupt_server = of_get_property(cpu_node,
-					"ibm,ppc-interrupt-server#s", &len);
-
-		len = len / sizeof(u32);
-
-		if (interrupt_server && (len > 0)) {
-			while (len--) {
-				if (interrupt_server[len] == hw_cpuid)
-					return cpu_node;
-			}
-		} else {
-			reg = of_get_property(cpu_node, "reg", &len);
-			if (reg && (len > 0) && (reg[0] == hw_cpuid))
-				return cpu_node;
-		}
-	}
-
-	return NULL;
-}
-
 /* must hold reference to node during call */
 static const int *of_get_associativity(struct device_node *dev)
 {
@@ -290,7 +261,7 @@ static int __init find_min_common_depth(void)
 	ref_points = of_get_property(rtas_root,
 			"ibm,associativity-reference-points", &len);
 
-	if ((len >= 1) && ref_points) {
+	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
 		depth = ref_points[1];
 	} else {
 		dbg("NUMA: ibm,associativity-reference-points not found.\n");
@@ -470,7 +441,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
 {
 	int nid = 0;
-	struct device_node *cpu = find_cpu_node(lcpu);
+	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
 
 	if (!cpu) {
 		WARN_ON(1);
@@ -652,7 +623,7 @@ static int __init parse_numa_properties(void)
 	for_each_present_cpu(i) {
 		int nid;
 
-		cpu = find_cpu_node(i);
+		cpu = of_get_cpu_node(i, NULL);
 		BUG_ON(!cpu);
 		nid = of_node_to_nid_single(cpu);
 		of_node_put(cpu);
@@ -1041,57 +1012,32 @@ early_param("numa", early_numa);
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * Validate the node associated with the memory section we are
- * trying to add.
- */
-int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size,
-		      unsigned long scn_addr)
-{
-	nodemask_t nodes;
-
-	if (*nid < 0 || !node_online(*nid))
-		*nid = any_online_node(NODE_MASK_ALL);
-
-	if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) {
-		nodes_setall(nodes);
-		while (NODE_DATA(*nid)->node_spanned_pages == 0) {
-			node_clear(*nid, nodes);
-			*nid = any_online_node(nodes);
-		}
-
-		return 1;
-	}
-
-	return 0;
-}
-
-/*
- * Find the node associated with a hot added memory section represented
- * by the ibm,dynamic-reconfiguration-memory node.
+ * Find the node associated with a hot added memory section for
+ * memory represented in the device tree by the property
+ * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
  */
 static int hot_add_drconf_scn_to_nid(struct device_node *memory,
 				     unsigned long scn_addr)
 {
 	const u32 *dm;
-	unsigned int n, rc;
+	unsigned int drconf_cell_cnt, rc;
 	unsigned long lmb_size;
-	int default_nid = any_online_node(NODE_MASK_ALL);
-	int nid;
 	struct assoc_arrays aa;
+	int nid = -1;
 
-	n = of_get_drconf_memory(memory, &dm);
-	if (!n)
-		return default_nid;;
+	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
+	if (!drconf_cell_cnt)
+		return -1;
 
 	lmb_size = of_get_lmb_size(memory);
 	if (!lmb_size)
-		return default_nid;
+		return -1;
 
 	rc = of_get_assoc_arrays(memory, &aa);
 	if (rc)
-		return default_nid;
+		return -1;
 
-	for (; n != 0; --n) {
+	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
 		struct of_drconf_cell drmem;
 
 		read_drconf_cell(&drmem, &dm);
@@ -1102,15 +1048,57 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
 		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
 			continue;
 
+		if ((scn_addr < drmem.base_addr)
+		    || (scn_addr >= (drmem.base_addr + lmb_size)))
+			continue;
+
 		nid = of_drconf_to_nid_single(&drmem, &aa);
+		break;
+	}
+
+	return nid;
+}
 
-		if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size,
-				      scn_addr))
-			return nid;
+/*
+ * Find the node associated with a hot added memory section for memory
+ * represented in the device tree as a node (i.e. memory@XXXX) for
+ * each lmb.
+ */
+int hot_add_node_scn_to_nid(unsigned long scn_addr)
+{
+	struct device_node *memory = NULL;
+	int nid = -1;
+
+	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+		unsigned long start, size;
+		int ranges;
+		const unsigned int *memcell_buf;
+		unsigned int len;
+
+		memcell_buf = of_get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		/* ranges in cell */
+		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+
+		while (ranges--) {
+			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
+			size = read_n_cells(n_mem_size_cells, &memcell_buf);
+
+			if ((scn_addr < start) || (scn_addr >= (start + size)))
+				continue;
+
+			nid = of_node_to_nid_single(memory);
+			break;
+		}
+
+		of_node_put(memory);
+		if (nid >= 0)
+			break;
 	}
 
-	BUG();	/* section address should be found above */
-	return 0;
+	return nid;
 }
 
 /*
@@ -1121,7 +1109,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
 int hot_add_scn_to_nid(unsigned long scn_addr)
 {
 	struct device_node *memory = NULL;
-	int nid;
+	int nid, found = 0;
 
 	if (!numa_enabled || (min_common_depth < 0))
 		return any_online_node(NODE_MASK_ALL);
@@ -1130,35 +1118,25 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
 	if (memory) {
 		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
 		of_node_put(memory);
-		return nid;
+	} else {
+		nid = hot_add_node_scn_to_nid(scn_addr);
 	}
 
-	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
-		unsigned long start, size;
-		int ranges;
-		const unsigned int *memcell_buf;
-		unsigned int len;
-
-		memcell_buf = of_get_property(memory, "reg", &len);
-		if (!memcell_buf || len <= 0)
-			continue;
+	if (nid < 0 || !node_online(nid))
+		nid = any_online_node(NODE_MASK_ALL);
 
-		/* ranges in cell */
-		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
-ha_new_range:
-		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
-		size = read_n_cells(n_mem_size_cells, &memcell_buf);
-		nid = of_node_to_nid_single(memory);
+	if (NODE_DATA(nid)->node_spanned_pages)
+		return nid;
 
-		if (valid_hot_add_scn(&nid, start, size, scn_addr)) {
-			of_node_put(memory);
-			return nid;
+	for_each_online_node(nid) {
+		if (NODE_DATA(nid)->node_spanned_pages) {
+			found = 1;
+			break;
 		}
-
-		if (--ranges)		/* process all ranges in cell */
-			goto ha_new_range;
 	}
-	BUG();	/* section address should be found above */
-	return 0;
+
+	BUG_ON(!found);
+	return nid;
 }
+
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 6d94116fdea..a27ded3adac 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -1,5 +1,6 @@
 /*
  * This file contains common routines for dealing with free of page tables
+ * Along with common page table handling code
  *
  *  Derived from arch/powerpc/mm/tlb_64.c:
  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -115,3 +116,133 @@ void pte_free_finish(void)
 	pte_free_submit(*batchp);
 	*batchp = NULL;
 }
+
+/*
+ * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags()
+ */
+static pte_t do_dcache_icache_coherency(pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+	struct page *page;
+
+	if (unlikely(!pfn_valid(pfn)))
+		return pte;
+	page = pfn_to_page(pfn);
+
+	if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) {
+		pr_debug("do_dcache_icache_coherency... flushing\n");
+		flush_dcache_icache_page(page);
+		set_bit(PG_arch_1, &page->flags);
+	}
+	else
+		pr_debug("do_dcache_icache_coherency... already clean\n");
+	return __pte(pte_val(pte) | _PAGE_HWEXEC);
+}
+
+static inline int is_exec_fault(void)
+{
+	return current->thread.regs && TRAP(current->thread.regs) == 0x400;
+}
+
+/* We only try to do i/d cache coherency on stuff that looks like
+ * reasonably "normal" PTEs. We currently require a PTE to be present
+ * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE
+ */
+static inline int pte_looks_normal(pte_t pte)
+{
+	return (pte_val(pte) &
+		(_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) ==
+		(_PAGE_PRESENT);
+}
+
+#if defined(CONFIG_PPC_STD_MMU)
+/* Server-style MMU handles coherency when hashing if HW exec permission
+ * is supposed per page (currently 64-bit only). Else, we always flush
+ * valid PTEs in set_pte.
+ */
+static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+{
+	return set_pte && pte_looks_normal(pte) &&
+		!(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+		  cpu_has_feature(CPU_FTR_NOEXECUTE));
+}
+#elif _PAGE_HWEXEC == 0
+/* Embedded type MMU without HW exec support (8xx only so far), we flush
+ * the cache for any present PTE
+ */
+static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+{
+	return set_pte && pte_looks_normal(pte);
+}
+#else
+/* Other embedded CPUs with HW exec support per-page, we flush on exec
+ * fault if HWEXEC is not set
+ */
+static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+{
+	return pte_looks_normal(pte) && is_exec_fault() &&
+		!(pte_val(pte) & _PAGE_HWEXEC);
+}
+#endif
+
+/*
+ * set_pte stores a linux PTE into the linux page table.
+ */
+void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(pte_present(*ptep));
+#endif
+	/* Note: mm->context.id might not yet have been assigned as
+	 * this context might not have been activated yet when this
+	 * is called.
+	 */
+	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+	if (pte_need_exec_flush(pte, 1))
+		pte = do_dcache_icache_coherency(pte);
+
+	/* Perform the setting of the PTE */
+	__set_pte_at(mm, addr, ptep, pte, 0);
+}
+
+/*
+ * This is called when relaxing access to a PTE. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+			  pte_t *ptep, pte_t entry, int dirty)
+{
+	int changed;
+	if (!dirty && pte_need_exec_flush(entry, 0))
+		entry = do_dcache_icache_coherency(entry);
+	changed = !pte_same(*(ptep), entry);
+	if (changed) {
+		assert_pte_locked(vma->vm_mm, address);
+		__ptep_set_access_flags(ptep, entry);
+		flush_tlb_page_nohash(vma, address);
+	}
+	return changed;
+}
+
+#ifdef CONFIG_DEBUG_VM
+void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (mm == &init_mm)
+		return;
+	pgd = mm->pgd + pgd_index(addr);
+	BUG_ON(pgd_none(*pgd));
+	pud = pud_offset(pgd, addr);
+	BUG_ON(pud_none(*pud));
+	pmd = pmd_offset(pud, addr);
+	BUG_ON(!pmd_present(*pmd));
+	BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd)));
+}
+#endif /* CONFIG_DEBUG_VM */
+
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 58bcaeba728..0f8c4371dfa 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -129,7 +129,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 void __iomem *
 ioremap(phys_addr_t addr, unsigned long size)
 {
-	return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
+	return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
+				__builtin_return_address(0));
 }
 EXPORT_SYMBOL(ioremap);
 
@@ -143,13 +144,20 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
 	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
 	flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC);
 
-	return __ioremap(addr, size, flags);
+	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(ioremap_flags);
 
 void __iomem *
 __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
+	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+}
+
+void __iomem *
+__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
+		 void *caller)
+{
 	unsigned long v, i;
 	phys_addr_t p;
 	int err;
@@ -212,7 +220,7 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 
 	if (mem_init_done) {
 		struct vm_struct *area;
-		area = get_vm_area(size, VM_IOREMAP);
+		area = get_vm_area_caller(size, VM_IOREMAP, caller);
 		if (area == 0)
 			return NULL;
 		v = (unsigned long) area->addr;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 365e61ae5db..bfa7db6b2fd 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -144,8 +144,8 @@ void __iounmap_at(void *ea, unsigned long size)
 	unmap_kernel_range((unsigned long)ea, size);
 }
 
-void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
-			 unsigned long flags)
+void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
+				unsigned long flags, void *caller)
 {
 	phys_addr_t paligned;
 	void __iomem *ret;
@@ -168,8 +168,9 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
 	if (mem_init_done) {
 		struct vm_struct *area;
 
-		area = __get_vm_area(size, VM_IOREMAP,
-				     ioremap_bot, IOREMAP_END);
+		area = __get_vm_area_caller(size, VM_IOREMAP,
+					    ioremap_bot, IOREMAP_END,
+					    caller);
 		if (area == NULL)
 			return NULL;
 		ret = __ioremap_at(paligned, area->addr, size, flags);
@@ -186,19 +187,27 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
 	return ret;
 }
 
+void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
+			 unsigned long flags)
+{
+	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+}
 
 void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 {
 	unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+	void *caller = __builtin_return_address(0);
 
 	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, flags);
-	return __ioremap(addr, size, flags);
+		return ppc_md.ioremap(addr, size, flags, caller);
+	return __ioremap_caller(addr, size, flags, caller);
 }
 
 void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
 			     unsigned long flags)
 {
+	void *caller = __builtin_return_address(0);
+
 	/* writeable implies dirty for kernel addresses */
 	if (flags & _PAGE_RW)
 		flags |= _PAGE_DIRTY;
@@ -207,8 +216,8 @@ void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
 	flags &= ~(_PAGE_USER | _PAGE_EXEC);
 
 	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, flags);
-	return __ioremap(addr, size, flags);
+		return ppc_md.ioremap(addr, size, flags, caller);
+	return __ioremap_caller(addr, size, flags, caller);
 }
 
 
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index f900a39e6ec..788b87c36f7 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -118,25 +118,50 @@ _GLOBAL(_tlbil_pid)
 
 #elif defined(CONFIG_FSL_BOOKE)
 /*
- * FSL BookE implementations. Currently _pid and _all are the
- * same. This will change when tlbilx is actually supported and
- * performs invalidate-by-PID. This change will be driven by
- * mmu_features conditional
+ * FSL BookE implementations.
+ *
+ * Since feature sections are using _SECTION_ELSE we need
+ * to have the larger code path before the _SECTION_ELSE
  */
 
+#define MMUCSR0_TLBFI	(MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
+			 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
 /*
  * Flush MMU TLB on the local processor
  */
-_GLOBAL(_tlbil_pid)
 _GLOBAL(_tlbil_all)
-#define MMUCSR0_TLBFI	(MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
-			 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
+BEGIN_MMU_FTR_SECTION
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+MMU_FTR_SECTION_ELSE
+	PPC_TLBILX_ALL(0,0)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	slwi	r3,r3,16
+	mfmsr	r10
+	wrteei	0
+	mfspr	r4,SPRN_MAS6	/* save MAS6 */
+	mtspr	SPRN_MAS6,r3
+	PPC_TLBILX_PID(0,0)
+	mtspr	SPRN_MAS6,r4	/* restore MAS6 */
+	wrtee	r10
+MMU_FTR_SECTION_ELSE
 	li	r3,(MMUCSR0_TLBFI)@l
 	mtspr	SPRN_MMUCSR0, r3
 1:
 	mfspr	r3,SPRN_MMUCSR0
 	andi.	r3,r3,MMUCSR0_TLBFI@l
 	bne	1b
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
 	msync
 	isync
 	blr
@@ -149,7 +174,9 @@ _GLOBAL(_tlbil_va)
 	mfmsr	r10
 	wrteei	0
 	slwi	r4,r4,16
+	ori	r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l
 	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+BEGIN_MMU_FTR_SECTION
 	tlbsx	0,r3
 	mfspr	r4,SPRN_MAS1		/* check valid */
 	andis.	r3,r4,MAS1_VALID@h
@@ -157,6 +184,9 @@ _GLOBAL(_tlbil_va)
 	rlwinm	r4,r4,0,1,31
 	mtspr	SPRN_MAS1,r4
 	tlbwe
+MMU_FTR_SECTION_ELSE
+	PPC_TLBILX_VA(0,r3)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
 	msync
 	isync
 1:	wrtee	r10