Merge ../linux-2.6

author: Mauro Carvalho Chehab <mchehab@infradead.org> 2008-07-27 12:25:57 -0300
committer: Mauro Carvalho Chehab <mchehab@infradead.org> 2008-07-27 12:25:57 -0300
commit: 50cb993ea6cd187bfed085cb3e0747066edeb02f (patch)
tree: 61edac62c6c5bc07c59e4369c50c6821ad77f2c0 /arch/powerpc
parent: 445c2714cf72817ab1ad3ca894c6d9b2047b3a3e (diff)
parent: 8be1a6d6c77ab4532e4476fdb8177030ef48b52c (diff)
48 files changed, 2498 insertions, 393 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 737ebf9d12b..fe88418167c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -110,7 +110,10 @@ config PPC
 	default y
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
+	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_IDE
+	select HAVE_IOREMAP_PROT
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_KPROBES
 	select HAVE_ARCH_KGDB
 	select HAVE_KRETPROBES
@@ -843,6 +846,7 @@ source "crypto/Kconfig"
 config PPC_CLOCK
 	bool
 	default n
+	select HAVE_CLK
 
 config PPC_LIB_RHEAP
 	bool
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index b936a1dd0a5..25c273c761d 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -23,6 +23,9 @@
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
 
+/* The platform string corresponding to the real PVR */
+const char *powerpc_base_platform;
+
 /* NOTE:
  * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
  * the responsibility of the appropriate CPU save/restore functions to
@@ -1652,6 +1655,14 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 			} else
 				*t = *s;
 			*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+
+			/*
+			 * Set the base platform string once; assumes
+			 * we're called with real pvr first.
+			 */
+			if (*PTRRELOC(&powerpc_base_platform) == NULL)
+				*PTRRELOC(&powerpc_base_platform) = t->platform;
+
 #if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
 			/* ppc64 and booke expect identify_cpu to also call
 			 * setup_cpu for that processor. I will consolidate
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index da52269aec1..81c8324a4a3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -148,7 +148,7 @@ transfer_to_handler:
 	/* Check to see if the dbcr0 register is set up to debug.  Use the
 	   internal debug mode bit to do this. */
 	lwz	r12,THREAD_DBCR0(r12)
-	andis.	r12,r12,DBCR0_IDM@h
+	andis.	r12,r12,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	beq+	3f
 	/* From user and task is ptraced - load up global dbcr0 */
 	li	r12,-1			/* clear all pending debug events */
@@ -292,7 +292,7 @@ syscall_exit_cont:
 	/* If the process has its own DBCR0 value, load it up.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
 	lwz	r0,THREAD+THREAD_DBCR0(r2)
-	andis.	r10,r0,DBCR0_IDM@h
+	andis.	r10,r0,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	bnel-	load_dbcr0
 #endif
 #ifdef CONFIG_44x
@@ -720,7 +720,7 @@ restore_user:
 	/* Check whether this process has its own DBCR0 value.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
 	lwz	r0,THREAD+THREAD_DBCR0(r2)
-	andis.	r10,r0,DBCR0_IDM@h
+	andis.	r10,r0,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	bnel-	load_dbcr0
 #endif
 
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index c3cf0e8f3ac..d308a9f70f1 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -60,7 +60,7 @@ void cpu_idle(void)
 
 	set_thread_flag(TIF_POLLING_NRFLAG);
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched() && !cpu_should_die()) {
 			ppc64_runlatch_off();
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2385f68c175..550a19399bf 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -49,6 +49,8 @@ static int novmerge = 1;
 
 static int protect4gb = 1;
 
+static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
+
 static inline unsigned long iommu_num_pages(unsigned long vaddr,
 					    unsigned long slen)
 {
@@ -191,6 +193,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 {
 	unsigned long entry, flags;
 	dma_addr_t ret = DMA_ERROR_CODE;
+	int build_fail;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
@@ -205,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
-			 direction, attrs);
+	build_fail = ppc_md.tce_build(tbl, entry, npages,
+	                              (unsigned long)page & IOMMU_PAGE_MASK,
+	                              direction, attrs);
+
+	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	 * Clean up the table bitmap in this case and return
+	 * DMA_ERROR_CODE. For all other errors the functionality is
+	 * not altered.
+	 */
+	if (unlikely(build_fail)) {
+		__iommu_free(tbl, ret, npages);
 
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return DMA_ERROR_CODE;
+	}
 
 	/* Flush/invalidate TLB caches if necessary */
 	if (ppc_md.tce_flush)
@@ -276,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	dma_addr_t dma_next = 0, dma_addr;
 	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
-	int outcount, incount, i;
+	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
 	unsigned long handle;
 	unsigned int max_seg_size;
@@ -337,8 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK,
-				 direction, attrs);
+		build_fail = ppc_md.tce_build(tbl, entry, npages,
+		                              vaddr & IOMMU_PAGE_MASK,
+		                              direction, attrs);
+		if(unlikely(build_fail))
+			goto failure;
 
 		/* If we are in an open segment, try merging */
 		if (segstart != s) {
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 4ba2af12545..de79915452c 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -144,7 +144,6 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
@@ -312,8 +311,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -352,7 +350,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	regs->nip = orig_ret_address;
 
 	reset_current_kprobe();
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 827a5726a03..9f856a0c3e3 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -34,8 +34,9 @@
 #include <asm/time.h>
 #include <asm/prom.h>
 #include <asm/vdso_datapage.h>
+#include <asm/vio.h>
 
-#define MODULE_VERS "1.7"
+#define MODULE_VERS "1.8"
 #define MODULE_NAME "lparcfg"
 
 /* #define LPARCFG_DEBUG */
@@ -129,32 +130,46 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
 /*
  * Methods used to fetch LPAR data when running on a pSeries platform.
  */
-static void log_plpar_hcall_return(unsigned long rc, char *tag)
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
 {
-	switch(rc) {
-	case 0:
-		return;
-	case H_HARDWARE:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Hardware fault\n", tag);
-		return;
-	case H_FUNCTION:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Function not allowed\n", tag);
-		return;
-	case H_AUTHORITY:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Not authorized to this function\n", tag);
-		return;
-	case H_PARAMETER:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Bad parameter(s)\n",tag);
-		return;
-	default:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Unexpected rc(0x%lx)\n", tag, rc);
-	}
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+	mpp_data->entitled_mem = retbuf[0];
+	mpp_data->mapped_mem = retbuf[1];
+
+	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	mpp_data->pool_num = retbuf[2] & 0xffff;
+
+	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+	mpp_data->pool_size = retbuf[4];
+	mpp_data->loan_request = retbuf[5];
+	mpp_data->backing_mem = retbuf[6];
+
+	return rc;
 }
+EXPORT_SYMBOL(h_get_mpp);
+
+struct hvcall_ppp_data {
+	u64	entitlement;
+	u64	unallocated_entitlement;
+	u16	group_num;
+	u16	pool_num;
+	u8	capped;
+	u8	weight;
+	u8	unallocated_weight;
+	u16	active_procs_in_pool;
+	u16	active_system_procs;
+};
 
 /*
  * H_GET_PPP hcall returns info in 4 parms.
@@ -176,27 +191,30 @@ static void log_plpar_hcall_return(unsigned long rc, char *tag)
  *              XXXX - Active processors in Physical Processor Pool.
  *                  XXXX  - Processors active on platform.
  */
-static unsigned int h_get_ppp(unsigned long *entitled,
-			      unsigned long *unallocated,
-			      unsigned long *aggregation,
-			      unsigned long *resource)
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 {
 	unsigned long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 
 	rc = plpar_hcall(H_GET_PPP, retbuf);
 
-	*entitled = retbuf[0];
-	*unallocated = retbuf[1];
-	*aggregation = retbuf[2];
-	*resource = retbuf[3];
+	ppp_data->entitlement = retbuf[0];
+	ppp_data->unallocated_entitlement = retbuf[1];
+
+	ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	ppp_data->pool_num = retbuf[2] & 0xffff;
 
-	log_plpar_hcall_return(rc, "H_GET_PPP");
+	ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+	ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+	ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+	ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+	ppp_data->active_system_procs = retbuf[3] & 0xffff;
 
 	return rc;
 }
 
-static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
+static unsigned h_pic(unsigned long *pool_idle_time,
+		      unsigned long *num_procs)
 {
 	unsigned long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
@@ -206,8 +224,87 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
 	*pool_idle_time = retbuf[0];
 	*num_procs = retbuf[1];
 
-	if (rc != H_AUTHORITY)
-		log_plpar_hcall_return(rc, "H_PIC");
+	return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+	struct hvcall_ppp_data ppp_data;
+	int rc;
+
+	rc = h_get_ppp(&ppp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "partition_entitled_capacity=%ld\n",
+	           ppp_data.entitlement);
+	seq_printf(m, "group=%d\n", ppp_data.group_num);
+	seq_printf(m, "system_active_processors=%d\n",
+	           ppp_data.active_system_procs);
+
+	/* pool related entries are apropriate for shared configs */
+	if (lppaca[0].shared_proc) {
+		unsigned long pool_idle_time, pool_procs;
+
+		seq_printf(m, "pool=%d\n", ppp_data.pool_num);
+
+		/* report pool_capacity in percentage */
+		seq_printf(m, "pool_capacity=%d\n",
+			   ppp_data.active_procs_in_pool * 100);
+
+		h_pic(&pool_idle_time, &pool_procs);
+		seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+		seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+	}
+
+	seq_printf(m, "unallocated_capacity_weight=%d\n",
+		   ppp_data.unallocated_weight);
+	seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+	seq_printf(m, "capped=%d\n", ppp_data.capped);
+	seq_printf(m, "unallocated_capacity=%ld\n",
+		   ppp_data.unallocated_entitlement);
+}
+
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+	struct hvcall_mpp_data mpp_data;
+	int rc;
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+	if (mpp_data.mapped_mem != -1)
+		seq_printf(m, "mapped_entitled_memory=%ld\n",
+		           mpp_data.mapped_mem);
+
+	seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+	seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+	seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+	seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+	           mpp_data.unallocated_mem_weight);
+	seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+	           mpp_data.unallocated_entitlement);
+
+	if (mpp_data.pool_size != -1)
+		seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+		           mpp_data.pool_size);
+
+	seq_printf(m, "entitled_memory_loan_request=%ld\n",
+	           mpp_data.loan_request);
+
+	seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
 }
 
 #define SPLPAR_CHARACTERISTICS_TOKEN 20
@@ -313,6 +410,25 @@ static int lparcfg_count_active_processors(void)
 	return count;
 }
 
+static void pseries_cmo_data(struct seq_file *m)
+{
+	int cpu;
+	unsigned long cmo_faults = 0;
+	unsigned long cmo_fault_time = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cmo_faults += lppaca[cpu].cmo_faults;
+		cmo_fault_time += lppaca[cpu].cmo_fault_time;
+	}
+
+	seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+	seq_printf(m, "cmo_fault_time_usec=%lu\n",
+		   cmo_fault_time / tb_ticks_per_usec);
+}
+
 static int pseries_lparcfg_data(struct seq_file *m, void *v)
 {
 	int partition_potential_processors;
@@ -334,60 +450,13 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	partition_active_processors = lparcfg_count_active_processors();
 
 	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-		unsigned long h_entitled, h_unallocated;
-		unsigned long h_aggregation, h_resource;
-		unsigned long pool_idle_time, pool_procs;
-		unsigned long purr;
-
-		h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
-			  &h_resource);
-
-		seq_printf(m, "R4=0x%lx\n", h_entitled);
-		seq_printf(m, "R5=0x%lx\n", h_unallocated);
-		seq_printf(m, "R6=0x%lx\n", h_aggregation);
-		seq_printf(m, "R7=0x%lx\n", h_resource);
-
-		purr = get_purr();
-
 		/* this call handles the ibm,get-system-parameter contents */
 		parse_system_parameter_string(m);
+		parse_ppp_data(m);
+		parse_mpp_data(m);
+		pseries_cmo_data(m);
 
-		seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
-
-		seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
-
-		seq_printf(m, "system_active_processors=%ld\n",
-			   (h_resource >> 0 * 8) & 0xffff);
-
-		/* pool related entries are apropriate for shared configs */
-		if (lppaca[0].shared_proc) {
-
-			h_pic(&pool_idle_time, &pool_procs);
-
-			seq_printf(m, "pool=%ld\n",
-				   (h_aggregation >> 0 * 8) & 0xffff);
-
-			/* report pool_capacity in percentage */
-			seq_printf(m, "pool_capacity=%ld\n",
-				   ((h_resource >> 2 * 8) & 0xffff) * 100);
-
-			seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
-
-			seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
-		}
-
-		seq_printf(m, "unallocated_capacity_weight=%ld\n",
-			   (h_resource >> 4 * 8) & 0xFF);
-
-		seq_printf(m, "capacity_weight=%ld\n",
-			   (h_resource >> 5 * 8) & 0xFF);
-
-		seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
-
-		seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
-
-		seq_printf(m, "purr=%ld\n", purr);
-
+		seq_printf(m, "purr=%ld\n", get_purr());
 	} else {		/* non SPLPAR case */
 
 		seq_printf(m, "system_active_processors=%d\n",
@@ -414,6 +483,83 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	return 0;
 }
 
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_ppp_data ppp_data;
+	u8 new_weight;
+	u64 new_entitled;
+	ssize_t retval;
+
+	/* Get our current parameters */
+	retval = h_get_ppp(&ppp_data);
+	if (retval)
+		return retval;
+
+	if (entitlement) {
+		new_weight = ppp_data.weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = ppp_data.entitlement;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+	         __FUNCTION__, ppp_data.entitlement, ppp_data.weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+		 __FUNCTION__, new_entitled, new_weight);
+
+	retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+	return retval;
+}
+
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition.  Caller must
+ * specify either a new entitlement or weight, not both, to be updated
+ * since the h_set_mpp call takes both entitlement and weight as parameters.
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_mpp_data mpp_data;
+	u64 new_entitled;
+	u8 new_weight;
+	ssize_t rc;
+
+	if (entitlement) {
+		/* Check with vio to ensure the new memory entitlement
+		 * can be handled.
+		 */
+		rc = vio_cmo_entitlement_update(*entitlement);
+		if (rc)
+			return rc;
+	}
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return rc;
+
+	if (entitlement) {
+		new_weight = mpp_data.mem_weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = mpp_data.entitled_mem;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+	         __FUNCTION__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+	         __FUNCTION__, new_entitled, new_weight);
+
+	rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+	return rc;
+}
+
 /*
  * Interface for changing system parameters (variable capacity weight
  * and entitled capacity).  Format of input is "param_name=value";
@@ -427,35 +573,27 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 			     size_t count, loff_t * off)
 {
-	char *kbuf;
+	int kbuf_sz = 64;
+	char kbuf[kbuf_sz];
 	char *tmp;
 	u64 new_entitled, *new_entitled_ptr = &new_entitled;
 	u8 new_weight, *new_weight_ptr = &new_weight;
-
-	unsigned long current_entitled;	/* parameters for h_get_ppp */
-	unsigned long dummy;
-	unsigned long resource;
-	u8 current_weight;
-
-	ssize_t retval = -ENOMEM;
+	ssize_t retval;
 
 	if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
 			firmware_has_feature(FW_FEATURE_ISERIES))
 		return -EINVAL;
 
-	kbuf = kmalloc(count, GFP_KERNEL);
-	if (!kbuf)
-		goto out;
+	if (count > kbuf_sz)
+		return -EINVAL;
 
-	retval = -EFAULT;
 	if (copy_from_user(kbuf, buf, count))
-		goto out;
+		return -EFAULT;
 
-	retval = -EINVAL;
 	kbuf[count - 1] = '\0';
 	tmp = strchr(kbuf, '=');
 	if (!tmp)
-		goto out;
+		return -EINVAL;
 
 	*tmp++ = '\0';
 
@@ -463,34 +601,32 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		char *endp;
 		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
-		new_weight_ptr = &current_weight;
+			return -EINVAL;
+
+		retval = update_ppp(new_entitled_ptr, NULL);
 	} else if (!strcmp(kbuf, "capacity_weight")) {
 		char *endp;
 		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
-		new_entitled_ptr = &current_entitled;
-	} else
-		goto out;
-
-	/* Get our current parameters */
-	retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
-	if (retval) {
-		retval = -EIO;
-		goto out;
-	}
-
-	current_weight = (resource >> 5 * 8) & 0xFF;
+			return -EINVAL;
 
-	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
-		 __func__, current_entitled, current_weight);
+		retval = update_ppp(NULL, new_weight_ptr);
+	} else if (!strcmp(kbuf, "entitled_memory")) {
+		char *endp;
+		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
 
-	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
-		 __func__, *new_entitled_ptr, *new_weight_ptr);
+		retval = update_mpp(new_entitled_ptr, NULL);
+	} else if (!strcmp(kbuf, "entitled_memory_weight")) {
+		char *endp;
+		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
 
-	retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr,
-				    *new_weight_ptr);
+		retval = update_mpp(NULL, new_weight_ptr);
+	} else
+		return -EINVAL;
 
 	if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
 		retval = count;
@@ -506,8 +642,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		retval = -EIO;
 	}
 
-out:
-	kfree(kbuf);
 	return retval;
 }
 
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 29a0e039d43..aab76887a84 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -48,7 +48,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	if (ppc_md.machine_kexec)
 		ppc_md.machine_kexec(image);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 219f3634115..db2497ccc11 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -47,6 +47,8 @@
 #ifdef CONFIG_PPC64
 #include <asm/firmware.h>
 #endif
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
 
 extern unsigned long _get_SP(void);
 
@@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void)
 }
 #endif /* CONFIG_SMP */
 
+void do_dabr(struct pt_regs *regs, unsigned long address,
+		    unsigned long error_code)
+{
+	siginfo_t info;
+
+	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+			11, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (debugger_dabr_match(regs))
+		return;
+
+	/* Clear the DAC and struct entries.  One shot trigger */
+#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE))
+	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W
+							| DBCR0_IDM));
+#endif
+
+	/* Clear the DABR */
+	set_dabr(0);
+
+	/* Deliver the signal to userspace */
+	info.si_signo = SIGTRAP;
+	info.si_errno = 0;
+	info.si_code = TRAP_HWBKPT;
+	info.si_addr = (void __user *)address;
+	force_sig_info(SIGTRAP, &info, current);
+}
+
 static DEFINE_PER_CPU(unsigned long, current_dabr);
 
 int set_dabr(unsigned long dabr)
@@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr)
 #if defined(CONFIG_PPC64) || defined(CONFIG_6xx)
 	mtspr(SPRN_DABR, dabr);
 #endif
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	mtspr(SPRN_DAC1, dabr);
+#endif
+
 	return 0;
 }
 
@@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
 		set_dabr(new->thread.dabr);
 
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	/* If new thread DAC (HW breakpoint) is the same then leave it */
+	if (new->thread.dabr)
+		set_dabr(new->thread.dabr);
+#endif
+
 	new_thread = &new->thread;
 	old_thread = &current->thread;
 
@@ -525,6 +567,10 @@ void flush_thread(void)
 	if (current->thread.dabr) {
 		current->thread.dabr = 0;
 		set_dabr(0);
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+		current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W);
+#endif
 	}
 }
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1ea8c8d3ce8..c4ab2195b9c 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -643,6 +643,11 @@ static void __init early_cmdline_parse(void)
 #else
 #define OV5_MSI			0x00
 #endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO			0x80	/* Cooperative Memory Overcommitment */
+#else
+#define OV5_CMO			0x00
+#endif
 
 /*
  * The architecture vector has an array of PVR mask/value pairs,
@@ -687,10 +692,12 @@ static unsigned char ibm_architecture_vec[] = {
 	0,				/* don't halt */
 
 	/* option vector 5: PAPR/OF options */
-	3 - 2,				/* length */
+	5 - 2,				/* length */
 	0,				/* don't ignore, don't halt */
 	OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
 	OV5_DONATE_DEDICATE_CPU | OV5_MSI,
+	0,
+	OV5_CMO,
 };
 
 /* Old method - ELF header with PT_NOTE sections */
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8feb93e7890..a5d0e78779c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task)
 
 	if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-		task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC;
+		task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
 		regs->msr |= MSR_DE;
 #else
 		regs->msr |= MSR_SE;
@@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task)
 {
 	struct pt_regs *regs = task->thread.regs;
 
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	/* If DAC then do not single step, skip */
+	if (task->thread.dabr)
+		return;
+#endif
+
 	if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-		task->thread.dbcr0 = 0;
+		task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM);
 		regs->msr &= ~MSR_DE;
 #else
 		regs->msr &= ~MSR_SE;
@@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task)
 	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
 }
 
-static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
+int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 			       unsigned long data)
 {
-	/* We only support one DABR and no IABRS at the moment */
+	/* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
+	 *  For embedded processors we support one DAC and no IAC's at the
+	 *  moment.
+	 */
 	if (addr > 0)
 		return -EINVAL;
 
-	/* The bottom 3 bits are flags */
 	if ((data & ~0x7UL) >= TASK_SIZE)
 		return -EIO;
 
-	/* Ensure translation is on */
+#ifdef CONFIG_PPC64
+
+	/* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
+	 *  It was assumed, on previous implementations, that 3 bits were
+	 *  passed together with the data address, fitting the design of the
+	 *  DABR register, as follows:
+	 *
+	 *  bit 0: Read flag
+	 *  bit 1: Write flag
+	 *  bit 2: Breakpoint translation
+	 *
+	 *  Thus, we use them here as so.
+	 */
+
+	/* Ensure breakpoint translation bit is set */
 	if (data && !(data & DABR_TRANSLATION))
 		return -EIO;
 
+	/* Move contents to the DABR register */
 	task->thread.dabr = data;
+
+#endif
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+
+	/* As described above, it was assumed 3 bits were passed with the data
+	 *  address, but we will assume only the mode bits will be passed
+	 *  as to not cause alignment restrictions for DAC-based processors.
+	 */
+
+	/* DAC's hold the whole address without any mode flags */
+	task->thread.dabr = data & ~0x3UL;
+
+	if (task->thread.dabr == 0) {
+		task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM);
+		task->thread.regs->msr &= ~MSR_DE;
+		return 0;
+	}
+
+	/* Read or Write bits must be set */
+
+	if (!(data & 0x3UL))
+		return -EINVAL;
+
+	/* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
+	   register */
+	task->thread.dbcr0 = DBCR0_IDM;
+
+	/* Check for write and read flags and set DBCR0
+	   accordingly */
+	if (data & 0x1UL)
+		task->thread.dbcr0 |= DBSR_DAC1R;
+	if (data & 0x2UL)
+		task->thread.dbcr0 |= DBSR_DAC1W;
+
+	task->thread.regs->msr |= MSR_DE;
+#endif
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 09ded5c424a..149cb112cd1 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf,
 }
 
 /* constructor for flash_block_cache */
-void rtas_block_ctor(struct kmem_cache *cache, void *ptr)
+void rtas_block_ctor(void *ptr)
 {
 	memset(ptr, 0, RTAS_BLK_SIZE);
 }
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index ad55488939c..7aada783ec6 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
 	 * user space. The DABR will have been cleared if it
 	 * triggered inside the kernel.
 	 */
-	if (current->thread.dabr)
+	if (current->thread.dabr) {
 		set_dabr(current->thread.dabr);
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+		mtspr(SPRN_DBCR0, current->thread.dbcr0);
+#endif
+	}
 
 	if (is32) {
         	if (ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index 8cee5710754..6fc6328dc62 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -7,6 +7,7 @@
  * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
  */
 
+#include <linux/mm.h>
 #include <asm/page.h>
 
 /* References to section boundaries */
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index aba0ba95f06..800e5e9a087 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -529,7 +529,8 @@ static void register_nodes(void)
 #endif
 
 /* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev, char *buf)
+static ssize_t show_physical_id(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 878fbddb6ae..81ccb8dd1a5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
 		}
 
 		_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+	} else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
+		regs->msr &= ~MSR_DE;
+
+		if (user_mode(regs)) {
+			current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W |
+								DBCR0_IDM);
+		} else {
+			/* Disable DAC interupts */
+			mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R |
+						DBSR_DAC1W | DBCR0_IDM));
+
+			/* Clear the DAC event */
+			mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W));
+		}
+		/* Setup and send the trap to the handler */
+		do_dabr(regs, mfspr(SPRN_DAC1), debug_status);
 	}
 }
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b77f8af7ddd..ade8aeaa2e7 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1,11 +1,12 @@
 /*
  * IBM PowerPC Virtual I/O Infrastructure Support.
  *
- *    Copyright (c) 2003-2005 IBM Corp.
+ *    Copyright (c) 2003,2008 IBM Corp.
  *     Dave Engebretsen engebret@us.ibm.com
  *     Santiago Leon santil@us.ibm.com
  *     Hollis Blanchard <hollisb@us.ibm.com>
  *     Stephen Rothwell
+ *     Robert Jennings <rcjenn@us.ibm.com>
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device  = { /* fake "parent" device */
 	.dev.bus = &vio_bus_type,
 };
 
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+	size_t size;
+	size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @vio_dev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+	struct vio_dev *viodev;
+	struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+struct vio_cmo {
+	spinlock_t lock;
+	struct delayed_work balance_q;
+	struct list_head device_list;
+	size_t entitled;
+	struct vio_cmo_pool reserve;
+	struct vio_cmo_pool excess;
+	size_t spare;
+	size_t min;
+	size_t desired;
+	size_t curr;
+	size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+	struct device_node *node_vroot;
+	int count = 0;
+
+	/*
+	 * Count the number of vdevice entries with an
+	 * ibm,my-dma-window OF property
+	 */
+	node_vroot = of_find_node_by_name(NULL, "vdevice");
+	if (node_vroot) {
+		struct device_node *of_node;
+		struct property *prop;
+
+		for_each_child_of_node(node_vroot, of_node) {
+			prop = of_find_property(of_node, "ibm,my-dma-window",
+			                       NULL);
+			if (prop)
+				count++;
+		}
+	}
+	of_node_put(node_vroot);
+	return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enable devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices.  The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ *  0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+	unsigned long flags;
+	size_t reserve_free = 0;
+	size_t excess_free = 0;
+	int ret = -ENOMEM;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Determine the amount of free entitlement available in reserve */
+	if (viodev->cmo.entitled > viodev->cmo.allocated)
+		reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+	/* If spare is not fulfilled, the excess pool can not be used. */
+	if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+		excess_free = vio_cmo.excess.free;
+
+	/* The request can be satisfied */
+	if ((reserve_free + excess_free) >= size) {
+		vio_cmo.curr += size;
+		if (vio_cmo.curr > vio_cmo.high)
+			vio_cmo.high = vio_cmo.curr;
+		viodev->cmo.allocated += size;
+		size -= min(reserve_free, size);
+		vio_cmo.excess.free -= size;
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+	unsigned long flags;
+	size_t spare_needed = 0;
+	size_t excess_freed = 0;
+	size_t reserve_freed = size;
+	size_t tmp;
+	int balance = 0;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	vio_cmo.curr -= size;
+
+	/* Amount of memory freed from the excess pool */
+	if (viodev->cmo.allocated > viodev->cmo.entitled) {
+		excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+		                                   viodev->cmo.entitled));
+		reserve_freed -= excess_freed;
+	}
+
+	/* Remove allocation from device */
+	viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+	/* Spare is a subset of the reserve pool, replenish it first. */
+	spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+	/*
+	 * Replenish the spare in the reserve pool from the excess pool.
+	 * This moves entitlement into the reserve pool.
+	 */
+	if (spare_needed && excess_freed) {
+		tmp = min(excess_freed, spare_needed);
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+		vio_cmo.spare += tmp;
+		excess_freed -= tmp;
+		spare_needed -= tmp;
+		balance = 1;
+	}
+
+	/*
+	 * Replenish the spare in the reserve pool from the reserve pool.
+	 * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+	 * if needed, and gives it to the spare pool. The amount of used
+	 * memory in this pool does not change.
+	 */
+	if (spare_needed && reserve_freed) {
+		tmp = min(spare_needed, min(reserve_freed,
+		                            (viodev->cmo.entitled -
+		                             VIO_CMO_MIN_ENT)));
+
+		vio_cmo.spare += tmp;
+		viodev->cmo.entitled -= tmp;
+		reserve_freed -= tmp;
+		spare_needed -= tmp;
+		balance = 1;
+	}
+
+	/*
+	 * Increase the reserve pool until the desired allocation is met.
+	 * Move an allocation freed from the excess pool into the reserve
+	 * pool and schedule a balance operation.
+	 */
+	if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+		tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+		excess_freed -= tmp;
+		balance = 1;
+	}
+
+	/* Return memory from the excess pool to that pool */
+	if (excess_freed)
+		vio_cmo.excess.free += excess_freed;
+
+	if (balance)
+		schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool.  Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement
+ *
+ * Returns: 0 on success, -ENOMEM when change can not be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+	struct vio_dev *viodev;
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t avail, delta, tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Entitlement increases */
+	if (new_entitlement > vio_cmo.entitled) {
+		delta = new_entitlement - vio_cmo.entitled;
+
+		/* Fulfill spare allocation */
+		if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+			tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			vio_cmo.reserve.size += tmp;
+			delta -= tmp;
+		}
+
+		/* Remaining new allocation goes to the excess pool */
+		vio_cmo.entitled += delta;
+		vio_cmo.excess.size += delta;
+		vio_cmo.excess.free += delta;
+
+		goto out;
+	}
+
+	/* Entitlement decreases */
+	delta = vio_cmo.entitled - new_entitlement;
+	avail = vio_cmo.excess.free;
+
+	/*
+	 * Need to check how much unused entitlement each device can
+	 * sacrifice to fulfill entitlement change.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		if (avail >= delta)
+			break;
+
+		viodev = dev_ent->viodev;
+		if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+		    (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+				avail += viodev->cmo.entitled -
+				         max_t(size_t, viodev->cmo.allocated,
+				               VIO_CMO_MIN_ENT);
+	}
+
+	if (delta <= avail) {
+		vio_cmo.entitled -= delta;
+
+		/* Take entitlement from the excess pool first */
+		tmp = min(vio_cmo.excess.free, delta);
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.excess.free -= tmp;
+		delta -= tmp;
+
+		/*
+		 * Remove all but VIO_CMO_MIN_ENT bytes from devices
+		 * until entitlement change is served
+		 */
+		list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+			if (!delta)
+				break;
+
+			viodev = dev_ent->viodev;
+			tmp = 0;
+			if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+			    (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+				tmp = viodev->cmo.entitled -
+				      max_t(size_t, viodev->cmo.allocated,
+				            VIO_CMO_MIN_ENT);
+			viodev->cmo.entitled -= min(tmp, delta);
+			delta -= min(tmp, delta);
+		}
+	} else {
+		spin_unlock_irqrestore(&vio_cmo.lock, flags);
+		return -ENOMEM;
+	}
+
+out:
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver.  The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+	struct vio_cmo *cmo;
+	struct vio_dev *viodev;
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t avail = 0, level, chunk, need;
+	int devcount = 0, fulfilled;
+
+	cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Calculate minimum entitlement and fulfill spare */
+	cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+	BUG_ON(cmo->min > cmo->entitled);
+	cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+	cmo->min += cmo->spare;
+	cmo->desired = cmo->min;
+
+	/*
+	 * Determine how much entitlement is available and reset device
+	 * entitlements
+	 */
+	avail = cmo->entitled - cmo->spare;
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		viodev = dev_ent->viodev;
+		devcount++;
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+		avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+	}
+
+	/*
+	 * Having provided each device with the minimum entitlement, loop
+	 * over the devices portioning out the remaining entitlement
+	 * until there is nothing left.
+	 */
+	level = VIO_CMO_MIN_ENT;
+	while (avail) {
+		fulfilled = 0;
+		list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+			viodev = dev_ent->viodev;
+
+			if (viodev->cmo.desired <= level) {
+				fulfilled++;
+				continue;
+			}
+
+			/*
+			 * Give the device up to VIO_CMO_BALANCE_CHUNK
+			 * bytes of entitlement, but do not exceed the
+			 * desired level of entitlement for the device.
+			 */
+			chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+			chunk = min(chunk, (viodev->cmo.desired -
+			                    viodev->cmo.entitled));
+			viodev->cmo.entitled += chunk;
+
+			/*
+			 * If the memory for this entitlement increase was
+			 * already allocated to the device it does not come
+			 * from the available pool being portioned out.
+			 */
+			need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
+			       max(viodev->cmo.allocated, level);
+			avail -= need;
+
+		}
+		if (fulfilled == devcount)
+			break;
+		level += VIO_CMO_BALANCE_CHUNK;
+	}
+
+	/* Calculate new reserve and excess pool sizes */
+	cmo->reserve.size = cmo->min;
+	cmo->excess.free = 0;
+	cmo->excess.size = 0;
+	need = 0;
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		viodev = dev_ent->viodev;
+		/* Calculated reserve size above the minimum entitlement */
+		if (viodev->cmo.entitled)
+			cmo->reserve.size += (viodev->cmo.entitled -
+			                      VIO_CMO_MIN_ENT);
+		/* Calculated used excess entitlement */
+		if (viodev->cmo.allocated > viodev->cmo.entitled)
+			need += viodev->cmo.allocated - viodev->cmo.entitled;
+	}
+	cmo->excess.size = cmo->entitled - cmo->reserve.size;
+	cmo->excess.free = cmo->excess.size - need;
+
+	cancel_delayed_work(container_of(work, struct delayed_work, work));
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+                                          dma_addr_t *dma_handle, gfp_t flag)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	void *ret;
+
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return NULL;
+	}
+
+	ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
+	if (unlikely(ret == NULL)) {
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+                                        void *vaddr, dma_addr_t dma_handle)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+
+	dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
+
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr,
+                                           size_t size,
+                                           enum dma_data_direction direction,
+                                           struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	dma_addr_t ret = DMA_ERROR_CODE;
+
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return ret;
+	}
+
+	ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs);
+	if (unlikely(dma_mapping_error(ret))) {
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	return ret;
+}
+
+static void vio_dma_iommu_unmap_single(struct device *dev,
+		dma_addr_t dma_handle, size_t size,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+
+	dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs);
+
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+                                int nelems, enum dma_data_direction direction,
+                                struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct scatterlist *sgl;
+	int ret, count = 0;
+	size_t alloc_size = 0;
+
+	for (sgl = sglist; count < nelems; count++, sgl++)
+		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+
+	if (vio_cmo_alloc(viodev, alloc_size)) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return 0;
+	}
+
+	ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+	if (unlikely(!ret)) {
+		vio_cmo_dealloc(viodev, alloc_size);
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+	if (alloc_size)
+		vio_cmo_dealloc(viodev, alloc_size);
+
+	return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct scatterlist *sgl;
+	size_t alloc_size = 0;
+	int count = 0;
+
+	for (sgl = sglist; count < nelems; count++, sgl++)
+		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+
+	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+	vio_cmo_dealloc(viodev, alloc_size);
+}
+
+struct dma_mapping_ops vio_dma_mapping_ops = {
+	.alloc_coherent = vio_dma_iommu_alloc_coherent,
+	.free_coherent  = vio_dma_iommu_free_coherent,
+	.map_single     = vio_dma_iommu_map_single,
+	.unmap_single   = vio_dma_iommu_unmap_single,
+	.map_sg         = vio_dma_iommu_map_sg,
+	.unmap_sg       = vio_dma_iommu_unmap_sg,
+};
+
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @new_desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs.  The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+	unsigned long flags;
+	struct vio_cmo_dev_entry *dev_ent;
+	int found = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (desired < VIO_CMO_MIN_ENT)
+		desired = VIO_CMO_MIN_ENT;
+
+	/*
+	 * Changes will not be made for devices not in the device list.
+	 * If it is not in the device list, then no driver is loaded
+	 * for the device and it can not receive entitlement.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			found = 1;
+			break;
+		}
+	if (!found)
+		return;
+
+	/* Increase/decrease in desired device entitlement */
+	if (desired >= viodev->cmo.desired) {
+		/* Just bump the bus and device values prior to a balance*/
+		vio_cmo.desired += desired - viodev->cmo.desired;
+		viodev->cmo.desired = desired;
+	} else {
+		/* Decrease bus and device values for desired entitlement */
+		vio_cmo.desired -= viodev->cmo.desired - desired;
+		viodev->cmo.desired = desired;
+		/*
+		 * If less entitlement is desired than current entitlement, move
+		 * any reserve memory in the change region to the excess pool.
+		 */
+		if (viodev->cmo.entitled > desired) {
+			vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+			vio_cmo.excess.size += viodev->cmo.entitled - desired;
+			/*
+			 * If entitlement moving from the reserve pool to the
+			 * excess pool is currently unused, add to the excess
+			 * free counter.
+			 */
+			if (viodev->cmo.allocated < viodev->cmo.entitled)
+				vio_cmo.excess.free += viodev->cmo.entitled -
+				                       max(viodev->cmo.allocated, desired);
+			viodev->cmo.entitled = desired;
+		}
+	}
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the devices IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ *          -ENOMEM when entitlement is not available for device or
+ *          device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	struct device *dev = &viodev->dev;
+	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	unsigned long flags;
+	size_t size;
+
+	/*
+	 * Check to see that device has a DMA window and configure
+	 * entitlement for the device.
+	 */
+	if (of_get_property(viodev->dev.archdata.of_node,
+	                    "ibm,my-dma-window", NULL)) {
+		/* Check that the driver is CMO enabled and get desired DMA */
+		if (!viodrv->get_desired_dma) {
+			dev_err(dev, "%s: device driver does not support CMO\n",
+			        __func__);
+			return -EINVAL;
+		}
+
+		viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+			viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		size = VIO_CMO_MIN_ENT;
+
+		dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+		                  GFP_KERNEL);
+		if (!dev_ent)
+			return -ENOMEM;
+
+		dev_ent->viodev = viodev;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+		list_add(&dev_ent->list, &vio_cmo.device_list);
+	} else {
+		viodev->cmo.desired = 0;
+		size = 0;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+	}
+
+	/*
+	 * If the needs for vio_cmo.min have not changed since they
+	 * were last set, the number of devices in the OF tree has
+	 * been constant and the IO memory for this is already in
+	 * the reserve pool.
+	 */
+	if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+	                    VIO_CMO_MIN_ENT)) {
+		/* Updated desired entitlement if device requires it */
+		if (size)
+			vio_cmo.desired += (viodev->cmo.desired -
+		                        VIO_CMO_MIN_ENT);
+	} else {
+		size_t tmp;
+
+		tmp = vio_cmo.spare + vio_cmo.excess.free;
+		if (tmp < size) {
+			dev_err(dev, "%s: insufficient free "
+			        "entitlement to add device. "
+			        "Need %lu, have %lu\n", __func__,
+				size, (vio_cmo.spare + tmp));
+			spin_unlock_irqrestore(&vio_cmo.lock, flags);
+			return -ENOMEM;
+		}
+
+		/* Use excess pool first to fulfill request */
+		tmp = min(size, vio_cmo.excess.free);
+		vio_cmo.excess.free -= tmp;
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+
+		/* Use spare if excess pool was insufficient */
+		vio_cmo.spare -= size - tmp;
+
+		/* Update bus accounting */
+		vio_cmo.min += size;
+		vio_cmo.desired += viodev->cmo.desired;
+	}
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list.  The minimum entitlement
+ * will be reserved for the device as long as it is in the system.  The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (viodev->cmo.allocated) {
+		dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+		        "allocated after remove operation.\n",
+		        __func__, viodev->cmo.allocated);
+		BUG();
+	}
+
+	/*
+	 * Remove the device from the device list being maintained for
+	 * CMO enabled devices.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			list_del(&dev_ent->list);
+			kfree(dev_ent);
+			break;
+		}
+
+	/*
+	 * Devices may not require any entitlement and they do not need
+	 * to be processed.  Otherwise, return the device's entitlement
+	 * back to the pools.
+	 */
+	if (viodev->cmo.entitled) {
+		/*
+		 * This device has not yet left the OF tree, it's
+		 * minimum entitlement remains in vio_cmo.min and
+		 * vio_cmo.desired
+		 */
+		vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+		/*
+		 * Save min allocation for device in reserve as long
+		 * as it exists in OF tree as determined by later
+		 * balance operation
+		 */
+		viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+		/* Replenish spare from freed reserve pool */
+		if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+			tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+			                                 vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			viodev->cmo.entitled -= tmp;
+		}
+
+		/* Remaining reserve goes to excess pool */
+		vio_cmo.excess.size += viodev->cmo.entitled;
+		vio_cmo.excess.free += viodev->cmo.entitled;
+		vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+		/*
+		 * Until the device is removed it will keep a
+		 * minimum entitlement; this will guarantee that
+		 * a module unload/load will result in a success.
+		 */
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		atomic_set(&viodev->cmo.allocs_failed, 0);
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+	vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
+	viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+	struct hvcall_mpp_data mpp_data;
+	int err;
+
+	memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+	spin_lock_init(&vio_cmo.lock);
+	INIT_LIST_HEAD(&vio_cmo.device_list);
+	INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+	/* Get current system entitlement */
+	err = h_get_mpp(&mpp_data);
+
+	/*
+	 * On failure, continue with entitlement set to 0, will panic()
+	 * later when spare is reserved.
+	 */
+	if (err != H_SUCCESS) {
+		printk(KERN_ERR "%s: unable to determine system IO "\
+		       "entitlement. (%d)\n", __func__, err);
+		vio_cmo.entitled = 0;
+	} else {
+		vio_cmo.entitled = mpp_data.entitled_mem;
+	}
+
+	/* Set reservation and check against entitlement */
+	vio_cmo.spare = VIO_CMO_MIN_ENT;
+	vio_cmo.reserve.size = vio_cmo.spare;
+	vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+	                         VIO_CMO_MIN_ENT);
+	if (vio_cmo.reserve.size > vio_cmo.entitled) {
+		printk(KERN_ERR "%s: insufficient system entitlement\n",
+		       __func__);
+		panic("%s: Insufficient system entitlement", __func__);
+	}
+
+	/* Set the remaining accounting variables */
+	vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+	vio_cmo.excess.free = vio_cmo.excess.size;
+	vio_cmo.min = vio_cmo.reserve.size;
+	vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name)                                        \
+static ssize_t viodev_cmo_##name##_show(struct device *dev,             \
+                                        struct device_attribute *attr,  \
+                                         char *buf)                     \
+{                                                                       \
+	return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name);        \
+}
+
+static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	atomic_set(&viodev->cmo.allocs_failed, 0);
+	return count;
+}
+
+static ssize_t viodev_cmo_desired_set(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	size_t new_desired;
+	int ret;
+
+	ret = strict_strtoul(buf, 10, &new_desired);
+	if (ret)
+		return ret;
+
+	vio_cmo_set_dev_desired(viodev, new_desired);
+	return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static struct device_attribute vio_cmo_dev_attrs[] = {
+	__ATTR_RO(name),
+	__ATTR_RO(devspec),
+	__ATTR(cmo_desired,       S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viodev_cmo_desired_show, viodev_cmo_desired_set),
+	__ATTR(cmo_entitled,      S_IRUGO, viodev_cmo_entitled_show,      NULL),
+	__ATTR(cmo_allocated,     S_IRUGO, viodev_cmo_allocated_show,     NULL),
+	__ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
+	__ATTR_NULL
+};
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name)                                        \
+static ssize_t                                                          \
+viobus_cmo_##name##_show(struct bus_type *bt, char *buf)                \
+{                                                                       \
+	return sprintf(buf, "%lu\n", vio_cmo.name);                     \
+}
+
+#define viobus_cmo_pool_rd_attr(name, var)                              \
+static ssize_t                                                          \
+viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf)     \
+{                                                                       \
+	return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
+}
+
+static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
+                                     size_t count)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	vio_cmo.high = vio_cmo.curr;
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+	return count;
+}
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_rd_attr(high);
+
+static struct bus_attribute vio_cmo_bus_attrs[] = {
+	__ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
+	__ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
+	__ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
+	__ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
+	__ATTR(cmo_spare,   S_IRUGO, viobus_cmo_spare_show,   NULL),
+	__ATTR(cmo_min,     S_IRUGO, viobus_cmo_min_show,     NULL),
+	__ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
+	__ATTR(cmo_curr,    S_IRUGO, viobus_cmo_curr_show,    NULL),
+	__ATTR(cmo_high,    S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viobus_cmo_high_show, viobus_cmo_high_reset),
+	__ATTR_NULL
+};
+
+static void vio_cmo_sysfs_init(void)
+{
+	vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
+	vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
+}
+#else /* CONFIG_PPC_SMLPAR */
+/* Dummy functions for iSeries platform */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init() {}
+static void vio_cmo_sysfs_init() { }
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
 static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 {
 	const unsigned char *dma_window;
@@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev)
 		return error;
 
 	id = vio_match_device(viodrv->id_table, viodev);
-	if (id)
+	if (id) {
+		memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			error = vio_cmo_bus_probe(viodev);
+			if (error)
+				return error;
+		}
 		error = viodrv->probe(viodev, id);
+		if (error)
+			vio_cmo_bus_remove(viodev);
+	}
 
 	return error;
 }
@@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	struct device *devptr;
+	int ret = 1;
+
+	/*
+	 * Hold a reference to the device after the remove function is called
+	 * to allow for CMO accounting cleanup for the device.
+	 */
+	devptr = get_device(dev);
 
 	if (viodrv->remove)
-		return viodrv->remove(viodev);
+		ret = viodrv->remove(viodev);
+
+	if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_bus_remove(viodev);
 
-	/* driver can't remove */
-	return 1;
+	put_device(devptr);
+	return ret;
 }
 
 /**
@@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 			viodev->unit_address = *unit_address;
 	}
 	viodev->dev.archdata.of_node = of_node_get(of_node);
-	viodev->dev.archdata.dma_ops = &dma_iommu_ops;
+
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_set_dma_ops(viodev);
+	else
+		viodev->dev.archdata.dma_ops = &dma_iommu_ops;
 	viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev);
 	viodev->dev.archdata.numa_node = of_node_to_nid(of_node);
 
@@ -245,6 +1260,9 @@ static int __init vio_bus_init(void)
 	int err;
 	struct device_node *node_vroot;
 
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_sysfs_init();
+
 	err = bus_register(&vio_bus_type);
 	if (err) {
 		printk(KERN_ERR "failed to register VIO bus\n");
@@ -262,6 +1280,9 @@ static int __init vio_bus_init(void)
 		return err;
 	}
 
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_bus_init();
+
 	node_vroot = of_find_node_by_name(NULL, "vdevice");
 	if (node_vroot) {
 		struct device_node *of_node;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index a914411bced..4a8ce62fe11 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -85,7 +85,7 @@ SECTIONS
 
 	/* The dummy segment contents for the bug workaround mentioned above
 	   near PHDRS.  */
-	.dummy : {
+	.dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
 		LONG(0xf177)
 	} :kernel :dummy
 
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0559fe086eb..7c975d43e3f 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
+#include <linux/mm.h>
 #include <asm/page.h>
 #include <asm/code-patching.h>
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 1707d00331f..565b7a237c8 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs)
 	return 0;
 }
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
-		    unsigned long error_code)
-{
-	siginfo_t info;
-
-	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
-			11, SIGSEGV) == NOTIFY_STOP)
-		return;
-
-	if (debugger_dabr_match(regs))
-		return;
-
-	/* Clear the DABR */
-	set_dabr(0);
-
-	/* Deliver the signal to userspace */
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr = (void __user *)address;
-	force_sig_info(SIGTRAP, &info, current);
-}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
-
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
  * for a data fault, SRR1 for an instruction fault. For 400-family processors
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8d3b58ebd38..5ce5a4dcd00 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -68,6 +68,7 @@
 
 #define KB (1024)
 #define MB (1024*KB)
+#define GB (1024L*MB)
 
 /*
  * Note:  pte   --> Linux PTE
@@ -102,7 +103,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
 #ifdef CONFIG_HUGETLB_PAGE
-int mmu_huge_psize = MMU_PAGE_16M;
 unsigned int HPAGE_SHIFT;
 #endif
 #ifdef CONFIG_PPC_64K_PAGES
@@ -329,6 +329,44 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
 	return 0;
 }
 
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+					const char *uname, int depth,
+					void *data) {
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	unsigned long *addr_prop;
+	u32 *page_count_prop;
+	unsigned int expected_pages;
+	long unsigned int phys_addr;
+	long unsigned int block_size;
+
+	/* We are scanning "memory" nodes only */
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	/* This property is the log base 2 of the number of virtual pages that
+	 * will represent this memory block. */
+	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+	if (page_count_prop == NULL)
+		return 0;
+	expected_pages = (1 << page_count_prop[0]);
+	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+	if (addr_prop == NULL)
+		return 0;
+	phys_addr = addr_prop[0];
+	block_size = addr_prop[1];
+	if (block_size != (16 * GB))
+		return 0;
+	printk(KERN_INFO "Huge page(16GB) memory: "
+			"addr = 0x%lX size = 0x%lX pages = %d\n",
+			phys_addr, block_size, expected_pages);
+	lmb_reserve(phys_addr, block_size * expected_pages);
+	add_gpage(phys_addr, block_size, expected_pages);
+	return 0;
+}
+
 static void __init htab_init_page_sizes(void)
 {
 	int rc;
@@ -418,15 +456,18 @@ static void __init htab_init_page_sizes(void)
 	       );
 
 #ifdef CONFIG_HUGETLB_PAGE
-	/* Init large page size. Currently, we pick 16M or 1M depending
+	/* Reserve 16G huge page memory sections for huge pages */
+	of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+
+/* Set default large page size. Currently, we pick 16M or 1M depending
 	 * on what is available
 	 */
 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
-		set_huge_psize(MMU_PAGE_16M);
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
 	/* With 4k/4level pagetables, we can't (for now) cope with a
 	 * huge page size < PMD_SIZE */
 	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-		set_huge_psize(MMU_PAGE_1M);
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
@@ -847,7 +888,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 
 #ifdef CONFIG_HUGETLB_PAGE
 	/* Handle hugepage regions */
-	if (HPAGE_SHIFT && psize == mmu_huge_psize) {
+	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
 		DBG_LOW(" -> huge page !\n");
 		return hash_huge_page(mm, access, ea, vsid, local, trap);
 	}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0d12fba31bc..ed0aab0208a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,21 +24,43 @@
 #include <asm/cputable.h>
 #include <asm/spu.h>
 
-#define HPAGE_SHIFT_64K	16
-#define HPAGE_SHIFT_16M	24
+#define PAGE_SHIFT_64K	16
+#define PAGE_SHIFT_16M	24
+#define PAGE_SHIFT_16G	34
 
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define MAX_NUMBER_GPAGES	1024
 
-unsigned int hugepte_shift;
-#define PTRS_PER_HUGEPTE	(1 << hugepte_shift)
-#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << hugepte_shift)
+/* Tracks the 16G pages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  */
+static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;
 
-#define HUGEPD_SHIFT		(HPAGE_SHIFT + hugepte_shift)
-#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
-#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
+/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
+ * stored for the huge page sizes that are valid.
+ */
+unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
+
+#define hugepte_shift			mmu_huge_psizes
+#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
+#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
+
+#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
+						+ hugepte_shift[psize])
+#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
+#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
+
+/* Subtract one from array size because we don't need a cache for 4K since
+ * is not a huge page size */
+#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
+							+ psize-1])
+#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
 
-#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
+static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
+	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
+	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
+};
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
@@ -49,24 +71,49 @@ typedef struct { unsigned long pd; } hugepd_t;
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
+static inline int shift_to_mmu_psize(unsigned int shift)
+{
+	switch (shift) {
+#ifndef CONFIG_PPC_64K_PAGES
+	case PAGE_SHIFT_64K:
+	    return MMU_PAGE_64K;
+#endif
+	case PAGE_SHIFT_16M:
+	    return MMU_PAGE_16M;
+	case PAGE_SHIFT_16G:
+	    return MMU_PAGE_16G;
+	}
+	return -1;
+}
+
+static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
+{
+	if (mmu_psize_defs[mmu_psize].shift)
+		return mmu_psize_defs[mmu_psize].shift;
+	BUG();
+}
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
 	BUG_ON(!(hpd.pd & HUGEPD_OK));
 	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+				    struct hstate *hstate)
 {
-	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+	unsigned int shift = huge_page_shift(hstate);
+	int psize = shift_to_mmu_psize(shift);
+	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
 	pte_t *dir = hugepd_page(*hpdp);
 
 	return dir + idx;
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-			   unsigned long address)
+			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
 				      GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
@@ -74,7 +121,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(huge_pgtable_cache, new);
+		kmem_cache_free(huge_pgtable_cache(psize), new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -83,27 +130,60 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 
 /* Base page size affects how we walk hugetlb page tables */
 #ifdef CONFIG_PPC_64K_PAGES
-#define hpmd_offset(pud, addr)		pmd_offset(pud, addr)
-#define hpmd_alloc(mm, pud, addr)	pmd_alloc(mm, pud, addr)
+#define hpmd_offset(pud, addr, h)	pmd_offset(pud, addr)
+#define hpmd_alloc(mm, pud, addr, h)	pmd_alloc(mm, pud, addr)
 #else
 static inline
-pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
+pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 {
-	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+	if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
 		return pmd_offset(pud, addr);
 	else
 		return (pmd_t *) pud;
 }
 static inline
-pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
+pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
+		  struct hstate *hstate)
 {
-	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+	if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
 		return pmd_alloc(mm, pud, addr);
 	else
 		return (pmd_t *) pud;
 }
 #endif
 
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(unsigned long addr, unsigned long page_size,
+	unsigned long number_of_pages)
+{
+	if (!addr)
+		return;
+	while (number_of_pages > 0) {
+		gpage_freearray[nr_gpages] = addr;
+		nr_gpages++;
+		number_of_pages--;
+		addr += page_size;
+	}
+}
+
+/* Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	if (nr_gpages == 0)
+		return 0;
+	m = phys_to_virt(gpage_freearray[--nr_gpages]);
+	gpage_freearray[nr_gpages] = 0;
+	list_add(&m->list, &huge_boot_pages);
+	m->hstate = hstate;
+	return 1;
+}
+
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
@@ -111,39 +191,52 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	pud_t *pu;
 	pmd_t *pm;
 
-	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
+	unsigned int psize;
+	unsigned int shift;
+	unsigned long sz;
+	struct hstate *hstate;
+	psize = get_slice_psize(mm, addr);
+	shift = mmu_psize_to_shift(psize);
+	sz = ((1UL) << shift);
+	hstate = size_to_hstate(sz);
 
-	addr &= HPAGE_MASK;
+	addr &= hstate->mask;
 
 	pg = pgd_offset(mm, addr);
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
-			pm = hpmd_offset(pu, addr);
+			pm = hpmd_offset(pu, addr, hstate);
 			if (!pmd_none(*pm))
-				return hugepte_offset((hugepd_t *)pm, addr);
+				return hugepte_offset((hugepd_t *)pm, addr,
+						      hstate);
 		}
 	}
 
 	return NULL;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
 	pud_t *pu;
 	pmd_t *pm;
 	hugepd_t *hpdp = NULL;
+	struct hstate *hstate;
+	unsigned int psize;
+	hstate = size_to_hstate(sz);
 
-	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
+	psize = get_slice_psize(mm, addr);
+	BUG_ON(!mmu_huge_psizes[psize]);
 
-	addr &= HPAGE_MASK;
+	addr &= hstate->mask;
 
 	pg = pgd_offset(mm, addr);
 	pu = pud_alloc(mm, pg, addr);
 
 	if (pu) {
-		pm = hpmd_alloc(mm, pu, addr);
+		pm = hpmd_alloc(mm, pu, addr, hstate);
 		if (pm)
 			hpdp = (hugepd_t *)pm;
 	}
@@ -151,10 +244,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	if (! hpdp)
 		return NULL;
 
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 		return NULL;
 
-	return hugepte_offset(hpdp, addr);
+	return hugepte_offset(hpdp, addr, hstate);
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
@@ -162,19 +255,22 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 	return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
+			       unsigned int psize)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
+						 HUGEPTE_CACHE_NUM+psize-1,
 						 PGF_CACHENUM_MASK));
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				   unsigned long addr, unsigned long end,
-				   unsigned long floor, unsigned long ceiling)
+				   unsigned long floor, unsigned long ceiling,
+				   unsigned int psize)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -186,7 +282,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd))
 			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pmd);
+		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -212,6 +308,9 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud_t *pud;
 	unsigned long next;
 	unsigned long start;
+	unsigned int shift;
+	unsigned int psize = get_slice_psize(tlb->mm, addr);
+	shift = mmu_psize_to_shift(psize);
 
 	start = addr;
 	pud = pud_offset(pgd, addr);
@@ -220,16 +319,18 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 #ifdef CONFIG_PPC_64K_PAGES
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling,
+				       psize);
 #else
-		if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
+		if (shift == PAGE_SHIFT_64K) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
-			hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+					       ceiling, psize);
 		} else {
 			if (pud_none(*pud))
 				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pud);
+			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
 		}
 #endif
 	} while (pud++, addr = next, addr != end);
@@ -255,7 +356,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			    unsigned long addr, unsigned long end,
 			    unsigned long floor, unsigned long ceiling)
 {
@@ -297,31 +398,33 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb,
 	 * now has no other vmas using it, so can be freed, we don't
 	 * bother to round floor or end up - the tests don't need that.
 	 */
+	unsigned int psize = get_slice_psize(tlb->mm, addr);
 
-	addr &= HUGEPD_MASK;
+	addr &= HUGEPD_MASK(psize);
 	if (addr < floor) {
-		addr += HUGEPD_SIZE;
+		addr += HUGEPD_SIZE(psize);
 		if (!addr)
 			return;
 	}
 	if (ceiling) {
-		ceiling &= HUGEPD_MASK;
+		ceiling &= HUGEPD_MASK(psize);
 		if (!ceiling)
 			return;
 	}
 	if (end - 1 > ceiling - 1)
-		end -= HUGEPD_SIZE;
+		end -= HUGEPD_SIZE(psize);
 	if (addr > end - 1)
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
-		BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
+		psize = get_slice_psize(tlb->mm, addr);
+		BUG_ON(!mmu_huge_psizes[psize]);
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -334,7 +437,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		 * necessary anymore if we make hpte_need_flush() get the
 		 * page size from the slices
 		 */
-		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
+		unsigned int psize = get_slice_psize(mm, addr);
+		unsigned int shift = mmu_psize_to_shift(psize);
+		unsigned long sz = ((1UL) << shift);
+		struct hstate *hstate = size_to_hstate(sz);
+		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
 	}
 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
@@ -351,14 +458,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
 	struct page *page;
+	unsigned int mmu_psize = get_slice_psize(mm, address);
 
-	if (get_slice_psize(mm, address) != mmu_huge_psize)
+	/* Verify it is a huge page else bail. */
+	if (!mmu_huge_psizes[mmu_psize])
 		return ERR_PTR(-EINVAL);
 
 	ptep = huge_pte_offset(mm, address);
 	page = pte_page(*ptep);
-	if (page)
-		page += (address % HPAGE_SIZE) / PAGE_SIZE;
+	if (page) {
+		unsigned int shift = mmu_psize_to_shift(mmu_psize);
+		unsigned long sz = ((1UL) << shift);
+		page += (address % sz) / PAGE_SIZE;
+	}
 
 	return page;
 }
@@ -368,6 +480,11 @@ int pmd_huge(pmd_t pmd)
 	return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
@@ -381,15 +498,16 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
-	return slice_get_unmapped_area(addr, len, flags,
-				       mmu_huge_psize, 1, 0);
+	struct hstate *hstate = hstate_file(file);
+	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
 /*
  * Called by asm hashtable.S for doing lazy icache flush
  */
 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-						  pte_t pte, int trap)
+					pte_t pte, int trap, unsigned long sz)
 {
 	struct page *page;
 	int i;
@@ -402,7 +520,7 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 	/* page is dirty */
 	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 		if (trap == 0x400) {
-			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+			for (i = 0; i < (sz / PAGE_SIZE); i++)
 				__flush_dcache_icache(page_address(page+i));
 			set_bit(PG_arch_1, &page->flags);
 		} else {
@@ -418,11 +536,16 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 {
 	pte_t *ptep;
 	unsigned long old_pte, new_pte;
-	unsigned long va, rflags, pa;
+	unsigned long va, rflags, pa, sz;
 	long slot;
 	int err = 1;
 	int ssize = user_segment_size(ea);
+	unsigned int mmu_psize;
+	int shift;
+	mmu_psize = get_slice_psize(mm, ea);
 
+	if (!mmu_huge_psizes[mmu_psize])
+		goto out;
 	ptep = huge_pte_offset(mm, ea);
 
 	/* Search the Linux page table for a match with va */
@@ -465,30 +588,32 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	shift = mmu_psize_to_shift(mmu_psize);
+	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we
 		 * don't need to worry about that case */
 		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap);
+						       trap, sz);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(old_pte & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
 		unsigned long hash, slot;
 
-		hash = hpt_hash(va, HPAGE_SHIFT, ssize);
+		hash = hpt_hash(va, shift, ssize);
 		if (old_pte & _PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
-		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
 					 ssize, local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
 	if (likely(!(old_pte & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
+		unsigned long hash = hpt_hash(va, shift, ssize);
 		unsigned long hpte_group;
 
 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
@@ -509,7 +634,7 @@ repeat:
 
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-					  mmu_huge_psize, ssize);
+					  mmu_psize, ssize);
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
@@ -517,7 +642,7 @@ repeat:
 				      HPTES_PER_GROUP) & ~0x7UL; 
 			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 						  HPTE_V_SECONDARY,
-						  mmu_huge_psize, ssize);
+						  mmu_psize, ssize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
 					hpte_group = ((hash & htab_hash_mask) *
@@ -549,45 +674,54 @@ void set_huge_psize(int psize)
 {
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
+	if (mmu_psize_defs[psize].shift &&
+		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
 		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-			mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
-		HPAGE_SHIFT = mmu_psize_defs[psize].shift;
-		mmu_huge_psize = psize;
-#ifdef CONFIG_PPC_64K_PAGES
-		hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
-#else
-		if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
-			hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
-		else
-			hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
-#endif
-
+		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
+		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
+		/* Return if huge page size has already been setup or is the
+		 * same as the base page size. */
+		if (mmu_huge_psizes[psize] ||
+		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
+			return;
+		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
+
+		switch (mmu_psize_defs[psize].shift) {
+		case PAGE_SHIFT_64K:
+		    /* We only allow 64k hpages with 4k base page,
+		     * which was checked above, and always put them
+		     * at the PMD */
+		    hugepte_shift[psize] = PMD_SHIFT;
+		    break;
+		case PAGE_SHIFT_16M:
+		    /* 16M pages can be at two different levels
+		     * of pagestables based on base page size */
+		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
+			    hugepte_shift[psize] = PMD_SHIFT;
+		    else /* 4k base page */
+			    hugepte_shift[psize] = PUD_SHIFT;
+		    break;
+		case PAGE_SHIFT_16G:
+		    /* 16G pages are always at PGD level */
+		    hugepte_shift[psize] = PGDIR_SHIFT;
+		    break;
+		}
+		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
 	} else
-		HPAGE_SHIFT = 0;
+		hugepte_shift[psize] = 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {
 	unsigned long long size;
-	int mmu_psize = -1;
+	int mmu_psize;
 	int shift;
 
 	size = memparse(str, &str);
 
 	shift = __ffs(size);
-	switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-	case HPAGE_SHIFT_64K:
-		mmu_psize = MMU_PAGE_64K;
-		break;
-#endif
-	case HPAGE_SHIFT_16M:
-		mmu_psize = MMU_PAGE_16M;
-		break;
-	}
-
-	if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
+	mmu_psize = shift_to_mmu_psize(shift);
+	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 		set_huge_psize(mmu_psize);
 	else
 		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
@@ -596,23 +730,33 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
-{
-	memset(addr, 0, kmem_cache_size(cache));
-}
-
 static int __init hugetlbpage_init(void)
 {
+	unsigned int psize;
+
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
-
-	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
-					       HUGEPTE_TABLE_SIZE,
-					       HUGEPTE_TABLE_SIZE,
-					       0,
-					       zero_ctor);
-	if (! huge_pgtable_cache)
-		panic("hugetlbpage_init(): could not create hugepte cache\n");
+	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
+	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
+	 * sizes changes.
+	 */
+	set_huge_psize(MMU_PAGE_16M);
+	set_huge_psize(MMU_PAGE_64K);
+	set_huge_psize(MMU_PAGE_16G);
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		if (mmu_huge_psizes[psize]) {
+			huge_pgtable_cache(psize) = kmem_cache_create(
+						HUGEPTE_CACHE_NAME(psize),
+						HUGEPTE_TABLE_SIZE(psize),
+						HUGEPTE_TABLE_SIZE(psize),
+						0,
+						NULL);
+			if (!huge_pgtable_cache(psize))
+				panic("hugetlbpage_init(): could not create %s"\
+				      "\n", HUGEPTE_CACHE_NAME(psize));
+		}
+	}
 
 	return 0;
 }
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 6ef63caca68..4f7df85129d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -136,9 +136,14 @@ static int __init setup_kcore(void)
 module_init(setup_kcore);
 #endif
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+static void pgd_ctor(void *addr)
 {
-	memset(addr, 0, kmem_cache_size(cache));
+	memset(addr, 0, PGD_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+	memset(addr, 0, PMD_TABLE_SIZE);
 }
 
 static const unsigned int pgtable_cache_size[2] = {
@@ -153,29 +158,18 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 };
 
 #ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
- * can't put into the tables above, because HPAGE_SHIFT is not compile
- * time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+/* Hugepages need an extra cache per hugepagesize, initialized in
+ * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
+ * is not compile time constant. */
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
 #else
 struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 #endif
 
 void pgtable_cache_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
-		int size = pgtable_cache_size[i];
-		const char *name = pgtable_cache_name[i];
-
-		pr_debug("Allocating page table cache %s (#%d) "
-			"for size: %08x...\n", name, i, size);
-		pgtable_cache[i] = kmem_cache_create(name,
-						     size, size,
-						     SLAB_PANIC,
-						     zero_ctor);
-	}
+	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
+	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1ca2235f096..702691cb9e8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -186,45 +186,6 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
 }
 EXPORT_SYMBOL_GPL(walk_memory_resource);
 
-void show_mem(void)
-{
-	unsigned long total = 0, reserved = 0;
-	unsigned long shared = 0, cached = 0;
-	unsigned long highmem = 0;
-	struct page *page;
-	pg_data_t *pgdat;
-	unsigned long i;
-
-	printk("Mem-info:\n");
-	show_free_areas();
-	for_each_online_pgdat(pgdat) {
-		unsigned long flags;
-		pgdat_resize_lock(pgdat, &flags);
-		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-			page = pgdat_page_nr(pgdat, i);
-			total++;
-			if (PageHighMem(page))
-				highmem++;
-			if (PageReserved(page))
-				reserved++;
-			else if (PageSwapCache(page))
-				cached++;
-			else if (page_count(page))
-				shared += page_count(page) - 1;
-		}
-		pgdat_resize_unlock(pgdat, &flags);
-	}
-	printk("%ld pages of RAM\n", total);
-#ifdef CONFIG_HIGHMEM
-	printk("%ld pages of HIGHMEM\n", highmem);
-#endif
-	printk("%ld reserved pages\n", reserved);
-	printk("%ld pages shared\n", shared);
-	printk("%ld pages swap cached\n", cached);
-}
-
 /*
  * Initialize the bootmem system and give it all the memory we
  * have available.  If we are using highmem, we only put the
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index cf4bffba6f7..d9a18135133 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -39,7 +39,6 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
 EXPORT_SYMBOL(node_data);
 
-static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
 
@@ -816,7 +815,7 @@ void __init do_init_bootmem(void)
   		dbg("node %d\n", nid);
 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 
-		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 		NODE_DATA(nid)->node_start_pfn = start_pfn;
 		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c7584072dfc..2001abdb191 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -145,13 +145,20 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 void __iomem *
 ioremap(phys_addr_t addr, unsigned long size)
 {
-	return __ioremap(addr, size, _PAGE_NO_CACHE);
+	return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
 }
 EXPORT_SYMBOL(ioremap);
 
 void __iomem *
 ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC);
+
 	return __ioremap(addr, size, flags);
 }
 EXPORT_SYMBOL(ioremap_flags);
@@ -163,6 +170,14 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 	phys_addr_t p;
 	int err;
 
+	/* Make sure we have the base flags */
+	if ((flags & _PAGE_PRESENT) == 0)
+		flags |= _PAGE_KERNEL;
+
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
 	/*
 	 * Choose an address to map it to.
 	 * Once the vmalloc system is running, we use it.
@@ -219,11 +234,6 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 		v = (ioremap_bot -= size);
 	}
 
-	if ((flags & _PAGE_PRESENT) == 0)
-		flags |= _PAGE_KERNEL;
-	if (flags & _PAGE_NO_CACHE)
-		flags |= _PAGE_GUARDED;
-
 	/*
 	 * Should check if it is a candidate for a BAT mapping
 	 */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3ef0ad2f9ca..365e61ae5db 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -107,9 +107,18 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 {
 	unsigned long i;
 
+	/* Make sure we have the base flags */
 	if ((flags & _PAGE_PRESENT) == 0)
 		flags |= pgprot_val(PAGE_KERNEL);
 
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
+	/* We don't support the 4K PFN hack with ioremap */
+	if (flags & _PAGE_4K_PFN)
+		return NULL;
+
 	WARN_ON(pa & ~PAGE_MASK);
 	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 	WARN_ON(size & ~PAGE_MASK);
@@ -190,6 +199,13 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
 			     unsigned long flags)
 {
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
 	if (ppc_md.ioremap)
 		return ppc_md.ioremap(addr, size, flags);
 	return __ioremap(addr, size, flags);
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index a01b5c608ff..409fcc7b63c 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -147,7 +147,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	 */
 	if (huge) {
 #ifdef CONFIG_HUGETLB_PAGE
-		psize = mmu_huge_psize;
+		psize = get_slice_psize(mm, addr);;
 #else
 		BUG();
 		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index d664b1bce38..696a5ee4962 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -1,7 +1,6 @@
 config PPC_MPC52xx
 	bool "52xx-based boards"
 	depends on PPC_MULTIPLATFORM && PPC32
-	select FSL_SOC
 	select PPC_CLOCK
 	select PPC_PCI_CHOICE
 
@@ -48,6 +47,7 @@ config PPC_MPC5200_BUGFIX
 config PPC_MPC5200_GPIO
 	bool "MPC5200 GPIO support"
 	depends on PPC_MPC52xx
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
+	select GENERIC_GPIO
 	help
 	  Enable gpiolib support for mpc5200 based boards
diff --git a/arch/powerpc/platforms/cell/celleb_scc_pciex.c b/arch/powerpc/platforms/cell/celleb_scc_pciex.c
index 0e04f8fb152..3e7e0f1568e 100644
--- a/arch/powerpc/platforms/cell/celleb_scc_pciex.c
+++ b/arch/powerpc/platforms/cell/celleb_scc_pciex.c
@@ -281,7 +281,7 @@ static int __init scc_pciex_iowa_init(struct iowa_bus *bus, void *data)
 
 	dummy_page_da = dma_map_single(bus->phb->parent, dummy_page_va,
 				       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dummy_page_da)) {
+	if (dma_mapping_error(bus->phb->parent, dummy_page_da)) {
 		pr_err("PCIEX:Map dummy page failed.\n");
 		kfree(dummy_page_va);
 		return -1;
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 208005ca262..e06420af5fe 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -172,7 +172,7 @@ static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte,
 	}
 }
 
-static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
+static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
@@ -213,6 +213,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
 
 	pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
 		 index, npages, direction, base_pte);
+	return 0;
 }
 
 static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
@@ -1150,12 +1151,23 @@ static int iommu_fixed_disabled;
 
 static int __init setup_iommu_fixed(char *str)
 {
+	struct device_node *pciep;
+
 	if (strcmp(str, "off") == 0)
 		iommu_fixed_disabled = 1;
 
-	else if (strcmp(str, "weak") == 0)
+	/* If we can find a pcie-endpoint in the device tree assume that
+	 * we're on a triblade or a CAB so by default the fixed mapping
+	 * should be set to be weakly ordered; but only if the boot
+	 * option WASN'T set for strong ordering
+	 */
+	pciep = of_find_node_by_type(NULL, "pcie-endpoint");
+
+	if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
 		iommu_fixed_is_weak = 1;
 
+	of_node_put(pciep);
+
 	return 1;
 }
 __setup("iommu_fixed=", setup_iommu_fixed);
diff --git a/arch/powerpc/platforms/cell/spider-pci.c b/arch/powerpc/platforms/cell/spider-pci.c
index 418b605ac35..5122ec14527 100644
--- a/arch/powerpc/platforms/cell/spider-pci.c
+++ b/arch/powerpc/platforms/cell/spider-pci.c
@@ -111,7 +111,7 @@ static int __init spiderpci_pci_setup_chip(struct pci_controller *phb,
 
 	dummy_page_da = dma_map_single(phb->parent, dummy_page_va,
 				       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dummy_page_da)) {
+	if (dma_mapping_error(phb->parent, dummy_page_da)) {
 		pr_err("SPIDER-IOWA:Map dummy page filed.\n");
 		kfree(dummy_page_va);
 		return -1;
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 99c73066b82..010a51f5979 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -288,9 +288,32 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return VM_FAULT_NOPAGE;
 }
 
+static int spufs_mem_mmap_access(struct vm_area_struct *vma,
+				unsigned long address,
+				void *buf, int len, int write)
+{
+	struct spu_context *ctx = vma->vm_file->private_data;
+	unsigned long offset = address - vma->vm_start;
+	char *local_store;
+
+	if (write && !(vma->vm_flags & VM_WRITE))
+		return -EACCES;
+	if (spu_acquire(ctx))
+		return -EINTR;
+	if ((offset + len) > vma->vm_end)
+		len = vma->vm_end - offset;
+	local_store = ctx->ops->get_ls(ctx);
+	if (write)
+		memcpy_toio(local_store + offset, buf, len);
+	else
+		memcpy_fromio(buf, local_store + offset, len);
+	spu_release(ctx);
+	return len;
+}
 
 static struct vm_operations_struct spufs_mem_mmap_vmops = {
 	.fault = spufs_mem_mmap_fault,
+	.access = spufs_mem_mmap_access,
 };
 
 static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7123472801d..690ca7b0dcf 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -78,7 +78,7 @@ spufs_destroy_inode(struct inode *inode)
 }
 
 static void
-spufs_init_once(struct kmem_cache *cachep, void *p)
+spufs_init_once(void *p)
 {
 	struct spufs_inode_info *ei = p;
 
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 34654743363..2deeeba7ecc 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -312,11 +312,28 @@ static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
 	 */
 	node = cpu_to_node(raw_smp_processor_id());
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
+		int available_spus;
+
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
 			continue;
+
+		available_spus = 0;
 		mutex_lock(&cbe_spu_info[node].list_mutex);
 		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if (spu->ctx && spu->ctx->gang
+					&& spu->ctx->aff_offset == 0)
+				available_spus -=
+					(spu->ctx->gang->contexts - 1);
+			else
+				available_spus++;
+		}
+		if (available_spus < ctx->gang->contexts) {
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
+			continue;
+		}
+
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			if ((!mem_aff || spu->has_mem_affinity) &&
 							sched_spu(spu)) {
 				mutex_unlock(&cbe_spu_info[node].list_mutex);
@@ -389,6 +406,9 @@ static int has_affinity(struct spu_context *ctx)
 	if (list_empty(&ctx->aff_list))
 		return 0;
 
+	if (atomic_read(&ctx->gang->aff_sched_count) == 0)
+		ctx->gang->aff_ref_spu = NULL;
+
 	if (!gang->aff_ref_spu) {
 		if (!(gang->aff_flags & AFF_MERGED))
 			aff_merge_remaining_ctxs(gang);
@@ -416,14 +436,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
  	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
 		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
 
-	if (ctx->gang){
-		mutex_lock(&ctx->gang->aff_mutex);
-		if (has_affinity(ctx)) {
-			if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
-				ctx->gang->aff_ref_spu = NULL;
-		}
-		mutex_unlock(&ctx->gang->aff_mutex);
-	}
+	if (ctx->gang)
+		atomic_dec_if_positive(&ctx->gang->aff_sched_count);
 
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
@@ -562,10 +576,7 @@ static struct spu *spu_get_idle(struct spu_context *ctx)
 				goto found;
 			mutex_unlock(&cbe_spu_info[node].list_mutex);
 
-			mutex_lock(&ctx->gang->aff_mutex);
-			if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
-				ctx->gang->aff_ref_spu = NULL;
-			mutex_unlock(&ctx->gang->aff_mutex);
+			atomic_dec(&ctx->gang->aff_sched_count);
 			goto not_found;
 		}
 		mutex_unlock(&ctx->gang->aff_mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c
index 8c0e95766a6..92d20e993ed 100644
--- a/arch/powerpc/platforms/cell/spufs/sputrace.c
+++ b/arch/powerpc/platforms/cell/spufs/sputrace.c
@@ -196,8 +196,7 @@ static int __init sputrace_init(void)
 	struct proc_dir_entry *entry;
 	int i, error = -ENOMEM;
 
-	sputrace_log = kcalloc(sizeof(struct sputrace),
-				bufsize, GFP_KERNEL);
+	sputrace_log = kcalloc(bufsize, sizeof(struct sputrace), GFP_KERNEL);
 	if (!sputrace_log)
 		goto out;
 
diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c
index bc818e4e203..bb464d1211b 100644
--- a/arch/powerpc/platforms/iseries/iommu.c
+++ b/arch/powerpc/platforms/iseries/iommu.c
@@ -41,7 +41,7 @@
 #include <asm/iseries/hv_call_event.h>
 #include <asm/iseries/iommu.h>
 
-static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
+static int tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
@@ -71,6 +71,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 		index++;
 		uaddr += TCE_PAGE_SIZE;
 	}
+	return 0;
 }
 
 static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c
index 1dc7295746d..731d7b15774 100644
--- a/arch/powerpc/platforms/iseries/mf.c
+++ b/arch/powerpc/platforms/iseries/mf.c
@@ -871,7 +871,7 @@ static int proc_mf_dump_cmdline(char *page, char **start, off_t off,
 		count = 256 - off;
 
 	dma_addr = iseries_hv_map(page, off + count, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dma_addr))
+	if (dma_mapping_error(NULL, dma_addr))
 		return -ENOMEM;
 	memset(page, 0, off + count);
 	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index b72120751bb..70b688c1aef 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -561,7 +561,7 @@ static void yield_shared_processor(void)
 static void iseries_shared_idle(void)
 {
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched() && !hvlpevent_is_pending()) {
 			local_irq_disable();
 			ppc64_runlatch_off();
@@ -591,7 +591,7 @@ static void iseries_dedicated_idle(void)
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		if (!need_resched()) {
 			while (!need_resched()) {
 				ppc64_runlatch_off();
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 70541b7a501..a0ff03a3d8d 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -83,7 +83,7 @@ static u32 *iob_l2_base;
 static struct iommu_table iommu_table_iobmap;
 static int iommu_table_iobmap_inited;
 
-static void iobmap_build(struct iommu_table *tbl, long index,
+static int iobmap_build(struct iommu_table *tbl, long index,
 			 long npages, unsigned long uaddr,
 			 enum dma_data_direction direction,
 			 struct dma_attrs *attrs)
@@ -108,6 +108,7 @@ static void iobmap_build(struct iommu_table *tbl, long index,
 		uaddr += IOBMAP_PAGE_SIZE;
 		bus_addr += IOBMAP_PAGE_SIZE;
 	}
+	return 0;
 }
 
 
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 757c0296e0b..97619fd51e3 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -40,3 +40,26 @@ config PPC_PSERIES_DEBUG
 	depends on PPC_PSERIES && PPC_EARLY_DEBUG
 	bool "Enable extra debug logging in platforms/pseries"
 	default y
+
+config PPC_SMLPAR
+	bool "Support for shared-memory logical partitions"
+	depends on PPC_PSERIES
+	select LPARCFG
+	default n
+	help
+	  Select this option to enable shared memory partition support.
+	  With this option a system running in an LPAR can be given more
+	  memory than physically available and will allow firmware to
+	  balance memory across many LPARs.
+
+config CMM
+	tristate "Collaborative memory management"
+	depends on PPC_SMLPAR
+	default y
+	help
+	  Select this option, if you want to enable the kernel interface
+	  to reduce the memory size of the system. This is accomplished
+	  by allocating pages of memory and put them "on hold". This only
+	  makes sense for a system running in an LPAR where the unused pages
+	  will be reused for other LPARs. The interface allows firmware to
+	  balance memory across many LPARs.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 554c6e42ef2..dfe574af2dc 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
 obj-$(CONFIG_HVCS)		+= hvcserver.o
 obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
 obj-$(CONFIG_PHYP_DUMP)	+= phyp_dump.o
+obj-$(CONFIG_CMM)		+= cmm.o
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
new file mode 100644
index 00000000000..c6b3be03168
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -0,0 +1,468 @@
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Author(s): Brian King (brking@linux.vnet.ibm.com),
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/sched.h>
+#include <linux/stringify.h>
+#include <linux/swap.h>
+#include <linux/sysdev.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+
+#include "plpar_wrappers.h"
+
+#define CMM_DRIVER_VERSION	"1.0.0"
+#define CMM_DEFAULT_DELAY	1
+#define CMM_DEBUG			0
+#define CMM_DISABLE		0
+#define CMM_OOM_KB		1024
+#define CMM_MIN_MEM_MB		256
+#define KB2PAGES(_p)		((_p)>>(PAGE_SHIFT-10))
+#define PAGES2KB(_p)		((_p)<<(PAGE_SHIFT-10))
+
+static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int oom_kb = CMM_OOM_KB;
+static unsigned int cmm_debug = CMM_DEBUG;
+static unsigned int cmm_disabled = CMM_DISABLE;
+static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
+static struct sys_device cmm_sysdev;
+
+MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CMM_DRIVER_VERSION);
+
+module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
+		 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
+		 "[Default=" __stringify(CMM_OOM_KB) "]");
+module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
+		 "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
+module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
+		 "[Default=" __stringify(CMM_DEBUG) "]");
+
+#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
+
+#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
+
+struct cmm_page_array {
+	struct cmm_page_array *next;
+	unsigned long index;
+	unsigned long page[CMM_NR_PAGES];
+};
+
+static unsigned long loaned_pages;
+static unsigned long loaned_pages_target;
+static unsigned long oom_freed_pages;
+
+static struct cmm_page_array *cmm_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static struct task_struct *cmm_thread_ptr;
+
+/**
+ * cmm_alloc_pages - Allocate pages and mark them as loaned
+ * @nr:	number of pages to allocate
+ *
+ * Return value:
+ * 	number of pages requested to be allocated which were not
+ **/
+static long cmm_alloc_pages(long nr)
+{
+	struct cmm_page_array *pa, *npa;
+	unsigned long addr;
+	long rc;
+
+	cmm_dbg("Begin request for %ld pages\n", nr);
+
+	while (nr) {
+		addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
+				       __GFP_NORETRY | __GFP_NOMEMALLOC);
+		if (!addr)
+			break;
+		spin_lock(&cmm_lock);
+		pa = cmm_page_list;
+		if (!pa || pa->index >= CMM_NR_PAGES) {
+			/* Need a new page for the page list. */
+			spin_unlock(&cmm_lock);
+			npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
+								       __GFP_NORETRY | __GFP_NOMEMALLOC);
+			if (!npa) {
+				pr_info("%s: Can not allocate new page list\n", __FUNCTION__);
+				free_page(addr);
+				break;
+			}
+			spin_lock(&cmm_lock);
+			pa = cmm_page_list;
+
+			if (!pa || pa->index >= CMM_NR_PAGES) {
+				npa->next = pa;
+				npa->index = 0;
+				pa = npa;
+				cmm_page_list = pa;
+			} else
+				free_page((unsigned long) npa);
+		}
+
+		if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+			pr_err("%s: Can not set page to loaned. rc=%ld\n", __FUNCTION__, rc);
+			spin_unlock(&cmm_lock);
+			free_page(addr);
+			break;
+		}
+
+		pa->page[pa->index++] = addr;
+		loaned_pages++;
+		totalram_pages--;
+		spin_unlock(&cmm_lock);
+		nr--;
+	}
+
+	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+	return nr;
+}
+
+/**
+ * cmm_free_pages - Free pages and mark them as active
+ * @nr:	number of pages to free
+ *
+ * Return value:
+ * 	number of pages requested to be freed which were not
+ **/
+static long cmm_free_pages(long nr)
+{
+	struct cmm_page_array *pa;
+	unsigned long addr;
+
+	cmm_dbg("Begin free of %ld pages.\n", nr);
+	spin_lock(&cmm_lock);
+	pa = cmm_page_list;
+	while (nr) {
+		if (!pa || pa->index <= 0)
+			break;
+		addr = pa->page[--pa->index];
+
+		if (pa->index == 0) {
+			pa = pa->next;
+			free_page((unsigned long) cmm_page_list);
+			cmm_page_list = pa;
+		}
+
+		plpar_page_set_active(__pa(addr));
+		free_page(addr);
+		loaned_pages--;
+		nr--;
+		totalram_pages++;
+	}
+	spin_unlock(&cmm_lock);
+	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+	return nr;
+}
+
+/**
+ * cmm_oom_notify - OOM notifier
+ * @self:	notifier block struct
+ * @dummy:	not used
+ * @parm:	returned - number of pages freed
+ *
+ * Return value:
+ * 	NOTIFY_OK
+ **/
+static int cmm_oom_notify(struct notifier_block *self,
+			  unsigned long dummy, void *parm)
+{
+	unsigned long *freed = parm;
+	long nr = KB2PAGES(oom_kb);
+
+	cmm_dbg("OOM processing started\n");
+	nr = cmm_free_pages(nr);
+	loaned_pages_target = loaned_pages;
+	*freed += KB2PAGES(oom_kb) - nr;
+	oom_freed_pages += KB2PAGES(oom_kb) - nr;
+	cmm_dbg("OOM processing complete\n");
+	return NOTIFY_OK;
+}
+
+/**
+ * cmm_get_mpp - Read memory performance parameters
+ *
+ * Makes hcall to query the current page loan request from the hypervisor.
+ *
+ * Return value:
+ * 	nothing
+ **/
+static void cmm_get_mpp(void)
+{
+	int rc;
+	struct hvcall_mpp_data mpp_data;
+	unsigned long active_pages_target;
+	signed long page_loan_request;
+
+	rc = h_get_mpp(&mpp_data);
+
+	if (rc != H_SUCCESS)
+		return;
+
+	page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
+	loaned_pages_target = page_loan_request + loaned_pages;
+	if (loaned_pages_target > oom_freed_pages)
+		loaned_pages_target -= oom_freed_pages;
+	else
+		loaned_pages_target = 0;
+
+	active_pages_target = totalram_pages + loaned_pages - loaned_pages_target;
+
+	if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE))
+		loaned_pages_target = totalram_pages + loaned_pages -
+			((min_mem_mb * 1024 * 1024) / PAGE_SIZE);
+
+	cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
+		page_loan_request, loaned_pages, loaned_pages_target,
+		oom_freed_pages, totalram_pages);
+}
+
+static struct notifier_block cmm_oom_nb = {
+	.notifier_call = cmm_oom_notify
+};
+
+/**
+ * cmm_thread - CMM task thread
+ * @dummy:	not used
+ *
+ * Return value:
+ * 	0
+ **/
+static int cmm_thread(void *dummy)
+{
+	unsigned long timeleft;
+
+	while (1) {
+		timeleft = msleep_interruptible(delay * 1000);
+
+		if (kthread_should_stop() || timeleft) {
+			loaned_pages_target = loaned_pages;
+			break;
+		}
+
+		cmm_get_mpp();
+
+		if (loaned_pages_target > loaned_pages) {
+			if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
+				loaned_pages_target = loaned_pages;
+		} else if (loaned_pages_target < loaned_pages)
+			cmm_free_pages(loaned_pages - loaned_pages_target);
+	}
+	return 0;
+}
+
+#define CMM_SHOW(name, format, args...)			\
+	static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+	{							\
+		return sprintf(buf, format, ##args);		\
+	}							\
+	static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
+
+static ssize_t show_oom_pages(struct sys_device *dev, char *buf)
+{
+	return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
+}
+
+static ssize_t store_oom_pages(struct sys_device *dev,
+			       const char *buf, size_t count)
+{
+	unsigned long val = simple_strtoul (buf, NULL, 10);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (val != 0)
+		return -EBADMSG;
+
+	oom_freed_pages = 0;
+	return count;
+}
+
+static SYSDEV_ATTR(oom_freed_kb, S_IWUSR| S_IRUGO,
+		   show_oom_pages, store_oom_pages);
+
+static struct sysdev_attribute *cmm_attrs[] = {
+	&attr_loaned_kb,
+	&attr_loaned_target_kb,
+	&attr_oom_freed_kb,
+};
+
+static struct sysdev_class cmm_sysdev_class = {
+	.name = "cmm",
+};
+
+/**
+ * cmm_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_sysfs_register(struct sys_device *sysdev)
+{
+	int i, rc;
+
+	if ((rc = sysdev_class_register(&cmm_sysdev_class)))
+		return rc;
+
+	sysdev->id = 0;
+	sysdev->cls = &cmm_sysdev_class;
+
+	if ((rc = sysdev_register(sysdev)))
+		goto class_unregister;
+
+	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
+		if ((rc = sysdev_create_file(sysdev, cmm_attrs[i])))
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (--i >= 0)
+		sysdev_remove_file(sysdev, cmm_attrs[i]);
+	sysdev_unregister(sysdev);
+class_unregister:
+	sysdev_class_unregister(&cmm_sysdev_class);
+	return rc;
+}
+
+/**
+ * cmm_unregister_sysfs - Unregister from sysfs
+ *
+ **/
+static void cmm_unregister_sysfs(struct sys_device *sysdev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
+		sysdev_remove_file(sysdev, cmm_attrs[i]);
+	sysdev_unregister(sysdev);
+	sysdev_class_unregister(&cmm_sysdev_class);
+}
+
+/**
+ * cmm_init - Module initialization
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_init(void)
+{
+	int rc = -ENOMEM;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return -EOPNOTSUPP;
+
+	if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+		return rc;
+
+	if ((rc = cmm_sysfs_register(&cmm_sysdev)))
+		goto out_oom_notifier;
+
+	if (cmm_disabled)
+		return rc;
+
+	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+	if (IS_ERR(cmm_thread_ptr)) {
+		rc = PTR_ERR(cmm_thread_ptr);
+		goto out_unregister_sysfs;
+	}
+
+	return rc;
+
+out_unregister_sysfs:
+	cmm_unregister_sysfs(&cmm_sysdev);
+out_oom_notifier:
+	unregister_oom_notifier(&cmm_oom_nb);
+	return rc;
+}
+
+/**
+ * cmm_exit - Module exit
+ *
+ * Return value:
+ * 	nothing
+ **/
+static void cmm_exit(void)
+{
+	if (cmm_thread_ptr)
+		kthread_stop(cmm_thread_ptr);
+	unregister_oom_notifier(&cmm_oom_nb);
+	cmm_free_pages(loaned_pages);
+	cmm_unregister_sysfs(&cmm_sysdev);
+}
+
+/**
+ * cmm_set_disable - Disable/Enable CMM
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_set_disable(const char *val, struct kernel_param *kp)
+{
+	int disable = simple_strtoul(val, NULL, 10);
+
+	if (disable != 0 && disable != 1)
+		return -EINVAL;
+
+	if (disable && !cmm_disabled) {
+		if (cmm_thread_ptr)
+			kthread_stop(cmm_thread_ptr);
+		cmm_thread_ptr = NULL;
+		cmm_free_pages(loaned_pages);
+	} else if (!disable && cmm_disabled) {
+		cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+		if (IS_ERR(cmm_thread_ptr))
+			return PTR_ERR(cmm_thread_ptr);
+	}
+
+	cmm_disabled = disable;
+	return 0;
+}
+
+module_param_call(disable, cmm_set_disable, param_get_uint,
+		  &cmm_disabled, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
+		 "[Default=" __stringify(CMM_DISABLE) "]");
+
+module_init(cmm_init);
+module_exit(cmm_exit);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 5377dd4b849..a8c446697f9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -48,7 +48,7 @@
 #include "plpar_wrappers.h"
 
 
-static void tce_build_pSeries(struct iommu_table *tbl, long index,
+static int tce_build_pSeries(struct iommu_table *tbl, long index,
 			      long npages, unsigned long uaddr,
 			      enum dma_data_direction direction,
 			      struct dma_attrs *attrs)
@@ -72,6 +72,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index,
 		uaddr += TCE_PAGE_SIZE;
 		tcep++;
 	}
+	return 0;
 }
 
 
@@ -94,14 +95,19 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
 	return *tcep;
 }
 
-static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static void tce_free_pSeriesLP(struct iommu_table*, long, long);
+static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+
+static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				long npages, unsigned long uaddr,
 				enum dma_data_direction direction,
 				struct dma_attrs *attrs)
 {
-	u64 rc;
+	u64 rc = 0;
 	u64 proto_tce, tce;
 	u64 rpn;
+	int ret = 0;
+	long tcenum_start = tcenum, npages_start = npages;
 
 	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
 	proto_tce = TCE_PCI_READ;
@@ -112,6 +118,13 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
 		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
 
+		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+			ret = (int)rc;
+			tce_free_pSeriesLP(tbl, tcenum_start,
+			                   (npages_start - (npages + 1)));
+			break;
+		}
+
 		if (rc && printk_ratelimit()) {
 			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -123,25 +136,27 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcenum++;
 		rpn++;
 	}
+	return ret;
 }
 
 static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
 
-static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
 				     enum dma_data_direction direction,
 				     struct dma_attrs *attrs)
 {
-	u64 rc;
+	u64 rc = 0;
 	u64 proto_tce;
 	u64 *tcep;
 	u64 rpn;
 	long l, limit;
+	long tcenum_start = tcenum, npages_start = npages;
+	int ret = 0;
 
 	if (npages == 1) {
-		tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
-				    direction, attrs);
-		return;
+		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+		                           direction, attrs);
 	}
 
 	tcep = __get_cpu_var(tce_page);
@@ -153,9 +168,8 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
 		/* If allocation fails, fall back to the loop implementation */
 		if (!tcep) {
-			tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+			return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 					    direction, attrs);
-			return;
 		}
 		__get_cpu_var(tce_page) = tcep;
 	}
@@ -187,6 +201,13 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
+	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+		ret = (int)rc;
+		tce_freemulti_pSeriesLP(tbl, tcenum_start,
+		                        (npages_start - (npages + limit)));
+		return ret;
+	}
+
 	if (rc && printk_ratelimit()) {
 		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -194,6 +215,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		printk("\ttce[0] val = 0x%lx\n", tcep[0]);
 		show_stack(current, (unsigned long *)__get_SP());
 	}
+	return ret;
 }
 
 static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
index d8680b589dc..a437267c6bf 100644
--- a/arch/powerpc/platforms/pseries/plpar_wrappers.h
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -42,6 +42,16 @@ static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
 	return vpa_call(0x3, cpu, vpa);
 }
 
+static inline long plpar_page_set_loaned(unsigned long vpa)
+{
+	return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa, 0);
+}
+
+static inline long plpar_page_set_active(unsigned long vpa)
+{
+	return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa, 0);
+}
+
 extern void vpa_init(int cpu);
 
 static inline long plpar_pte_enter(unsigned long flags,
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 90beb444e1d..063a0d2fba3 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -314,6 +314,76 @@ static int pseries_set_xdabr(unsigned long dabr)
 			H_DABRX_KERNEL | H_DABRX_USER);
 }
 
+#define CMO_CHARACTERISTICS_TOKEN 44
+#define CMO_MAXLENGTH 1026
+
+/**
+ * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
+ * handle that here. (Stolen from parse_system_parameter_string)
+ */
+void pSeries_cmo_feature_init(void)
+{
+	char *ptr, *key, *value, *end;
+	int call_status;
+	int PrPSP = -1;
+	int SecPSP = -1;
+
+	pr_debug(" -> fw_cmo_feature_init()\n");
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				CMO_CHARACTERISTICS_TOKEN,
+				__pa(rtas_data_buf),
+				RTAS_DATA_BUF_SIZE);
+
+	if (call_status != 0) {
+		spin_unlock(&rtas_data_buf_lock);
+		pr_debug("CMO not available\n");
+		pr_debug(" <- fw_cmo_feature_init()\n");
+		return;
+	}
+
+	end = rtas_data_buf + CMO_MAXLENGTH - 2;
+	ptr = rtas_data_buf + 2;	/* step over strlen value */
+	key = value = ptr;
+
+	while (*ptr && (ptr <= end)) {
+		/* Separate the key and value by replacing '=' with '\0' and
+		 * point the value at the string after the '='
+		 */
+		if (ptr[0] == '=') {
+			ptr[0] = '\0';
+			value = ptr + 1;
+		} else if (ptr[0] == '\0' || ptr[0] == ',') {
+			/* Terminate the string containing the key/value pair */
+			ptr[0] = '\0';
+
+			if (key == value) {
+				pr_debug("Malformed key/value pair\n");
+				/* Never found a '=', end processing */
+				break;
+			}
+
+			if (0 == strcmp(key, "PrPSP"))
+				PrPSP = simple_strtol(value, NULL, 10);
+			else if (0 == strcmp(key, "SecPSP"))
+				SecPSP = simple_strtol(value, NULL, 10);
+			value = key = ptr + 1;
+		}
+		ptr++;
+	}
+
+	if (PrPSP != -1 || SecPSP != -1) {
+		pr_info("CMO enabled\n");
+		pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
+		powerpc_firmware_features |= FW_FEATURE_CMO;
+	} else
+		pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
+	spin_unlock(&rtas_data_buf_lock);
+	pr_debug(" <- fw_cmo_feature_init()\n");
+}
+
 /*
  * Early initialization.  Relocation is on but do not reference unbolted pages
  */
@@ -329,6 +399,7 @@ static void __init pSeries_init_early(void)
 	else if (firmware_has_feature(FW_FEATURE_XDABR))
 		ppc_md.set_dabr = pseries_set_xdabr;
 
+	pSeries_cmo_feature_init();
 	iommu_init_early_pSeries();
 
 	pr_debug(" <- pSeries_init_early()\n");
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index de8c8b542cf..89639ecbf38 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -147,7 +147,7 @@ static void dart_flush(struct iommu_table *tbl)
 	}
 }
 
-static void dart_build(struct iommu_table *tbl, long index,
+static int dart_build(struct iommu_table *tbl, long index,
 		       long npages, unsigned long uaddr,
 		       enum dma_data_direction direction,
 		       struct dma_attrs *attrs)
@@ -184,6 +184,7 @@ static void dart_build(struct iommu_table *tbl, long index,
 	} else {
 		dart_dirty = 1;
 	}
+	return 0;
 }
 
 
diff --git a/arch/powerpc/sysdev/qe_lib/Kconfig b/arch/powerpc/sysdev/qe_lib/Kconfig
index 4bb18f57901..1ce546462be 100644
--- a/arch/powerpc/sysdev/qe_lib/Kconfig
+++ b/arch/powerpc/sysdev/qe_lib/Kconfig
@@ -29,7 +29,7 @@ config QE_GPIO
 	bool "QE GPIO support"
 	depends on QUICC_ENGINE
 	select GENERIC_GPIO
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Say Y here if you're going to use hardware that connects to the
 	  QE GPIOs.
author	Mauro Carvalho Chehab <mchehab@infradead.org>	2008-07-27 12:25:57 -0300
committer	Mauro Carvalho Chehab <mchehab@infradead.org>	2008-07-27 12:25:57 -0300
commit	50cb993ea6cd187bfed085cb3e0747066edeb02f (patch)
tree	61edac62c6c5bc07c59e4369c50c6821ad77f2c0 /arch/powerpc
parent	445c2714cf72817ab1ad3ca894c6d9b2047b3a3e (diff)
parent	8be1a6d6c77ab4532e4476fdb8177030ef48b52c (diff)