Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/mm
9 files changed, 2452 insertions, 0 deletions
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
new file mode 100644
index 00000000000..66c354ad80c
--- /dev/null
+++ b/arch/x86_64/mm/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux x86_64-specific parts of the memory manager.
+#
+
+obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_K8_NUMA) += k8topology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
+
+hugetlbpage-y = ../../i386/mm/hugetlbpage.o
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c
new file mode 100644
index 00000000000..2d78f9fb403
--- /dev/null
+++ b/arch/x86_64/mm/extable.c
@@ -0,0 +1,35 @@
+/*
+ * linux/arch/x86_64/mm/extable.c
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/* Simple binary search */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+	       const struct exception_table_entry *last,
+	       unsigned long value)
+{
+	/* Work around a B stepping K8 bug */
+	if ((value >> 32) == 0)
+		value |= 0xffffffffUL << 32; 
+
+        while (first <= last) {
+		const struct exception_table_entry *mid;
+		long diff;
+
+		mid = (last - first) / 2 + first;
+		diff = mid->insn - value;
+                if (diff == 0)
+                        return mid;
+                else if (diff < 0)
+                        first = mid+1;
+                else
+                        last = mid-1;
+        }
+        return NULL;
+}
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
new file mode 100644
index 00000000000..5724370475c
--- /dev/null
+++ b/arch/x86_64/mm/fault.c
@@ -0,0 +1,579 @@
+/*
+ *  linux/arch/x86-64/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm/kdebug.h>
+#include <asm-generic/sections.h>
+#include <asm/kdebug.h>
+
+void bust_spinlocks(int yes)
+{
+	int loglevel_save = console_loglevel;
+	if (yes) {
+		oops_in_progress = 1;
+	} else {
+#ifdef CONFIG_VT
+		unblank_screen();
+#endif
+		oops_in_progress = 0;
+		/*
+		 * OK, the message is on the console.  Now we call printk()
+		 * without oops_in_progress set so that printk will give klogd
+		 * a poke.  Hold onto your hats...
+		 */
+		console_loglevel = 15;		/* NMI oopser may have shut the console up */
+		printk(" ");
+		console_loglevel = loglevel_save;
+	}
+}
+
+/* Sometimes the CPU reports invalid exceptions on prefetch.
+   Check that here and ignore.
+   Opcode checker based on code by Richard Brunner */
+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+				unsigned long error_code)
+{ 
+	unsigned char *instr = (unsigned char *)(regs->rip);
+	int scan_more = 1;
+	int prefetch = 0; 
+	unsigned char *max_instr = instr + 15;
+
+	/* If it was a exec fault ignore */
+	if (error_code & (1<<4))
+		return 0;
+	
+	/* Code segments in LDT could have a non zero base. Don't check
+	   when that's possible */
+	if (regs->cs & (1<<2))
+		return 0;
+
+	if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE)
+		return 0;
+
+	while (scan_more && instr < max_instr) { 
+		unsigned char opcode;
+		unsigned char instr_hi;
+		unsigned char instr_lo;
+
+		if (__get_user(opcode, instr))
+			break; 
+
+		instr_hi = opcode & 0xf0; 
+		instr_lo = opcode & 0x0f; 
+		instr++;
+
+		switch (instr_hi) { 
+		case 0x20:
+		case 0x30:
+			/* Values 0x26,0x2E,0x36,0x3E are valid x86
+			   prefixes.  In long mode, the CPU will signal
+			   invalid opcode if some of these prefixes are
+			   present so we will never get here anyway */
+			scan_more = ((instr_lo & 7) == 0x6);
+			break;
+			
+		case 0x40:
+			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
+			   Need to figure out under what instruction mode the
+			   instruction was issued ... */
+			/* Could check the LDT for lm, but for now it's good
+			   enough to assume that long mode only uses well known
+			   segments or kernel. */
+			scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
+			break;
+			
+		case 0x60:
+			/* 0x64 thru 0x67 are valid prefixes in all modes. */
+			scan_more = (instr_lo & 0xC) == 0x4;
+			break;		
+		case 0xF0:
+			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+			scan_more = !instr_lo || (instr_lo>>1) == 1;
+			break;			
+		case 0x00:
+			/* Prefetch instruction is 0x0F0D or 0x0F18 */
+			scan_more = 0;
+			if (__get_user(opcode, instr)) 
+				break;
+			prefetch = (instr_lo == 0xF) &&
+				(opcode == 0x0D || opcode == 0x18);
+			break;			
+		default:
+			scan_more = 0;
+			break;
+		} 
+	}
+	return prefetch;
+}
+
+static int bad_address(void *p) 
+{ 
+	unsigned long dummy;
+	return __get_user(dummy, (unsigned long *)p);
+} 
+
+void dump_pagetable(unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	asm("movq %%cr3,%0" : "=r" (pgd));
+
+	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
+	pgd += pgd_index(address);
+	printk("PGD %lx ", pgd_val(*pgd));
+	if (bad_address(pgd)) goto bad;
+	if (!pgd_present(*pgd)) goto ret; 
+
+	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+	if (bad_address(pud)) goto bad;
+	printk("PUD %lx ", pud_val(*pud));
+	if (!pud_present(*pud))	goto ret;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd)) goto bad;
+	printk("PMD %lx ", pmd_val(*pmd));
+	if (!pmd_present(*pmd))	goto ret;	 
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte)) goto bad;
+	printk("PTE %lx", pte_val(*pte)); 
+ret:
+	printk("\n");
+	return;
+bad:
+	printk("BAD\n");
+}
+
+static const char errata93_warning[] = 
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+   BIOS SMM functions are required to use a specific workaround
+   to avoid corruption of the 64bit RIP register on C stepping K8. 
+   A lot of BIOS that didn't get tested properly miss this. 
+   The OS sees this as a page fault with the upper 32bits of RIP cleared.
+   Try to work around it here.
+   Note we only handle faults in kernel here. */
+
+static int is_errata93(struct pt_regs *regs, unsigned long address) 
+{
+	static int warned;
+	if (address != regs->rip)
+		return 0;
+	if ((address >> 32) != 0) 
+		return 0;
+	address |= 0xffffffffUL << 32;
+	if ((address >= (u64)_stext && address <= (u64)_etext) || 
+	    (address >= MODULES_VADDR && address <= MODULES_END)) { 
+		if (!warned) {
+			printk(errata93_warning); 		
+			warned = 1;
+		}
+		regs->rip = address;
+		return 1;
+	}
+	return 0;
+} 
+
+int unhandled_signal(struct task_struct *tsk, int sig)
+{
+	if (tsk->pid == 1)
+		return 1;
+	/* Warn for strace, but not for gdb */
+	if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
+	    (tsk->ptrace & PT_PTRACED))
+		return 0;
+	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
+		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
+}
+
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+				 unsigned long error_code)
+{
+	oops_begin();
+	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+	       current->comm, address);
+	dump_pagetable(address);
+	__die("Bad pagetable", regs, error_code);
+	oops_end();
+	do_exit(SIGKILL);
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static int vmalloc_fault(unsigned long address)
+{
+	pgd_t *pgd, *pgd_ref;
+	pud_t *pud, *pud_ref;
+	pmd_t *pmd, *pmd_ref;
+	pte_t *pte, *pte_ref;
+
+	/* Copy kernel mappings over when needed. This can also
+	   happen within a race in page table update. In the later
+	   case just flush. */
+
+	pgd = pgd_offset(current->mm ?: &init_mm, address);
+	pgd_ref = pgd_offset_k(address);
+	if (pgd_none(*pgd_ref))
+		return -1;
+	if (pgd_none(*pgd))
+		set_pgd(pgd, *pgd_ref);
+
+	/* Below here mismatches are bugs because these lower tables
+	   are shared */
+
+	pud = pud_offset(pgd, address);
+	pud_ref = pud_offset(pgd_ref, address);
+	if (pud_none(*pud_ref))
+		return -1;
+	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+		BUG();
+	pmd = pmd_offset(pud, address);
+	pmd_ref = pmd_offset(pud_ref, address);
+	if (pmd_none(*pmd_ref))
+		return -1;
+	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+		BUG();
+	pte_ref = pte_offset_kernel(pmd_ref, address);
+	if (!pte_present(*pte_ref))
+		return -1;
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
+		BUG();
+	__flush_tlb_all();
+	return 0;
+}
+
+int page_fault_trace = 0;
+int exception_trace = 1;
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * error_code:
+ *	bit 0 == 0 means no page found, 1 means protection fault
+ *	bit 1 == 0 means read, 1 means write
+ *	bit 2 == 0 means kernel, 1 means user-mode
+ *      bit 3 == 1 means fault was an instruction fetch
+ */
+asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct * vma;
+	unsigned long address;
+	const struct exception_table_entry *fixup;
+	int write;
+	siginfo_t info;
+
+#ifdef CONFIG_CHECKING
+	{ 
+		unsigned long gs; 
+		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); 
+		rdmsrl(MSR_GS_BASE, gs); 
+		if (gs != (unsigned long)pda) { 
+			wrmsrl(MSR_GS_BASE, pda); 
+			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
+		}
+	}
+#endif
+
+	/* get the address */
+	__asm__("movq %%cr2,%0":"=r" (address));
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (likely(regs->eflags & X86_EFLAGS_IF))
+		local_irq_enable();
+
+	if (unlikely(page_fault_trace))
+		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
+
+	tsk = current;
+	mm = tsk->mm;
+	info.si_code = SEGV_MAPERR;
+
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 1) == 0.
+	 */
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & 5)) {
+			if (vmalloc_fault(address) < 0)
+				goto bad_area_nosemaphore;
+			return;
+		}
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+
+	if (unlikely(error_code & (1 << 3)))
+		pgtable_bad(address, regs, error_code);
+
+	/*
+	 * If we're in an interrupt or have no user
+	 * context, we must not take the fault..
+	 */
+	if (unlikely(in_atomic() || !mm))
+		goto bad_area_nosemaphore;
+
+ again:
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in the
+	 * kernel and should generate an OOPS.  Unfortunatly, in the case of an
+	 * erroneous fault occuring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space.  Luckily the kernel only validly references user
+	 * space from well defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibilty of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source.  If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if ((error_code & 4) == 0 &&
+		    !search_exception_tables(regs->rip))
+			goto bad_area_nosemaphore;
+		down_read(&mm->mmap_sem);
+	}
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (likely(vma->vm_start <= address))
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (error_code & 4) {
+		// XXX: align red zone size with ABI 
+		if (address + 128 < regs->rsp)
+			goto bad_area;
+	}
+	if (expand_stack(vma, address))
+		goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+	info.si_code = SEGV_ACCERR;
+	write = 0;
+	switch (error_code & 3) {
+		default:	/* 3: write, present */
+			/* fall through */
+		case 2:		/* write, not present */
+			if (!(vma->vm_flags & VM_WRITE))
+				goto bad_area;
+			write++;
+			break;
+		case 1:		/* read, present */
+			goto bad_area;
+		case 0:		/* read, not present */
+			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+				goto bad_area;
+	}
+
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.
+	 */
+	switch (handle_mm_fault(mm, vma, address, write)) {
+	case 1:
+		tsk->min_flt++;
+		break;
+	case 2:
+		tsk->maj_flt++;
+		break;
+	case 0:
+		goto do_sigbus;
+	default:
+		goto out_of_memory;
+	}
+
+	up_read(&mm->mmap_sem);
+	return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+	up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+
+#ifdef CONFIG_IA32_EMULATION
+	/* 32bit vsyscall. map on demand. */
+	if (test_thread_flag(TIF_IA32) &&
+	    address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
+		if (map_syscall32(mm, address) < 0)
+			goto out_of_memory2;
+		return;
+	}
+#endif
+
+	/* User mode accesses just cause a SIGSEGV */
+	if (error_code & 4) {
+		if (is_prefetch(regs, address, error_code))
+			return;
+
+		/* Work around K8 erratum #100 K8 in compat mode
+		   occasionally jumps to illegal addresses >4GB.  We
+		   catch this here in the page fault handler because
+		   these addresses are not reachable. Just detect this
+		   case and return.  Any code segment in LDT is
+		   compatibility mode. */
+		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+		    (address >> 32))
+			return;
+
+		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+			printk(
+		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+					tsk->comm, tsk->pid, address, regs->rip,
+					regs->rsp, error_code);
+		}
+       
+		tsk->thread.cr2 = address;
+		/* Kernel addresses are always protection faults */
+		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+		tsk->thread.trap_no = 14;
+		info.si_signo = SIGSEGV;
+		info.si_errno = 0;
+		/* info.si_code has been set above */
+		info.si_addr = (void __user *)address;
+		force_sig_info(SIGSEGV, &info, tsk);
+		return;
+	}
+
+no_context:
+	
+	/* Are we prepared to handle this kernel fault?  */
+	fixup = search_exception_tables(regs->rip);
+	if (fixup) {
+		regs->rip = fixup->fixup;
+		return;
+	}
+
+	/* 
+	 * Hall of shame of CPU/BIOS bugs.
+	 */
+
+ 	if (is_prefetch(regs, address, error_code))
+ 		return;
+
+	if (is_errata93(regs, address))
+		return; 
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+
+	oops_begin(); 
+
+	if (address < PAGE_SIZE)
+		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+	else
+		printk(KERN_ALERT "Unable to handle kernel paging request");
+	printk(" at %016lx RIP: \n" KERN_ALERT,address);
+	printk_address(regs->rip);
+	printk("\n");
+	dump_pagetable(address);
+	__die("Oops", regs, error_code);
+	/* Executive summary in case the body of the oops scrolled away */
+	printk(KERN_EMERG "CR2: %016lx\n", address);
+	oops_end(); 
+	do_exit(SIGKILL);
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+	up_read(&mm->mmap_sem);
+out_of_memory2:
+	if (current->pid == 1) { 
+		yield();
+		goto again;
+	}
+	printk("VM: killing process %s\n", tsk->comm);
+	if (error_code & 4)
+		do_exit(SIGKILL);
+	goto no_context;
+
+do_sigbus:
+	up_read(&mm->mmap_sem);
+
+	/* Kernel mode? Handle exceptions or die */
+	if (!(error_code & 4))
+		goto no_context;
+
+	tsk->thread.cr2 = address;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 14;
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRERR;
+	info.si_addr = (void __user *)address;
+	force_sig_info(SIGBUS, &info, tsk);
+	return;
+}
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
new file mode 100644
index 00000000000..b0d604551d8
--- /dev/null
+++ b/arch/x86_64/mm/init.c
@@ -0,0 +1,630 @@
+/*
+ *  linux/arch/x86_64/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/proc_fs.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+#ifdef CONFIG_GART_IOMMU
+extern int swiotlb;
+#endif
+
+extern char _stext[];
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+void show_mem(void)
+{
+	int i, total = 0, reserved = 0;
+	int shared = 0, cached = 0;
+	pg_data_t *pgdat;
+	struct page *page;
+
+	printk("Mem-info:\n");
+	show_free_areas();
+	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+
+	for_each_pgdat(pgdat) {
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+			page = pfn_to_page(pgdat->node_start_pfn + i);
+			total++;
+                       if (PageReserved(page))
+			reserved++;
+                       else if (PageSwapCache(page))
+			cached++;
+                       else if (page_count(page))
+                               shared += page_count(page) - 1;
+               }
+	}
+	printk("%d pages of RAM\n", total);
+	printk("%d reserved pages\n",reserved);
+	printk("%d pages shared\n",shared);
+	printk("%d pages swap cached\n",cached);
+}
+
+/* References to section boundaries */
+
+extern char _text, _etext, _edata, __bss_start, _end[];
+extern char __init_begin, __init_end;
+
+int after_bootmem;
+
+static void *spp_getpage(void)
+{ 
+	void *ptr;
+	if (after_bootmem)
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
+	else
+		ptr = alloc_bootmem_pages(PAGE_SIZE);
+	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
+		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
+
+	Dprintk("spp_getpage %p\n", ptr);
+	return ptr;
+} 
+
+static void set_pte_phys(unsigned long vaddr,
+			 unsigned long phys, pgprot_t prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+
+	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+
+	pgd = pgd_offset_k(vaddr);
+	if (pgd_none(*pgd)) {
+		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		return;
+	}
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		pmd = (pmd_t *) spp_getpage(); 
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+		if (pmd != pmd_offset(pud, 0)) {
+			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+			return;
+		}
+	}
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		pte = (pte_t *) spp_getpage();
+		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+		if (pte != pte_offset_kernel(pmd, 0)) {
+			printk("PAGETABLE BUG #02!\n");
+			return;
+		}
+	}
+	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
+
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (!pte_none(*pte) &&
+	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
+		pte_ERROR(*pte);
+	set_pte(pte, new_pte);
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
+/* NOTE: this is meant to be run only at boot */
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		printk("Invalid __set_fixmap\n");
+		return;
+	}
+	set_pte_phys(address, phys, prot);
+}
+
+unsigned long __initdata table_start, table_end; 
+
+extern pmd_t temp_boot_pmds[]; 
+
+static  struct temp_map { 
+	pmd_t *pmd;
+	void  *address; 
+	int    allocated; 
+} temp_mappings[] __initdata = { 
+	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
+	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
+	{}
+}; 
+
+static __init void *alloc_low_page(int *index, unsigned long *phys) 
+{ 
+	struct temp_map *ti;
+	int i; 
+	unsigned long pfn = table_end++, paddr; 
+	void *adr;
+
+	if (pfn >= end_pfn) 
+		panic("alloc_low_page: ran out of memory"); 
+	for (i = 0; temp_mappings[i].allocated; i++) {
+		if (!temp_mappings[i].pmd) 
+			panic("alloc_low_page: ran out of temp mappings"); 
+	} 
+	ti = &temp_mappings[i];
+	paddr = (pfn << PAGE_SHIFT) & PMD_MASK; 
+	set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); 
+	ti->allocated = 1; 
+	__flush_tlb(); 	       
+	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
+	*index = i; 
+	*phys  = pfn * PAGE_SIZE;  
+	return adr; 
+} 
+
+static __init void unmap_low_page(int i)
+{ 
+	struct temp_map *ti = &temp_mappings[i];
+	set_pmd(ti->pmd, __pmd(0));
+	ti->allocated = 0; 
+} 
+
+static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{ 
+	long i, j; 
+
+	i = pud_index(address);
+	pud = pud + i;
+	for (; i < PTRS_PER_PUD; pud++, i++) {
+		int map; 
+		unsigned long paddr, pmd_phys;
+		pmd_t *pmd;
+
+		paddr = address + i*PUD_SIZE;
+		if (paddr >= end) { 
+			for (; i < PTRS_PER_PUD; i++, pud++) 
+				set_pud(pud, __pud(0)); 
+			break;
+		} 
+
+		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 
+			set_pud(pud, __pud(0)); 
+			continue;
+		} 
+
+		pmd = alloc_low_page(&map, &pmd_phys);
+		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
+		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+			unsigned long pe;
+
+			if (paddr >= end) { 
+				for (; j < PTRS_PER_PMD; j++, pmd++)
+					set_pmd(pmd,  __pmd(0)); 
+				break;
+		}
+			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
+			pe &= __supported_pte_mask;
+			set_pmd(pmd, __pmd(pe));
+		}
+		unmap_low_page(map);
+	}
+	__flush_tlb();
+} 
+
+static void __init find_early_table_space(unsigned long end)
+{
+	unsigned long puds, pmds, tables;
+
+	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
+		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+	table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
+	if (table_start == -1UL)
+		panic("Cannot find space for the kernel page tables");
+
+	table_start >>= PAGE_SHIFT;
+	table_end = table_start;
+}
+
+/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
+   This runs before bootmem is initialized and gets pages directly from the 
+   physical memory. To access them they are temporarily mapped. */
+void __init init_memory_mapping(unsigned long start, unsigned long end)
+{ 
+	unsigned long next; 
+
+	Dprintk("init_memory_mapping\n");
+
+	/* 
+	 * Find space for the kernel direct mapping tables.
+	 * Later we should allocate these tables in the local node of the memory
+	 * mapped.  Unfortunately this is done currently before the nodes are 
+	 * discovered.
+	 */
+	find_early_table_space(end);
+
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	for (; start < end; start = next) {
+		int map;
+		unsigned long pud_phys; 
+		pud_t *pud = alloc_low_page(&map, &pud_phys);
+		next = start + PGDIR_SIZE;
+		if (next > end) 
+			next = end; 
+		phys_pud_init(pud, __pa(start), __pa(next));
+		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+		unmap_low_page(map);   
+	} 
+
+	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+	__flush_tlb_all();
+	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
+	       table_start<<PAGE_SHIFT, 
+	       table_end<<PAGE_SHIFT);
+}
+
+extern struct x8664_pda cpu_pda[NR_CPUS];
+
+/* Assumes all CPUs still execute in init_mm */
+void zap_low_mappings(void)
+{
+	pgd_t *pgd = pgd_offset_k(0UL);
+	pgd_clear(pgd);
+	flush_tlb_all();
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+void __init paging_init(void)
+{
+	{
+		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+		unsigned int max_dma;
+
+		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+		if (end_pfn < max_dma)
+			zones_size[ZONE_DMA] = end_pfn;
+		else {
+			zones_size[ZONE_DMA] = max_dma;
+			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
+		}
+		free_area_init(zones_size);
+	}
+	return;
+}
+#endif
+
+/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
+   from the CPU leading to inconsistent cache lines. address and size
+   must be aligned to 2MB boundaries. 
+   Does nothing when the mapping doesn't exist. */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+{
+	unsigned long end = address + size;
+
+	BUG_ON(address & ~LARGE_PAGE_MASK);
+	BUG_ON(size & ~LARGE_PAGE_MASK); 
+	
+	for (; address < end; address += LARGE_PAGE_SIZE) { 
+		pgd_t *pgd = pgd_offset_k(address);
+		pud_t *pud;
+		pmd_t *pmd;
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, address);
+		if (pud_none(*pud))
+			continue; 
+		pmd = pmd_offset(pud, address);
+		if (!pmd || pmd_none(*pmd))
+			continue; 
+		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
+			/* Could handle this, but it should not happen currently. */
+			printk(KERN_ERR 
+	       "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
+			pmd_ERROR(*pmd); 
+		}
+		set_pmd(pmd, __pmd(0)); 		
+	}
+	__flush_tlb_all();
+} 
+
+static inline int page_is_ram (unsigned long pagenr)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		unsigned long addr, end;
+
+		if (e820.map[i].type != E820_RAM)	/* not usable memory */
+			continue;
+		/*
+		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
+		 *	are not. Notably the 640->1Mb area. We need a sanity
+		 *	check here.
+		 */
+		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
+		if  ((pagenr >= addr) && (pagenr < end))
+			return 1;
+	}
+	return 0;
+}
+
+extern int swiotlb_force;
+
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
+			 kcore_vsyscall;
+
+void __init mem_init(void)
+{
+	int codesize, reservedpages, datasize, initsize;
+	int tmp;
+
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb_force)
+		swiotlb = 1;
+	if (!iommu_aperture &&
+	    (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
+	       swiotlb = 1;
+	if (swiotlb)
+		swiotlb_init();	
+#endif
+
+	/* How many end-of-memory variables you have, grandma! */
+	max_low_pfn = end_pfn;
+	max_pfn = end_pfn;
+	num_physpages = end_pfn;
+	high_memory = (void *) __va(end_pfn * PAGE_SIZE);
+
+	/* clear the zero-page */
+	memset(empty_zero_page, 0, PAGE_SIZE);
+
+	reservedpages = 0;
+
+	/* this will put all low memory onto the freelists */
+#ifdef CONFIG_DISCONTIGMEM
+	totalram_pages += numa_free_all_bootmem();
+	tmp = 0;
+	/* should count reserved pages here for all nodes */ 
+#else
+	max_mapnr = end_pfn;
+	if (!mem_map) BUG();
+
+	totalram_pages += free_all_bootmem();
+
+	for (tmp = 0; tmp < end_pfn; tmp++)
+		/*
+		 * Only count reserved RAM pages
+		 */
+		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+			reservedpages++;
+#endif
+
+	after_bootmem = 1;
+
+	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+	/* Register memory areas for /proc/kcore */
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+		   VMALLOC_END-VMALLOC_START);
+	kclist_add(&kcore_kernel, &_stext, _end - _stext);
+	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
+				 VSYSCALL_END - VSYSCALL_START);
+
+	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
+		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+		end_pfn << (PAGE_SHIFT-10),
+		codesize >> 10,
+		reservedpages << (PAGE_SHIFT-10),
+		datasize >> 10,
+		initsize >> 10);
+
+	/*
+	 * Subtle. SMP is doing its boot stuff late (because it has to
+	 * fork idle threads) - but it also needs low mappings for the
+	 * protected-mode entry to work. We zap these entries only after
+	 * the WP-bit has been tested.
+	 */
+#ifndef CONFIG_SMP
+	zap_low_mappings();
+#endif
+}
+
+extern char __initdata_begin[], __initdata_end[];
+
+void free_initmem(void)
+{
+	unsigned long addr;
+
+	addr = (unsigned long)(&__init_begin);
+	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(addr));
+		set_page_count(virt_to_page(addr), 1);
+		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
+		free_page(addr);
+		totalram_pages++;
+	}
+	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
+	printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	if (start < (unsigned long)&_end)
+		return;
+	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		set_page_count(virt_to_page(start), 1);
+		free_page(start);
+		totalram_pages++;
+	}
+}
+#endif
+
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
+{ 
+	/* Should check here against the e820 map to avoid double free */ 
+#ifdef CONFIG_DISCONTIGMEM
+	int nid = phys_to_nid(phys);
+  	reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else       		
+	reserve_bootmem(phys, len);    
+#endif
+}
+
+int kern_addr_valid(unsigned long addr) 
+{ 
+	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+	if (above != 0 && above != -1UL)
+		return 0; 
+	
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return 0; 
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return 0;
+	if (pmd_large(*pmd))
+		return pfn_valid(pmd_pfn(*pmd));
+
+	pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte))
+		return 0;
+	return pfn_valid(pte_pfn(*pte));
+}
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+
+extern int exception_trace, page_fault_trace;
+
+static ctl_table debug_table2[] = {
+	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
+	  proc_dointvec },
+#ifdef CONFIG_CHECKING
+	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
+	  proc_dointvec },
+#endif
+	{ 0, }
+}; 
+
+static ctl_table debug_root_table2[] = { 
+	{ .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
+	   .child = debug_table2 }, 
+	{ 0 }, 
+}; 
+
+static __init int x8664_sysctl_init(void)
+{ 
+	register_sysctl_table(debug_root_table2, 1);
+	return 0;
+}
+__initcall(x8664_sysctl_init);
+#endif
+
+/* Pseudo VMAs to allow ptrace access for the vsyscall pages.  x86-64 has two
+   different ones: one for 32bit and one for 64bit. Use the appropiate
+   for the target task. */
+
+static struct vm_area_struct gate_vma = {
+	.vm_start = VSYSCALL_START,
+	.vm_end = VSYSCALL_END,
+	.vm_page_prot = PAGE_READONLY
+};
+
+static struct vm_area_struct gate32_vma = {
+	.vm_start = VSYSCALL32_BASE,
+	.vm_end = VSYSCALL32_END,
+	.vm_page_prot = PAGE_READONLY
+};
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+		/* lookup code assumes the pages are present. set them up
+		   now */
+		if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0)
+			return NULL;
+		return &gate32_vma;
+	}
+#endif
+	return &gate_vma;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+	struct vm_area_struct *vma = get_gate_vma(task);
+	return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/* Use this when you have no reliable task/vma, typically from interrupt
+ * context.  It is less reliable than using the task's vma and may give
+ * false positives.
+ */
+int in_gate_area_no_task(unsigned long addr)
+{
+	return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) ||
+		((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END)));
+}
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
new file mode 100644
index 00000000000..74ec8554b19
--- /dev/null
+++ b/arch/x86_64/mm/ioremap.c
@@ -0,0 +1,283 @@
+/*
+ * arch/x86_64/mm/ioremap.c
+ *
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+
+#define ISA_START_ADDRESS      0xa0000
+#define ISA_END_ADDRESS                0x100000
+
+static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
+	unsigned long phys_addr, unsigned long flags)
+{
+	unsigned long end;
+	unsigned long pfn;
+
+	address &= ~PMD_MASK;
+	end = address + size;
+	if (end > PMD_SIZE)
+		end = PMD_SIZE;
+	if (address >= end)
+		BUG();
+	pfn = phys_addr >> PAGE_SHIFT;
+	do {
+		if (!pte_none(*pte)) {
+			printk("remap_area_pte: page already exists\n");
+			BUG();
+		}
+		set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | 
+					_PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
+		address += PAGE_SIZE;
+		pfn++;
+		pte++;
+	} while (address && (address < end));
+}
+
+static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
+	unsigned long phys_addr, unsigned long flags)
+{
+	unsigned long end;
+
+	address &= ~PUD_MASK;
+	end = address + size;
+	if (end > PUD_SIZE)
+		end = PUD_SIZE;
+	phys_addr -= address;
+	if (address >= end)
+		BUG();
+	do {
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		if (!pte)
+			return -ENOMEM;
+		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
+		address = (address + PMD_SIZE) & PMD_MASK;
+		pmd++;
+	} while (address && (address < end));
+	return 0;
+}
+
+static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size,
+	unsigned long phys_addr, unsigned long flags)
+{
+	unsigned long end;
+
+	address &= ~PGDIR_MASK;
+	end = address + size;
+	if (end > PGDIR_SIZE)
+		end = PGDIR_SIZE;
+	phys_addr -= address;
+	if (address >= end)
+		BUG();
+	do {
+		pmd_t * pmd = pmd_alloc(&init_mm, pud, address);
+		if (!pmd)
+			return -ENOMEM;
+		remap_area_pmd(pmd, address, end - address, address + phys_addr, flags);
+		address = (address + PUD_SIZE) & PUD_MASK;
+		pud++;
+	} while (address && (address < end));
+	return 0;
+}
+
+static int remap_area_pages(unsigned long address, unsigned long phys_addr,
+				 unsigned long size, unsigned long flags)
+{
+	int error;
+	pgd_t *pgd;
+	unsigned long end = address + size;
+
+	phys_addr -= address;
+	pgd = pgd_offset_k(address);
+	flush_cache_all();
+	if (address >= end)
+		BUG();
+	spin_lock(&init_mm.page_table_lock);
+	do {
+		pud_t *pud;
+		pud = pud_alloc(&init_mm, pgd, address);
+		error = -ENOMEM;
+		if (!pud)
+			break;
+		if (remap_area_pud(pud, address, end - address,
+					 phys_addr + address, flags))
+			break;
+		error = 0;
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		pgd++;
+	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
+	flush_tlb_all();
+	return error;
+}
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int
+ioremap_change_attr(unsigned long phys_addr, unsigned long size,
+					unsigned long flags)
+{
+	int err = 0;
+	if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
+		unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long vaddr = (unsigned long) __va(phys_addr);
+
+		/*
+ 		 * Must use a address here and not struct page because the phys addr
+		 * can be a in hole between nodes and not have an memmap entry.
+		 */
+		err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
+		if (!err)
+			global_flush_tlb();
+	}
+	return err;
+}
+
+/*
+ * Generic mapping function
+ */
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+{
+	void * addr;
+	struct vm_struct * area;
+	unsigned long offset, last_addr;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
+		return NULL;
+
+	/*
+	 * Don't remap the low PCI/ISA area, it's always mapped..
+	 */
+	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+		return (__force void __iomem *)phys_to_virt(phys_addr);
+
+#ifndef CONFIG_DISCONTIGMEM
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using..
+	 */
+	if (last_addr < virt_to_phys(high_memory)) {
+		char *t_addr, *t_end;
+ 		struct page *page;
+
+		t_addr = __va(phys_addr);
+		t_end = t_addr + (size - 1);
+	   
+		for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
+			if(!PageReserved(page))
+				return NULL;
+	}
+#endif
+
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+	/*
+	 * Ok, go for it..
+	 */
+	area = get_vm_area(size, VM_IOREMAP | (flags << 20));
+	if (!area)
+		return NULL;
+	area->phys_addr = phys_addr;
+	addr = area->addr;
+	if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) {
+		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+		return NULL;
+	}
+	if (ioremap_change_attr(phys_addr, size, flags) < 0) {
+		area->flags &= 0xffffff;
+		vunmap(addr);
+		return NULL;
+	}
+	return (__force void __iomem *) (offset + (char *)addr);
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @offset:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address. 
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many 
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ * 
+ * Must be freed with iounmap.
+ */
+
+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, _PAGE_PCD);
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+	struct vm_struct *p, **pprev;
+
+	if (addr <= high_memory) 
+		return; 
+	if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+		addr < phys_to_virt(ISA_END_ADDRESS))
+		return;
+
+	write_lock(&vmlist_lock);
+	for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev)
+		if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr))
+			break;
+	if (!p) { 
+		printk("__iounmap: bad address %p\n", addr);
+		goto out_unlock;
+	}
+	*pprev = p->next;
+	unmap_vm_area(p);
+	if ((p->flags >> 20) &&
+		p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
+		/* p->size includes the guard page, but cpa doesn't like that */
+		change_page_attr(virt_to_page(__va(p->phys_addr)),
+				 p->size >> PAGE_SHIFT,
+				 PAGE_KERNEL);
+		global_flush_tlb();
+	} 
+out_unlock:
+	write_unlock(&vmlist_lock);
+	kfree(p); 
+}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
new file mode 100644
index 00000000000..ec35747aacd
--- /dev/null
+++ b/arch/x86_64/mm/k8topology.c
@@ -0,0 +1,168 @@
+/* 
+ * AMD K8 NUMA support.
+ * Discover the memory map and associated nodes.
+ * 
+ * This version reads it directly from the K8 northbridge.
+ * 
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+
+static __init int find_northbridge(void)
+{
+	int num; 
+
+	for (num = 0; num < 32; num++) { 
+		u32 header;
+		
+		header = read_pci_config(0, num, 0, 0x00);  
+		if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
+			continue; 	
+
+		header = read_pci_config(0, num, 1, 0x00); 
+		if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
+			continue;	
+		return num; 
+	} 
+
+	return -1; 	
+}
+
+int __init k8_scan_nodes(unsigned long start, unsigned long end)
+{ 
+	unsigned long prevbase;
+	struct node nodes[8];
+	int nodeid, i, nb; 
+	int found = 0;
+	u32 reg;
+	unsigned numnodes;
+	nodemask_t nodes_parsed;
+
+	nodes_clear(nodes_parsed);
+
+	nb = find_northbridge(); 
+	if (nb < 0) 
+		return nb;
+
+	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 
+
+	reg = read_pci_config(0, nb, 0, 0x60); 
+	numnodes = ((reg >> 4) & 0xF) + 1;
+
+	printk(KERN_INFO "Number of nodes %d\n", numnodes);
+
+	memset(&nodes,0,sizeof(nodes)); 
+	prevbase = 0;
+	for (i = 0; i < 8; i++) { 
+		unsigned long base,limit; 
+
+		base = read_pci_config(0, nb, 1, 0x40 + i*8);
+		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+		nodeid = limit & 7; 
+		if ((base & 3) == 0) { 
+			if (i < numnodes)
+				printk("Skipping disabled node %d\n", i); 
+			continue;
+		} 
+		if (nodeid >= numnodes) {
+			printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+			       base, limit); 
+			continue;
+		} 
+
+		if (!limit) { 
+			printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
+			       base);
+			continue;
+		}
+		if ((base >> 8) & 3 || (limit >> 8) & 3) {
+			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 
+			       nodeid, (base>>8)&3, (limit>>8) & 3); 
+			return -1; 
+		}	
+		if (node_isset(nodeid, nodes_parsed)) { 
+			printk(KERN_INFO "Node %d already present. Skipping\n", 
+			       nodeid);
+			continue;
+		}
+
+		limit >>= 16; 
+		limit <<= 24; 
+		limit |= (1<<24)-1;
+
+		if (limit > end_pfn << PAGE_SHIFT)
+			limit = end_pfn << PAGE_SHIFT;
+		if (limit <= base)
+			continue; 
+			
+		base >>= 16;
+		base <<= 24; 
+
+		if (base < start) 
+			base = start; 
+		if (limit > end) 
+			limit = end; 
+		if (limit == base) { 
+			printk(KERN_ERR "Empty node %d\n", nodeid); 
+			continue; 
+		}
+		if (limit < base) { 
+			printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+			       nodeid, base, limit); 			       
+			continue;
+		} 
+		
+		/* Could sort here, but pun for now. Should not happen anyroads. */
+		if (prevbase > base) { 
+			printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+			       prevbase,base);
+			return -1;
+		}
+			
+		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 
+		       nodeid, base, limit); 
+		
+		found++;
+		
+		nodes[nodeid].start = base; 
+		nodes[nodeid].end = limit;
+
+		prevbase = base;
+
+		node_set(nodeid, nodes_parsed);
+	} 
+
+	if (!found)
+		return -1; 
+
+	memnode_shift = compute_hash_shift(nodes, numnodes);
+	if (memnode_shift < 0) { 
+		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
+		return -1; 
+	} 
+	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 
+
+	for (i = 0; i < 8; i++) {
+		if (nodes[i].start != nodes[i].end) { 
+			/* assume 1:1 NODE:CPU */
+			cpu_to_node[i] = i; 
+			setup_node_bootmem(i, nodes[i].start, nodes[i].end); 
+		} 
+	}
+
+	numa_init_array();
+	return 0;
+} 
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
new file mode 100644
index 00000000000..fd9f25d7a6c
--- /dev/null
+++ b/arch/x86_64/mm/numa.c
@@ -0,0 +1,294 @@
+/* 
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */ 
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+struct pglist_data *node_data[MAX_NUMNODES];
+bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+
+int memnode_shift;
+u8  memnodemap[NODEMAPSIZE];
+
+unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
+cpumask_t     node_to_cpumask[MAX_NUMNODES];
+
+int numa_off __initdata;
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+	int i; 
+	int shift = 24;
+	u64 addr;
+	
+	/* When in doubt use brute force. */
+	while (shift < 48) { 
+		memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); 
+		for (i = 0; i < numnodes; i++) {
+			if (nodes[i].start == nodes[i].end) 
+				continue;
+			for (addr = nodes[i].start; 
+			     addr < nodes[i].end; 
+			     addr += (1UL << shift)) {
+				if (memnodemap[addr >> shift] != 0xff && 
+				    memnodemap[addr >> shift] != i) { 
+					printk(KERN_INFO 
+					    "node %d shift %d addr %Lx conflict %d\n", 
+					       i, shift, addr, memnodemap[addr>>shift]);
+					goto next; 
+				} 
+				memnodemap[addr >> shift] = i; 
+			} 
+		} 
+		return shift; 
+	next:
+		shift++; 
+	} 
+	memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); 
+	return -1; 
+}
+
+/* Initialize bootmem allocator for a node */
+void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
+{ 
+	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
+	unsigned long nodedata_phys;
+	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+
+	start = round_up(start, ZONE_ALIGN); 
+
+	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
+	nodedata_phys = find_e820_area(start, end, pgdat_size); 
+	if (nodedata_phys == -1L) 
+		panic("Cannot find memory pgdat in node %d\n", nodeid);
+
+	Dprintk("nodedata_phys %lx\n", nodedata_phys); 
+
+	node_data[nodeid] = phys_to_virt(nodedata_phys);
+	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
+	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
+	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+	/* Find a place for the bootmem map */
+	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
+	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
+	if (bootmap_start == -1L) 
+		panic("Not enough continuous space for bootmap on node %d", nodeid); 
+	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
+	
+	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+					 bootmap_start >> PAGE_SHIFT, 
+					 start_pfn, end_pfn); 
+
+	e820_bootmem_free(NODE_DATA(nodeid), start, end);
+
+	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
+	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+	node_set_online(nodeid);
+} 
+
+/* Initialize final allocator for a zone */
+void __init setup_node_zones(int nodeid)
+{ 
+	unsigned long start_pfn, end_pfn; 
+	unsigned long zones[MAX_NR_ZONES];
+	unsigned long dma_end_pfn;
+
+	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
+
+	start_pfn = node_start_pfn(nodeid);
+	end_pfn = node_end_pfn(nodeid);
+
+	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
+	
+	/* All nodes > 0 have a zero length zone DMA */ 
+	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
+	if (start_pfn < dma_end_pfn) { 
+		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
+		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
+	} else { 
+		zones[ZONE_NORMAL] = end_pfn - start_pfn; 
+	} 
+    
+	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
+			    start_pfn, NULL); 
+} 
+
+void __init numa_init_array(void)
+{
+	int rr, i;
+	/* There are unfortunately some poorly designed mainboards around
+	   that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+	   mapping. To avoid this fill in the mapping for all possible
+	   CPUs, as the number of CPUs is not known yet. 
+	   We round robin the existing nodes. */
+	rr = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_to_node[i] != NUMA_NO_NODE)
+			continue;
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+		cpu_to_node[i] = rr;
+		rr++; 
+	}
+
+	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
+}
+
+#ifdef CONFIG_NUMA_EMU
+int numa_fake __initdata = 0;
+
+/* Numa emulation */
+static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+ 	int i;
+ 	struct node nodes[MAX_NUMNODES];
+ 	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+
+ 	/* Kludge needed for the hash function */
+ 	if (hweight64(sz) > 1) {
+ 		unsigned long x = 1;
+ 		while ((x << 1) < sz)
+ 			x <<= 1;
+ 		if (x < sz/2)
+ 			printk("Numa emulation unbalanced. Complain to maintainer\n");
+ 		sz = x;
+ 	}
+
+ 	memset(&nodes,0,sizeof(nodes));
+ 	for (i = 0; i < numa_fake; i++) {
+ 		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+ 		if (i == numa_fake-1)
+ 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
+ 		nodes[i].end = nodes[i].start + sz;
+ 		if (i != numa_fake-1)
+ 			nodes[i].end--;
+ 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
+ 		       i,
+ 		       nodes[i].start, nodes[i].end,
+ 		       (nodes[i].end - nodes[i].start) >> 20);
+		node_set_online(i);
+ 	}
+ 	memnode_shift = compute_hash_shift(nodes, numa_fake);
+ 	if (memnode_shift < 0) {
+ 		memnode_shift = 0;
+ 		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+ 		return -1;
+ 	}
+ 	for_each_online_node(i)
+ 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ 	numa_init_array();
+ 	return 0;
+}
+#endif
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{ 
+	int i;
+
+#ifdef CONFIG_NUMA_EMU
+	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+ 		return;
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+					  end_pfn << PAGE_SHIFT))
+ 		return;
+#endif
+
+#ifdef CONFIG_K8_NUMA
+	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+		return;
+#endif
+	printk(KERN_INFO "%s\n",
+	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+	printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+	       start_pfn << PAGE_SHIFT,
+	       end_pfn << PAGE_SHIFT); 
+		/* setup dummy node covering all memory */ 
+	memnode_shift = 63; 
+	memnodemap[0] = 0;
+	nodes_clear(node_online_map);
+	node_set_online(0);
+	for (i = 0; i < NR_CPUS; i++)
+		cpu_to_node[i] = 0;
+	node_to_cpumask[0] = cpumask_of_cpu(0);
+	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+__init void numa_add_cpu(int cpu)
+{
+	/* BP is initialized elsewhere */
+	if (cpu) 
+		set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+} 
+
+unsigned long __init numa_free_all_bootmem(void) 
+{ 
+	int i;
+	unsigned long pages = 0;
+	for_each_online_node(i) {
+		pages += free_all_bootmem_node(NODE_DATA(i));
+	}
+	return pages;
+} 
+
+void __init paging_init(void)
+{ 
+	int i;
+	for_each_online_node(i) {
+		setup_node_zones(i); 
+	}
+} 
+
+/* [numa=off] */
+__init int numa_setup(char *opt) 
+{ 
+	if (!strncmp(opt,"off",3))
+		numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+	if(!strncmp(opt, "fake=", 5)) {
+		numa_fake = simple_strtoul(opt+5,NULL,0); ;
+		if (numa_fake >= MAX_NUMNODES)
+			numa_fake = MAX_NUMNODES;
+	}
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ 	if (!strncmp(opt,"noacpi",6))
+ 		acpi_numa = -1;
+#endif
+	return 1;
+} 
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode_shift);
+EXPORT_SYMBOL(memnodemap);
+EXPORT_SYMBOL(node_data);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
new file mode 100644
index 00000000000..94862e1ec03
--- /dev/null
+++ b/arch/x86_64/mm/pageattr.c
@@ -0,0 +1,235 @@
+/* 
+ * Copyright 2002 Andi Kleen, SuSE Labs. 
+ * Thanks to Ben LaHaise for precious feedback.
+ */ 
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+static inline pte_t *lookup_address(unsigned long address) 
+{ 
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	if (pgd_none(*pgd))
+		return NULL;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return NULL; 
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return NULL; 
+	if (pmd_large(*pmd))
+		return (pte_t *)pmd;
+	pte = pte_offset_kernel(pmd, address);
+	if (pte && !pte_present(*pte))
+		pte = NULL; 
+	return pte;
+} 
+
+static struct page *split_large_page(unsigned long address, pgprot_t prot,
+				     pgprot_t ref_prot)
+{ 
+	int i; 
+	unsigned long addr;
+	struct page *base = alloc_pages(GFP_KERNEL, 0);
+	pte_t *pbase;
+	if (!base) 
+		return NULL;
+	address = __pa(address);
+	addr = address & LARGE_PAGE_MASK; 
+	pbase = (pte_t *)page_address(base);
+	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+		pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
+				   addr == address ? prot : ref_prot);
+	}
+	return base;
+} 
+
+
+static void flush_kernel_map(void *address) 
+{
+	if (0 && address && cpu_has_clflush) {
+		/* is this worth it? */ 
+		int i;
+		for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
+			asm volatile("clflush (%0)" :: "r" (address + i)); 
+	} else
+		asm volatile("wbinvd":::"memory"); 
+	if (address)
+		__flush_tlb_one(address);
+	else
+		__flush_tlb_all();
+}
+
+
+static inline void flush_map(unsigned long address)
+{	
+	on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
+}
+
+struct deferred_page { 
+	struct deferred_page *next; 
+	struct page *fpage;
+	unsigned long address;
+}; 
+static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
+
+static inline void save_page(unsigned long address, struct page *fpage)
+{
+	struct deferred_page *df;
+	df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); 
+	if (!df) {
+		flush_map(address);
+		__free_page(fpage);
+	} else { 
+		df->next = df_list;
+		df->fpage = fpage;
+		df->address = address;
+		df_list = df;
+	} 			
+}
+
+/* 
+ * No more special protections in this 2/4MB area - revert to a
+ * large page again. 
+ */
+static void revert_page(unsigned long address, pgprot_t ref_prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t large_pte;
+
+	pgd = pgd_offset_k(address);
+	BUG_ON(pgd_none(*pgd));
+	pud = pud_offset(pgd,address);
+	BUG_ON(pud_none(*pud));
+	pmd = pmd_offset(pud, address);
+	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
+	pgprot_val(ref_prot) |= _PAGE_PSE;
+	large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
+	set_pte((pte_t *)pmd, large_pte);
+}      
+
+static int
+__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
+				   pgprot_t ref_prot)
+{ 
+	pte_t *kpte; 
+	struct page *kpte_page;
+	unsigned kpte_flags;
+	kpte = lookup_address(address);
+	if (!kpte) return 0;
+	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	kpte_flags = pte_val(*kpte); 
+	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
+		if ((kpte_flags & _PAGE_PSE) == 0) { 
+			set_pte(kpte, pfn_pte(pfn, prot));
+		} else {
+ 			/*
+ 			 * split_large_page will take the reference for this change_page_attr
+ 			 * on the split page.
+ 			 */
+			struct page *split = split_large_page(address, prot, ref_prot); 
+			if (!split)
+				return -ENOMEM;
+			set_pte(kpte,mk_pte(split, ref_prot));
+			kpte_page = split;
+		}	
+		get_page(kpte_page);
+	} else if ((kpte_flags & _PAGE_PSE) == 0) { 
+		set_pte(kpte, pfn_pte(pfn, ref_prot));
+		__put_page(kpte_page);
+	} else
+		BUG();
+
+	/* on x86-64 the direct mapping set at boot is not using 4k pages */
+ 	BUG_ON(PageReserved(kpte_page));
+
+	switch (page_count(kpte_page)) {
+ 	case 1:
+		save_page(address, kpte_page); 		     
+		revert_page(address, ref_prot);
+		break;
+ 	case 0:
+ 		BUG(); /* memleak and failed 2M page regeneration */
+ 	}
+	return 0;
+} 
+
+/*
+ * Change the page attributes of an page in the linear mapping.
+ *
+ * This should be used when a page is mapped with a different caching policy
+ * than write-back somewhere - some CPUs do not like it when mappings with
+ * different caching policies exist. This changes the page attributes of the
+ * in kernel linear mapping too.
+ * 
+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
+ * This function only deals with the kernel linear map.
+ * 
+ * Caller must call global_flush_tlb() after this.
+ */
+int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
+{
+	int err = 0; 
+	int i; 
+
+	down_write(&init_mm.mmap_sem);
+	for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
+		unsigned long pfn = __pa(address) >> PAGE_SHIFT;
+
+		err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
+		if (err) 
+			break; 
+		/* Handle kernel mapping too which aliases part of the
+		 * lowmem */
+		if (__pa(address) < KERNEL_TEXT_SIZE) {
+			unsigned long addr2;
+			pgprot_t prot2 = prot;
+			addr2 = __START_KERNEL_map + __pa(address);
+ 			pgprot_val(prot2) &= ~_PAGE_NX;
+			err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
+		} 
+	} 	
+	up_write(&init_mm.mmap_sem); 
+	return err;
+}
+
+/* Don't call this for MMIO areas that may not have a mem_map entry */
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+	return change_page_attr_addr(addr, numpages, prot);
+}
+
+void global_flush_tlb(void)
+{ 
+	struct deferred_page *df, *next_df;
+
+	down_read(&init_mm.mmap_sem);
+	df = xchg(&df_list, NULL);
+	up_read(&init_mm.mmap_sem);
+	if (!df)
+		return;
+	flush_map((df && !df->next) ? df->address : 0);
+	for (; df; df = next_df) { 
+		next_df = df->next;
+		if (df->fpage) 
+			__free_page(df->fpage);
+		kfree(df);
+	} 
+} 
+
+EXPORT_SYMBOL(change_page_attr);
+EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
new file mode 100644
index 00000000000..5d01b31472e
--- /dev/null
+++ b/arch/x86_64/mm/srat.c
@@ -0,0 +1,217 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+
+static struct acpi_table_slit *acpi_slit;
+
+static nodemask_t nodes_parsed __initdata;
+static nodemask_t nodes_found __initdata;
+static struct node nodes[MAX_NUMNODES] __initdata;
+static __u8  pxm2node[256] = { [0 ... 255] = 0xff };
+
+static __init int setup_node(int pxm)
+{
+	unsigned node = pxm2node[pxm];
+	if (node == 0xff) {
+		if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+			return -1;
+		node = first_unset_node(nodes_found); 
+		node_set(node, nodes_found);
+		pxm2node[pxm] = node;
+	}
+	return pxm2node[pxm];
+}
+
+static __init int conflicting_nodes(unsigned long start, unsigned long end)
+{
+	int i;
+	for_each_online_node(i) {
+		struct node *nd = &nodes[i];
+		if (nd->start == nd->end)
+			continue;
+		if (nd->end > start && nd->start < end)
+			return 1;
+		if (nd->end == end && nd->start == start)
+			return 1;
+	}
+	return -1;
+}
+
+static __init void cutoff_node(int i, unsigned long start, unsigned long end)
+{
+	struct node *nd = &nodes[i];
+	if (nd->start < start) {
+		nd->start = start;
+		if (nd->end < nd->start)
+			nd->start = nd->end;
+	}
+	if (nd->end > end) {
+		if (!(end & 0xfff))
+			end--;
+		nd->end = end;
+		if (nd->start > nd->end)
+			nd->start = nd->end;
+	}
+}
+
+static __init void bad_srat(void)
+{
+	printk(KERN_ERR "SRAT: SRAT not used.\n");
+	acpi_numa = -1;
+}
+
+static __init inline int srat_disabled(void)
+{
+	return numa_off || acpi_numa < 0;
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+	acpi_slit = slit;
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+{
+	int pxm, node;
+	if (srat_disabled() || pa->flags.enabled == 0)
+		return;
+	pxm = pa->proximity_domain;
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+		bad_srat();
+		return;
+	}
+	if (pa->apic_id >= NR_CPUS) {
+		printk(KERN_ERR "SRAT: lapic %u too large.\n",
+		       pa->apic_id);
+		bad_srat();
+		return;
+	}
+	cpu_to_node[pa->apic_id] = node;
+	acpi_numa = 1;
+	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+	       pxm, pa->apic_id, node);
+}
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+{
+	struct node *nd;
+	unsigned long start, end;
+	int node, pxm;
+	int i;
+
+	if (srat_disabled() || ma->flags.enabled == 0)
+		return;
+	/* hotplug bit is ignored for now */
+	pxm = ma->proximity_domain;
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+		bad_srat();
+		return;
+	}
+	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
+	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
+	i = conflicting_nodes(start, end);
+	if (i >= 0) {
+		printk(KERN_ERR
+		       "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n",
+		       pxm, start, end, i, nodes[i].start, nodes[i].end);
+		bad_srat();
+		return;
+	}
+	nd = &nodes[node];
+	if (!node_test_and_set(node, nodes_parsed)) {
+		nd->start = start;
+		nd->end = end;
+	} else {
+		if (start < nd->start)
+			nd->start = start;
+		if (nd->end < end)
+			nd->end = end;
+	}
+	if (!(nd->end & 0xfff))
+		nd->end--;
+	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
+	       nd->start, nd->end);
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+/* Use the information discovered above to actually set up the nodes. */
+int __init acpi_scan_nodes(unsigned long start, unsigned long end)
+{
+	int i;
+	if (acpi_numa <= 0)
+		return -1;
+	memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
+	if (memnode_shift < 0) {
+		printk(KERN_ERR
+		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
+		bad_srat();
+		return -1;
+	}
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (!node_isset(i, nodes_parsed))
+			continue;
+		cutoff_node(i, start, end);
+		if (nodes[i].start == nodes[i].end) { 
+			node_clear(i, nodes_parsed);
+			continue;
+		}
+		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	}
+	for (i = 0; i < NR_CPUS; i++) { 
+		if (cpu_to_node[i] == NUMA_NO_NODE)
+			continue;
+		if (!node_isset(cpu_to_node[i], nodes_parsed))
+			cpu_to_node[i] = NUMA_NO_NODE; 
+	}
+	numa_init_array();
+	return 0;
+}
+
+int node_to_pxm(int n)
+{
+       int i;
+       if (pxm2node[n] == n)
+               return n;
+       for (i = 0; i < 256; i++)
+               if (pxm2node[i] == n)
+                       return i;
+       return 0;
+}
+
+int __node_distance(int a, int b)
+{
+	int index;
+
+	if (!acpi_slit)
+		return a == b ? 10 : 20;
+	index = acpi_slit->localities * node_to_pxm(a);
+	return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/mm