From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- arch/x86_64/mm/Makefile | 11 + arch/x86_64/mm/extable.c | 35 +++ arch/x86_64/mm/fault.c | 579 ++++++++++++++++++++++++++++++++++++++++ arch/x86_64/mm/init.c | 630 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86_64/mm/ioremap.c | 283 ++++++++++++++++++++ arch/x86_64/mm/k8topology.c | 168 ++++++++++++ arch/x86_64/mm/numa.c | 294 +++++++++++++++++++++ arch/x86_64/mm/pageattr.c | 235 +++++++++++++++++ arch/x86_64/mm/srat.c | 217 +++++++++++++++ 9 files changed, 2452 insertions(+) create mode 100644 arch/x86_64/mm/Makefile create mode 100644 arch/x86_64/mm/extable.c create mode 100644 arch/x86_64/mm/fault.c create mode 100644 arch/x86_64/mm/init.c create mode 100644 arch/x86_64/mm/ioremap.c create mode 100644 arch/x86_64/mm/k8topology.c create mode 100644 arch/x86_64/mm/numa.c create mode 100644 arch/x86_64/mm/pageattr.c create mode 100644 arch/x86_64/mm/srat.c (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile new file mode 100644 index 00000000000..66c354ad80c --- /dev/null +++ b/arch/x86_64/mm/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the linux x86_64-specific parts of the memory manager. +# + +obj-y := init.o fault.o ioremap.o extable.o pageattr.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_K8_NUMA) += k8topology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o + +hugetlbpage-y = ../../i386/mm/hugetlbpage.o diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c new file mode 100644 index 00000000000..2d78f9fb403 --- /dev/null +++ b/arch/x86_64/mm/extable.c @@ -0,0 +1,35 @@ +/* + * linux/arch/x86_64/mm/extable.c + */ + +#include +#include +#include +#include +#include + +/* Simple binary search */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + /* Work around a B stepping K8 bug */ + if ((value >> 32) == 0) + value |= 0xffffffffUL << 32; + + while (first <= last) { + const struct exception_table_entry *mid; + long diff; + + mid = (last - first) / 2 + first; + diff = mid->insn - value; + if (diff == 0) + return mid; + else if (diff < 0) + first = mid+1; + else + last = mid-1; + } + return NULL; +} diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c new file mode 100644 index 00000000000..5724370475c --- /dev/null +++ b/arch/x86_64/mm/fault.c @@ -0,0 +1,579 @@ +/* + * linux/arch/x86-64/mm/fault.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For unblank_screen() */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void bust_spinlocks(int yes) +{ + int loglevel_save = console_loglevel; + if (yes) { + oops_in_progress = 1; + } else { +#ifdef CONFIG_VT + unblank_screen(); +#endif + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() + * without oops_in_progress set so that printk will give klogd + * a poke. Hold onto your hats... + */ + console_loglevel = 15; /* NMI oopser may have shut the console up */ + printk(" "); + console_loglevel = loglevel_save; + } +} + +/* Sometimes the CPU reports invalid exceptions on prefetch. + Check that here and ignore. + Opcode checker based on code by Richard Brunner */ +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr = (unsigned char *)(regs->rip); + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr = instr + 15; + + /* If it was a exec fault ignore */ + if (error_code & (1<<4)) + return 0; + + /* Code segments in LDT could have a non zero base. Don't check + when that's possible */ + if (regs->cs & (1<<2)) + return 0; + + if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (__get_user(opcode, instr)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* Values 0x26,0x2E,0x36,0x3E are valid x86 + prefixes. In long mode, the CPU will signal + invalid opcode if some of these prefixes are + present so we will never get here anyway */ + scan_more = ((instr_lo & 7) == 0x6); + break; + + case 0x40: + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes + Need to figure out under what instruction mode the + instruction was issued ... */ + /* Could check the LDT for lm, but for now it's good + enough to assume that long mode only uses well known + segments or kernel. */ + scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); + break; + + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + if (__get_user(opcode, instr)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static int bad_address(void *p) +{ + unsigned long dummy; + return __get_user(dummy, (unsigned long *)p); +} + +void dump_pagetable(unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + asm("movq %%cr3,%0" : "=r" (pgd)); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + printk("PGD %lx ", pgd_val(*pgd)); + if (bad_address(pgd)) goto bad; + if (!pgd_present(*pgd)) goto ret; + + pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud)) goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. */ + +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ + static int warned; + if (address != regs->rip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->rip = address; + return 1; + } + return 0; +} + +int unhandled_signal(struct task_struct *tsk, int sig) +{ + if (tsk->pid == 1) + return 1; + /* Warn for strace, but not for gdb */ + if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) && + (tsk->ptrace & PT_PTRACED)) + return 0; + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); +} + +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + oops_begin(); + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + __die("Bad pagetable", regs, error_code); + oops_end(); + do_exit(SIGKILL); +} + +/* + * Handle a fault on the vmalloc or module mapping area + */ +static int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + pgd = pgd_offset(current->mm ?: &init_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) + BUG(); + __flush_tlb_all(); + return 0; +} + +int page_fault_trace = 0; +int exception_trace = 1; + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + * + * error_code: + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means fault was an instruction fetch + */ +asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long address; + const struct exception_table_entry *fixup; + int write; + siginfo_t info; + +#ifdef CONFIG_CHECKING + { + unsigned long gs; + struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); + rdmsrl(MSR_GS_BASE, gs); + if (gs != (unsigned long)pda) { + wrmsrl(MSR_GS_BASE, pda); + printk("page_fault: wrong gs %lx expected %p\n", gs, pda); + } + } +#endif + + /* get the address */ + __asm__("movq %%cr2,%0":"=r" (address)); + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + + if (likely(regs->eflags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(page_fault_trace)) + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); + + tsk = current; + mm = tsk->mm; + info.si_code = SEGV_MAPERR; + + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 1) == 0. + */ + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & 5)) { + if (vmalloc_fault(address) < 0) + goto bad_area_nosemaphore; + return; + } + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + if (unlikely(error_code & (1 << 3))) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt or have no user + * context, we must not take the fault.. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + again: + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunatly, in the case of an + * erroneous fault occuring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibilty of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & 4) == 0 && + !search_exception_tables(regs->rip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (likely(vma->vm_start <= address)) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & 4) { + // XXX: align red zone size with ABI + if (address + 128 < regs->rsp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + info.si_code = SEGV_ACCERR; + write = 0; + switch (error_code & 3) { + default: /* 3: write, present */ + /* fall through */ + case 2: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case 1: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + switch (handle_mm_fault(mm, vma, address, write)) { + case 1: + tsk->min_flt++; + break; + case 2: + tsk->maj_flt++; + break; + case 0: + goto do_sigbus; + default: + goto out_of_memory; + } + + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + +#ifdef CONFIG_IA32_EMULATION + /* 32bit vsyscall. map on demand. */ + if (test_thread_flag(TIF_IA32) && + address >= VSYSCALL32_BASE && address < VSYSCALL32_END) { + if (map_syscall32(mm, address) < 0) + goto out_of_memory2; + return; + } +#endif + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & 4) { + if (is_prefetch(regs, address, error_code)) + return; + + /* Work around K8 erratum #100 K8 in compat mode + occasionally jumps to illegal addresses >4GB. We + catch this here in the page fault handler because + these addresses are not reachable. Just detect this + case and return. Any code segment in LDT is + compatibility mode. */ + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return; + + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { + printk( + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->rip, + regs->rsp, error_code); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code has been set above */ + info.si_addr = (void __user *)address; + force_sig_info(SIGSEGV, &info, tsk); + return; + } + +no_context: + + /* Are we prepared to handle this kernel fault? */ + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return; + } + + /* + * Hall of shame of CPU/BIOS bugs. + */ + + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + + oops_begin(); + + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at %016lx RIP: \n" KERN_ALERT,address); + printk_address(regs->rip); + printk("\n"); + dump_pagetable(address); + __die("Oops", regs, error_code); + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(); + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); +out_of_memory2: + if (current->pid == 1) { + yield(); + goto again; + } + printk("VM: killing process %s\n", tsk->comm); + if (error_code & 4) + do_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & 4)) + goto no_context; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, tsk); + return; +} diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c new file mode 100644 index 00000000000..b0d604551d8 --- /dev/null +++ b/arch/x86_64/mm/init.c @@ -0,0 +1,630 @@ +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2002,2003 Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +#ifdef CONFIG_GART_IOMMU +extern int swiotlb; +#endif + +extern char _stext[]; + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +void show_mem(void) +{ + int i, total = 0, reserved = 0; + int shared = 0, cached = 0; + pg_data_t *pgdat; + struct page *page; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + page = pfn_to_page(pgdat->node_start_pfn + i); + total++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk("%d pages of RAM\n", total); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); +} + +/* References to section boundaries */ + +extern char _text, _etext, _edata, __bss_start, _end[]; +extern char __init_begin, __init_end; + +int after_bootmem; + +static void *spp_getpage(void) +{ + void *ptr; + if (after_bootmem) + ptr = (void *) get_zeroed_page(GFP_ATOMIC); + else + ptr = alloc_bootmem_pages(PAGE_SIZE); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); + + Dprintk("spp_getpage %p\n", ptr); + return ptr; +} + +static void set_pte_phys(unsigned long vaddr, + unsigned long phys, pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); + if (pmd != pmd_offset(pud, 0)) { + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + return; + } + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + pte = (pte_t *) spp_getpage(); + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); + if (pte != pte_offset_kernel(pmd, 0)) { + printk("PAGETABLE BUG #02!\n"); + return; + } + } + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); + + pte = pte_offset_kernel(pmd, vaddr); + if (!pte_none(*pte) && + pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) + pte_ERROR(*pte); + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* NOTE: this is meant to be run only at boot */ +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + printk("Invalid __set_fixmap\n"); + return; + } + set_pte_phys(address, phys, prot); +} + +unsigned long __initdata table_start, table_end; + +extern pmd_t temp_boot_pmds[]; + +static struct temp_map { + pmd_t *pmd; + void *address; + int allocated; +} temp_mappings[] __initdata = { + { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, + { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, + {} +}; + +static __init void *alloc_low_page(int *index, unsigned long *phys) +{ + struct temp_map *ti; + int i; + unsigned long pfn = table_end++, paddr; + void *adr; + + if (pfn >= end_pfn) + panic("alloc_low_page: ran out of memory"); + for (i = 0; temp_mappings[i].allocated; i++) { + if (!temp_mappings[i].pmd) + panic("alloc_low_page: ran out of temp mappings"); + } + ti = &temp_mappings[i]; + paddr = (pfn << PAGE_SHIFT) & PMD_MASK; + set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); + ti->allocated = 1; + __flush_tlb(); + adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + *index = i; + *phys = pfn * PAGE_SIZE; + return adr; +} + +static __init void unmap_low_page(int i) +{ + struct temp_map *ti = &temp_mappings[i]; + set_pmd(ti->pmd, __pmd(0)); + ti->allocated = 0; +} + +static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + int map; + unsigned long paddr, pmd_phys; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) { + for (; i < PTRS_PER_PUD; i++, pud++) + set_pud(pud, __pud(0)); + break; + } + + if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + set_pud(pud, __pud(0)); + continue; + } + + pmd = alloc_low_page(&map, &pmd_phys); + set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) { + for (; j < PTRS_PER_PMD; j++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + unmap_low_page(map); + } + __flush_tlb(); +} + +static void __init find_early_table_space(unsigned long end) +{ + unsigned long puds, pmds, tables; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + + round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + + table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables); + if (table_start == -1UL) + panic("Cannot find space for the kernel page tables"); + + table_start >>= PAGE_SHIFT; + table_end = table_start; +} + +/* Setup the direct mapping of the physical memory at PAGE_OFFSET. + This runs before bootmem is initialized and gets pages directly from the + physical memory. To access them they are temporarily mapped. */ +void __init init_memory_mapping(unsigned long start, unsigned long end) +{ + unsigned long next; + + Dprintk("init_memory_mapping\n"); + + /* + * Find space for the kernel direct mapping tables. + * Later we should allocate these tables in the local node of the memory + * mapped. Unfortunately this is done currently before the nodes are + * discovered. + */ + find_early_table_space(end); + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + for (; start < end; start = next) { + int map; + unsigned long pud_phys; + pud_t *pud = alloc_low_page(&map, &pud_phys); + next = start + PGDIR_SIZE; + if (next > end) + next = end; + phys_pud_init(pud, __pa(start), __pa(next)); + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + unmap_low_page(map); + } + + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + __flush_tlb_all(); + early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, + table_start<> PAGE_SHIFT; + + if (end_pfn < max_dma) + zones_size[ZONE_DMA] = end_pfn; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = end_pfn - max_dma; + } + free_area_init(zones_size); + } + return; +} +#endif + +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches + from the CPU leading to inconsistent cache lines. address and size + must be aligned to 2MB boundaries. + Does nothing when the mapping doesn't exist. */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) +{ + unsigned long end = address + size; + + BUG_ON(address & ~LARGE_PAGE_MASK); + BUG_ON(size & ~LARGE_PAGE_MASK); + + for (; address < end; address += LARGE_PAGE_SIZE) { + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, address); + if (!pmd || pmd_none(*pmd)) + continue; + if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { + /* Could handle this, but it should not happen currently. */ + printk(KERN_ERR + "clear_kernel_mapping: mapping has been split. will leak memory\n"); + pmd_ERROR(*pmd); + } + set_pmd(pmd, __pmd(0)); + } + __flush_tlb_all(); +} + +static inline int page_is_ram (unsigned long pagenr) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + unsigned long addr, end; + + if (e820.map[i].type != E820_RAM) /* not usable memory */ + continue; + /* + * !!!FIXME!!! Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area. We need a sanity + * check here. + */ + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +extern int swiotlb_force; + +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, + kcore_vsyscall; + +void __init mem_init(void) +{ + int codesize, reservedpages, datasize, initsize; + int tmp; + +#ifdef CONFIG_SWIOTLB + if (swiotlb_force) + swiotlb = 1; + if (!iommu_aperture && + (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) + swiotlb = 1; + if (swiotlb) + swiotlb_init(); +#endif + + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + num_physpages = end_pfn; + high_memory = (void *) __va(end_pfn * PAGE_SIZE); + + /* clear the zero-page */ + memset(empty_zero_page, 0, PAGE_SIZE); + + reservedpages = 0; + + /* this will put all low memory onto the freelists */ +#ifdef CONFIG_DISCONTIGMEM + totalram_pages += numa_free_all_bootmem(); + tmp = 0; + /* should count reserved pages here for all nodes */ +#else + max_mapnr = end_pfn; + if (!mem_map) BUG(); + + totalram_pages += free_all_bootmem(); + + for (tmp = 0; tmp < end_pfn; tmp++) + /* + * Only count reserved RAM pages + */ + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + reservedpages++; +#endif + + after_bootmem = 1; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + /* Register memory areas for /proc/kcore */ + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + kclist_add(&kcore_kernel, &_stext, _end - _stext); + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + VSYSCALL_END - VSYSCALL_START); + + printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + end_pfn << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10); + + /* + * Subtle. SMP is doing its boot stuff late (because it has to + * fork idle threads) - but it also needs low mappings for the + * protected-mode entry to work. We zap these entries only after + * the WP-bit has been tested. + */ +#ifndef CONFIG_SMP + zap_low_mappings(); +#endif +} + +extern char __initdata_begin[], __initdata_end[]; + +void free_initmem(void) +{ + unsigned long addr; + + addr = (unsigned long)(&__init_begin); + for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + set_page_count(virt_to_page(addr), 1); + memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } + memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); + printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + if (start < (unsigned long)&_end) + return; + printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); + for (; start < end; start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + set_page_count(virt_to_page(start), 1); + free_page(start); + totalram_pages++; + } +} +#endif + +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_DISCONTIGMEM + int nid = phys_to_nid(phys); + reserve_bootmem_node(NODE_DATA(nid), phys, len); +#else + reserve_bootmem(phys, len); +#endif +} + +int kern_addr_valid(unsigned long addr) +{ + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (above != 0 && above != -1UL) + return 0; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return 0; + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return 0; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return 0; + if (pmd_large(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return 0; + return pfn_valid(pte_pfn(*pte)); +} + +#ifdef CONFIG_SYSCTL +#include + +extern int exception_trace, page_fault_trace; + +static ctl_table debug_table2[] = { + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, + proc_dointvec }, +#ifdef CONFIG_CHECKING + { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, + proc_dointvec }, +#endif + { 0, } +}; + +static ctl_table debug_root_table2[] = { + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, + .child = debug_table2 }, + { 0 }, +}; + +static __init int x8664_sysctl_init(void) +{ + register_sysctl_table(debug_root_table2, 1); + return 0; +} +__initcall(x8664_sysctl_init); +#endif + +/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two + different ones: one for 32bit and one for 64bit. Use the appropiate + for the target task. */ + +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_END, + .vm_page_prot = PAGE_READONLY +}; + +static struct vm_area_struct gate32_vma = { + .vm_start = VSYSCALL32_BASE, + .vm_end = VSYSCALL32_END, + .vm_page_prot = PAGE_READONLY +}; + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(tsk, TIF_IA32)) { + /* lookup code assumes the pages are present. set them up + now */ + if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0) + return NULL; + return &gate32_vma; + } +#endif + return &gate_vma; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(task); + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives. + */ +int in_gate_area_no_task(unsigned long addr) +{ + return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) || + ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END))); +} diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c new file mode 100644 index 00000000000..74ec8554b19 --- /dev/null +++ b/arch/x86_64/mm/ioremap.c @@ -0,0 +1,283 @@ +/* + * arch/x86_64/mm/ioremap.c + * + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, unsigned long flags) +{ + unsigned long end; + unsigned long pfn; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + if (address >= end) + BUG(); + pfn = phys_addr >> PAGE_SHIFT; + do { + if (!pte_none(*pte)) { + printk("remap_area_pte: page already exists\n"); + BUG(); + } + set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | + _PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); + address += PAGE_SIZE; + pfn++; + pte++; + } while (address && (address < end)); +} + +static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, unsigned long flags) +{ + unsigned long end; + + address &= ~PUD_MASK; + end = address + size; + if (end > PUD_SIZE) + end = PUD_SIZE; + phys_addr -= address; + if (address >= end) + BUG(); + do { + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + if (!pte) + return -ENOMEM; + remap_area_pte(pte, address, end - address, address + phys_addr, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size, + unsigned long phys_addr, unsigned long flags) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + if (address >= end) + BUG(); + do { + pmd_t * pmd = pmd_alloc(&init_mm, pud, address); + if (!pmd) + return -ENOMEM; + remap_area_pmd(pmd, address, end - address, address + phys_addr, flags); + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); + return 0; +} + +static int remap_area_pages(unsigned long address, unsigned long phys_addr, + unsigned long size, unsigned long flags) +{ + int error; + pgd_t *pgd; + unsigned long end = address + size; + + phys_addr -= address; + pgd = pgd_offset_k(address); + flush_cache_all(); + if (address >= end) + BUG(); + spin_lock(&init_mm.page_table_lock); + do { + pud_t *pud; + pud = pud_alloc(&init_mm, pgd, address); + error = -ENOMEM; + if (!pud) + break; + if (remap_area_pud(pud, address, end - address, + phys_addr + address, flags)) + break; + error = 0; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + pgd++; + } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); + flush_tlb_all(); + return error; +} + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int +ioremap_change_attr(unsigned long phys_addr, unsigned long size, + unsigned long flags) +{ + int err = 0; + if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { + unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long vaddr = (unsigned long) __va(phys_addr); + + /* + * Must use a address here and not struct page because the phys addr + * can be a in hole between nodes and not have an memmap entry. + */ + err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); + if (!err) + global_flush_tlb(); + } + return err; +} + +/* + * Generic mapping function + */ + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) +{ + void * addr; + struct vm_struct * area; + unsigned long offset, last_addr; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (__force void __iomem *)phys_to_virt(phys_addr); + +#ifndef CONFIG_DISCONTIGMEM + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (last_addr < virt_to_phys(high_memory)) { + char *t_addr, *t_end; + struct page *page; + + t_addr = __va(phys_addr); + t_end = t_addr + (size - 1); + + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) + if(!PageReserved(page)) + return NULL; + } +#endif + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP | (flags << 20)); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = area->addr; + if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) { + remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + return NULL; + } + if (ioremap_change_attr(phys_addr, size, flags) < 0) { + area->flags &= 0xffffff; + vunmap(addr); + return NULL; + } + return (__force void __iomem *) (offset + (char *)addr); +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ + +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, _PAGE_PCD); +} + +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, **pprev; + + if (addr <= high_memory) + return; + if (addr >= phys_to_virt(ISA_START_ADDRESS) && + addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + write_lock(&vmlist_lock); + for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev) + if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr)) + break; + if (!p) { + printk("__iounmap: bad address %p\n", addr); + goto out_unlock; + } + *pprev = p->next; + unmap_vm_area(p); + if ((p->flags >> 20) && + p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) { + /* p->size includes the guard page, but cpa doesn't like that */ + change_page_attr(virt_to_page(__va(p->phys_addr)), + p->size >> PAGE_SHIFT, + PAGE_KERNEL); + global_flush_tlb(); + } +out_unlock: + write_unlock(&vmlist_lock); + kfree(p); +} diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c new file mode 100644 index 00000000000..ec35747aacd --- /dev/null +++ b/arch/x86_64/mm/k8topology.c @@ -0,0 +1,168 @@ +/* + * AMD K8 NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the K8 northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) + continue; + return num; + } + + return -1; +} + +int __init k8_scan_nodes(unsigned long start, unsigned long end) +{ + unsigned long prevbase; + struct node nodes[8]; + int nodeid, i, nb; + int found = 0; + u32 reg; + unsigned numnodes; + nodemask_t nodes_parsed; + + nodes_clear(nodes_parsed); + + nb = find_northbridge(); + if (nb < 0) + return nb; + + printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + + printk(KERN_INFO "Number of nodes %d\n", numnodes); + + memset(&nodes,0,sizeof(nodes)); + prevbase = 0; + for (i = 0; i < 8; i++) { + unsigned long base,limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + printk("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, + base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", + nodeid, (base>>8)&3, (limit>>8) & 3); + return -1; + } + if (node_isset(nodeid, nodes_parsed)) { + printk(KERN_INFO "Node %d already present. Skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + + if (limit > end_pfn << PAGE_SHIFT) + limit = end_pfn << PAGE_SHIFT; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + printk(KERN_ERR "Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + printk(KERN_ERR "Node map not sorted %lx,%lx\n", + prevbase,base); + return -1; + } + + printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + + found++; + + nodes[nodeid].start = base; + nodes[nodeid].end = limit; + + prevbase = base; + + node_set(nodeid, nodes_parsed); + } + + if (!found) + return -1; + + memnode_shift = compute_hash_shift(nodes, numnodes); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + for (i = 0; i < 8; i++) { + if (nodes[i].start != nodes[i].end) { + /* assume 1:1 NODE:CPU */ + cpu_to_node[i] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + } + + numa_init_array(); + return 0; +} diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c new file mode 100644 index 00000000000..fd9f25d7a6c --- /dev/null +++ b/arch/x86_64/mm/numa.c @@ -0,0 +1,294 @@ +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +struct pglist_data *node_data[MAX_NUMNODES]; +bootmem_data_t plat_node_bdata[MAX_NUMNODES]; + +int memnode_shift; +u8 memnodemap[NODEMAPSIZE]; + +unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; +cpumask_t node_to_cpumask[MAX_NUMNODES]; + +int numa_off __initdata; + +int __init compute_hash_shift(struct node *nodes, int numnodes) +{ + int i; + int shift = 24; + u64 addr; + + /* When in doubt use brute force. */ + while (shift < 48) { + memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); + for (i = 0; i < numnodes; i++) { + if (nodes[i].start == nodes[i].end) + continue; + for (addr = nodes[i].start; + addr < nodes[i].end; + addr += (1UL << shift)) { + if (memnodemap[addr >> shift] != 0xff && + memnodemap[addr >> shift] != i) { + printk(KERN_INFO + "node %d shift %d addr %Lx conflict %d\n", + i, shift, addr, memnodemap[addr>>shift]); + goto next; + } + memnodemap[addr >> shift] = i; + } + } + return shift; + next: + shift++; + } + memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); + return -1; +} + +/* Initialize bootmem allocator for a node */ +void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) +{ + unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; + unsigned long nodedata_phys; + const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); + + start = round_up(start, ZONE_ALIGN); + + printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + nodedata_phys = find_e820_area(start, end, pgdat_size); + if (nodedata_phys == -1L) + panic("Cannot find memory pgdat in node %d\n", nodeid); + + Dprintk("nodedata_phys %lx\n", nodedata_phys); + + node_data[nodeid] = phys_to_virt(nodedata_phys); + memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); + NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; + NODE_DATA(nodeid)->node_start_pfn = start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + + /* Find a place for the bootmem map */ + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<> PAGE_SHIFT, + start_pfn, end_pfn); + + e820_bootmem_free(NODE_DATA(nodeid), start, end); + + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages< 0 have a zero length zone DMA */ + dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; + if (start_pfn < dma_end_pfn) { + zones[ZONE_DMA] = dma_end_pfn - start_pfn; + zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; + } else { + zones[ZONE_NORMAL] = end_pfn - start_pfn; + } + + free_area_init_node(nodeid, NODE_DATA(nodeid), zones, + start_pfn, NULL); +} + +void __init numa_init_array(void) +{ + int rr, i; + /* There are unfortunately some poorly designed mainboards around + that only connect memory to a single CPU. This breaks the 1:1 cpu->node + mapping. To avoid this fill in the mapping for all possible + CPUs, as the number of CPUs is not known yet. + We round robin the existing nodes. */ + rr = 0; + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] != NUMA_NO_NODE) + continue; + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + cpu_to_node[i] = rr; + rr++; + } + + set_bit(0, &node_to_cpumask[cpu_to_node(0)]); +} + +#ifdef CONFIG_NUMA_EMU +int numa_fake __initdata = 0; + +/* Numa emulation */ +static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + struct node nodes[MAX_NUMNODES]; + unsigned long sz = ((end_pfn - start_pfn)< 1) { + unsigned long x = 1; + while ((x << 1) < sz) + x <<= 1; + if (x < sz/2) + printk("Numa emulation unbalanced. Complain to maintainer\n"); + sz = x; + } + + memset(&nodes,0,sizeof(nodes)); + for (i = 0; i < numa_fake; i++) { + nodes[i].start = (start_pfn<> 20); + node_set_online(i); + } + memnode_shift = compute_hash_shift(nodes, numa_fake); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); + return -1; + } + for_each_online_node(i) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + numa_init_array(); + return 0; +} +#endif + +void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + +#ifdef CONFIG_NUMA_EMU + if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + return; +#endif + +#ifdef CONFIG_ACPI_NUMA + if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + end_pfn << PAGE_SHIFT)) + return; +#endif + +#ifdef CONFIG_K8_NUMA + if (!numa_off && !k8_scan_nodes(start_pfn< +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline pte_t *lookup_address(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return NULL; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, address); + if (pte && !pte_present(*pte)) + pte = NULL; + return pte; +} + +static struct page *split_large_page(unsigned long address, pgprot_t prot, + pgprot_t ref_prot) +{ + int i; + unsigned long addr; + struct page *base = alloc_pages(GFP_KERNEL, 0); + pte_t *pbase; + if (!base) + return NULL; + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : ref_prot); + } + return base; +} + + +static void flush_kernel_map(void *address) +{ + if (0 && address && cpu_has_clflush) { + /* is this worth it? */ + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (address + i)); + } else + asm volatile("wbinvd":::"memory"); + if (address) + __flush_tlb_one(address); + else + __flush_tlb_all(); +} + + +static inline void flush_map(unsigned long address) +{ + on_each_cpu(flush_kernel_map, (void *)address, 1, 1); +} + +struct deferred_page { + struct deferred_page *next; + struct page *fpage; + unsigned long address; +}; +static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ + +static inline void save_page(unsigned long address, struct page *fpage) +{ + struct deferred_page *df; + df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); + if (!df) { + flush_map(address); + __free_page(fpage); + } else { + df->next = df_list; + df->fpage = fpage; + df->address = address; + df_list = df; + } +} + +/* + * No more special protections in this 2/4MB area - revert to a + * large page again. + */ +static void revert_page(unsigned long address, pgprot_t ref_prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t large_pte; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd,address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + BUG_ON(pmd_val(*pmd) & _PAGE_PSE); + pgprot_val(ref_prot) |= _PAGE_PSE; + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + set_pte((pte_t *)pmd, large_pte); +} + +static int +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, + pgprot_t ref_prot) +{ + pte_t *kpte; + struct page *kpte_page; + unsigned kpte_flags; + kpte = lookup_address(address); + if (!kpte) return 0; + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + kpte_flags = pte_val(*kpte); + if (pgprot_val(prot) != pgprot_val(ref_prot)) { + if ((kpte_flags & _PAGE_PSE) == 0) { + set_pte(kpte, pfn_pte(pfn, prot)); + } else { + /* + * split_large_page will take the reference for this change_page_attr + * on the split page. + */ + struct page *split = split_large_page(address, prot, ref_prot); + if (!split) + return -ENOMEM; + set_pte(kpte,mk_pte(split, ref_prot)); + kpte_page = split; + } + get_page(kpte_page); + } else if ((kpte_flags & _PAGE_PSE) == 0) { + set_pte(kpte, pfn_pte(pfn, ref_prot)); + __put_page(kpte_page); + } else + BUG(); + + /* on x86-64 the direct mapping set at boot is not using 4k pages */ + BUG_ON(PageReserved(kpte_page)); + + switch (page_count(kpte_page)) { + case 1: + save_page(address, kpte_page); + revert_page(address, ref_prot); + break; + case 0: + BUG(); /* memleak and failed 2M page regeneration */ + } + return 0; +} + +/* + * Change the page attributes of an page in the linear mapping. + * + * This should be used when a page is mapped with a different caching policy + * than write-back somewhere - some CPUs do not like it when mappings with + * different caching policies exist. This changes the page attributes of the + * in kernel linear mapping too. + * + * The caller needs to ensure that there are no conflicting mappings elsewhere. + * This function only deals with the kernel linear map. + * + * Caller must call global_flush_tlb() after this. + */ +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) +{ + int err = 0; + int i; + + down_write(&init_mm.mmap_sem); + for (i = 0; i < numpages; i++, address += PAGE_SIZE) { + unsigned long pfn = __pa(address) >> PAGE_SHIFT; + + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + /* Handle kernel mapping too which aliases part of the + * lowmem */ + if (__pa(address) < KERNEL_TEXT_SIZE) { + unsigned long addr2; + pgprot_t prot2 = prot; + addr2 = __START_KERNEL_map + __pa(address); + pgprot_val(prot2) &= ~_PAGE_NX; + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + } + } + up_write(&init_mm.mmap_sem); + return err; +} + +/* Don't call this for MMIO areas that may not have a mem_map entry */ +int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + unsigned long addr = (unsigned long)page_address(page); + return change_page_attr_addr(addr, numpages, prot); +} + +void global_flush_tlb(void) +{ + struct deferred_page *df, *next_df; + + down_read(&init_mm.mmap_sem); + df = xchg(&df_list, NULL); + up_read(&init_mm.mmap_sem); + if (!df) + return; + flush_map((df && !df->next) ? df->address : 0); + for (; df; df = next_df) { + next_df = df->next; + if (df->fpage) + __free_page(df->fpage); + kfree(df); + } +} + +EXPORT_SYMBOL(change_page_attr); +EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c new file mode 100644 index 00000000000..5d01b31472e --- /dev/null +++ b/arch/x86_64/mm/srat.c @@ -0,0 +1,217 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct acpi_table_slit *acpi_slit; + +static nodemask_t nodes_parsed __initdata; +static nodemask_t nodes_found __initdata; +static struct node nodes[MAX_NUMNODES] __initdata; +static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; + +static __init int setup_node(int pxm) +{ + unsigned node = pxm2node[pxm]; + if (node == 0xff) { + if (nodes_weight(nodes_found) >= MAX_NUMNODES) + return -1; + node = first_unset_node(nodes_found); + node_set(node, nodes_found); + pxm2node[pxm] = node; + } + return pxm2node[pxm]; +} + +static __init int conflicting_nodes(unsigned long start, unsigned long end) +{ + int i; + for_each_online_node(i) { + struct node *nd = &nodes[i]; + if (nd->start == nd->end) + continue; + if (nd->end > start && nd->start < end) + return 1; + if (nd->end == end && nd->start == start) + return 1; + } + return -1; +} + +static __init void cutoff_node(int i, unsigned long start, unsigned long end) +{ + struct node *nd = &nodes[i]; + if (nd->start < start) { + nd->start = start; + if (nd->end < nd->start) + nd->start = nd->end; + } + if (nd->end > end) { + if (!(end & 0xfff)) + end--; + nd->end = end; + if (nd->start > nd->end) + nd->start = nd->end; + } +} + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + acpi_slit = slit; +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) +{ + int pxm, node; + if (srat_disabled() || pa->flags.enabled == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + if (pa->apic_id >= NR_CPUS) { + printk(KERN_ERR "SRAT: lapic %u too large.\n", + pa->apic_id); + bad_srat(); + return; + } + cpu_to_node[pa->apic_id] = node; + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, pa->apic_id, node); +} + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) +{ + struct node *nd; + unsigned long start, end; + int node, pxm; + int i; + + if (srat_disabled() || ma->flags.enabled == 0) + return; + /* hotplug bit is ignored for now */ + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); + end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); + i = conflicting_nodes(start, end); + if (i >= 0) { + printk(KERN_ERR + "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n", + pxm, start, end, i, nodes[i].start, nodes[i].end); + bad_srat(); + return; + } + nd = &nodes[node]; + if (!node_test_and_set(node, nodes_parsed)) { + nd->start = start; + nd->end = end; + } else { + if (start < nd->start) + nd->start = start; + if (nd->end < end) + nd->end = end; + } + if (!(nd->end & 0xfff)) + nd->end--; + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, + nd->start, nd->end); +} + +void __init acpi_numa_arch_fixup(void) {} + +/* Use the information discovered above to actually set up the nodes. */ +int __init acpi_scan_nodes(unsigned long start, unsigned long end) +{ + int i; + if (acpi_numa <= 0) + return -1; + memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); + if (memnode_shift < 0) { + printk(KERN_ERR + "SRAT: No NUMA node hash function found. Contact maintainer\n"); + bad_srat(); + return -1; + } + for (i = 0; i < MAX_NUMNODES; i++) { + if (!node_isset(i, nodes_parsed)) + continue; + cutoff_node(i, start, end); + if (nodes[i].start == nodes[i].end) { + node_clear(i, nodes_parsed); + continue; + } + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] == NUMA_NO_NODE) + continue; + if (!node_isset(cpu_to_node[i], nodes_parsed)) + cpu_to_node[i] = NUMA_NO_NODE; + } + numa_init_array(); + return 0; +} + +int node_to_pxm(int n) +{ + int i; + if (pxm2node[n] == n) + return n; + for (i = 0; i < 256; i++) + if (pxm2node[i] == n) + return i; + return 0; +} + +int __node_distance(int a, int b) +{ + int index; + + if (!acpi_slit) + return a == b ? 10 : 20; + index = acpi_slit->localities * node_to_pxm(a); + return acpi_slit->entry[index + node_to_pxm(b)]; +} + +EXPORT_SYMBOL(__node_distance); -- cgit v1.2.3