diff options
Diffstat (limited to 'arch/sparc64/mm')
-rw-r--r-- | arch/sparc64/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/sparc64/mm/fault.c | 24 | ||||
-rw-r--r-- | arch/sparc64/mm/generic.c | 41 | ||||
-rw-r--r-- | arch/sparc64/mm/hugetlbpage.c | 219 | ||||
-rw-r--r-- | arch/sparc64/mm/init.c | 1452 | ||||
-rw-r--r-- | arch/sparc64/mm/tlb.c | 64 | ||||
-rw-r--r-- | arch/sparc64/mm/tsb.c | 500 | ||||
-rw-r--r-- | arch/sparc64/mm/ultra.S | 374 |
8 files changed, 1838 insertions, 838 deletions
diff --git a/arch/sparc64/mm/Makefile b/arch/sparc64/mm/Makefile index 9d0960e69f4..e415bf942bc 100644 --- a/arch/sparc64/mm/Makefile +++ b/arch/sparc64/mm/Makefile @@ -5,6 +5,6 @@ EXTRA_AFLAGS := -ansi EXTRA_CFLAGS := -Werror -obj-y := ultra.o tlb.o fault.o init.o generic.o +obj-y := ultra.o tlb.o tsb.o fault.o init.o generic.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 6f0539aa44d..d21ff3230c0 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -29,6 +29,7 @@ #include <asm/lsu.h> #include <asm/sections.h> #include <asm/kdebug.h> +#include <asm/mmu_context.h> /* * To debug kernel to catch accesses to certain virtual/physical addresses. @@ -91,12 +92,13 @@ static void __kprobes unhandled_fault(unsigned long address, die_if_kernel("Oops", regs); } -static void bad_kernel_pc(struct pt_regs *regs) +static void bad_kernel_pc(struct pt_regs *regs, unsigned long vaddr) { unsigned long *ksp; printk(KERN_CRIT "OOPS: Bogus kernel PC [%016lx] in fault handler\n", regs->tpc); + printk(KERN_CRIT "OOPS: Fault was to vaddr[%lx]\n", vaddr); __asm__("mov %%sp, %0" : "=r" (ksp)); show_stack(current, ksp); unhandled_fault(regs->tpc, current, regs); @@ -137,7 +139,7 @@ static unsigned int get_user_insn(unsigned long tpc) if (!pte_present(pte)) goto out; - pa = (pte_val(pte) & _PAGE_PADDR); + pa = (pte_pfn(pte) << PAGE_SHIFT); pa += (tpc & ~PAGE_MASK); /* Use phys bypass so we don't pollute dtlb/dcache. */ @@ -257,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) struct vm_area_struct *vma; unsigned int insn = 0; int si_code, fault_code; - unsigned long address; + unsigned long address, mm_rss; fault_code = get_thread_fault_code(); @@ -280,7 +282,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) (tpc >= MODULES_VADDR && tpc < MODULES_END)) { /* Valid, no problems... */ } else { - bad_kernel_pc(regs); + bad_kernel_pc(regs, address); return; } } @@ -406,6 +408,20 @@ good_area: } up_read(&mm->mmap_sem); + + mm_rss = get_mm_rss(mm); +#ifdef CONFIG_HUGETLB_PAGE + mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); +#endif + if (unlikely(mm_rss >= + mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) + tsb_grow(mm, MM_TSB_BASE, mm_rss); +#ifdef CONFIG_HUGETLB_PAGE + mm_rss = mm->context.huge_pte_count; + if (unlikely(mm_rss >= + mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) + tsb_grow(mm, MM_TSB_HUGE, mm_rss); +#endif return; /* diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index 580b63da836..8cb06205d26 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c @@ -15,15 +15,6 @@ #include <asm/page.h> #include <asm/tlbflush.h> -static inline pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space) -{ - pte_t pte; - pte_val(pte) = (((page) | pgprot_val(prot) | _PAGE_E) & - ~(unsigned long)_PAGE_CACHE); - pte_val(pte) |= (((unsigned long)space) << 32); - return pte; -} - /* Remap IO memory, the same way as remap_pfn_range(), but use * the obio memory space. * @@ -48,24 +39,29 @@ static inline void io_remap_pte_range(struct mm_struct *mm, pte_t * pte, pte_t entry; unsigned long curend = address + PAGE_SIZE; - entry = mk_pte_io(offset, prot, space); + entry = mk_pte_io(offset, prot, space, PAGE_SIZE); if (!(address & 0xffff)) { - if (!(address & 0x3fffff) && !(offset & 0x3ffffe) && end >= address + 0x400000) { - entry = mk_pte_io(offset, - __pgprot(pgprot_val (prot) | _PAGE_SZ4MB), - space); + if (PAGE_SIZE < (4 * 1024 * 1024) && + !(address & 0x3fffff) && + !(offset & 0x3ffffe) && + end >= address + 0x400000) { + entry = mk_pte_io(offset, prot, space, + 4 * 1024 * 1024); curend = address + 0x400000; offset += 0x400000; - } else if (!(address & 0x7ffff) && !(offset & 0x7fffe) && end >= address + 0x80000) { - entry = mk_pte_io(offset, - __pgprot(pgprot_val (prot) | _PAGE_SZ512K), - space); + } else if (PAGE_SIZE < (512 * 1024) && + !(address & 0x7ffff) && + !(offset & 0x7fffe) && + end >= address + 0x80000) { + entry = mk_pte_io(offset, prot, space, + 512 * 1024 * 1024); curend = address + 0x80000; offset += 0x80000; - } else if (!(offset & 0xfffe) && end >= address + 0x10000) { - entry = mk_pte_io(offset, - __pgprot(pgprot_val (prot) | _PAGE_SZ64K), - space); + } else if (PAGE_SIZE < (64 * 1024) && + !(offset & 0xfffe) && + end >= address + 0x10000) { + entry = mk_pte_io(offset, prot, space, + 64 * 1024); curend = address + 0x10000; offset += 0x10000; } else @@ -144,7 +140,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; vma->vm_pgoff = phys_base >> PAGE_SHIFT; - prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index 625cbb336a2..074620d413d 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -1,7 +1,7 @@ /* * SPARC64 Huge TLB page support. * - * Copyright (C) 2002, 2003 David S. Miller (davem@redhat.com) + * Copyright (C) 2002, 2003, 2006 David S. Miller (davem@davemloft.net) */ #include <linux/config.h> @@ -22,6 +22,175 @@ #include <asm/cacheflush.h> #include <asm/mmu_context.h> +/* Slightly simplified from the non-hugepage variant because by + * definition we don't have to worry about any page coloring stuff + */ +#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL)) +#define VA_EXCLUDE_END (0xfffff80000000000UL + (1UL << 32UL)) + +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct * vma; + unsigned long task_size = TASK_SIZE; + unsigned long start_addr; + + if (test_thread_flag(TIF_32BIT)) + task_size = STACK_TOP32; + if (unlikely(len >= VA_EXCLUDE_START)) + return -ENOMEM; + + if (len > mm->cached_hole_size) { + start_addr = addr = mm->free_area_cache; + } else { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + } + + task_size -= len; + +full_search: + addr = ALIGN(addr, HPAGE_SIZE); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (addr < VA_EXCLUDE_START && + (addr + len) >= VA_EXCLUDE_START) { + addr = VA_EXCLUDE_END; + vma = find_vma(mm, VA_EXCLUDE_END); + } + if (unlikely(task_size < addr)) { + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + goto full_search; + } + return -ENOMEM; + } + if (likely(!vma || addr + len <= vma->vm_start)) { + /* + * Remember the place where we stopped the search: + */ + mm->free_area_cache = addr + len; + return addr; + } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + addr = ALIGN(vma->vm_end, HPAGE_SIZE); + } +} + +static unsigned long +hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + + /* This should only ever run for 32-bit processes. */ + BUG_ON(!test_thread_flag(TIF_32BIT)); + + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache & HPAGE_MASK; + + /* make sure it can fit in the remaining address space */ + if (likely(addr > len)) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) { + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + } + + if (unlikely(mm->mmap_base < len)) + goto bottomup; + + addr = (mm->mmap_base-len) & HPAGE_MASK; + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (likely(!vma || addr+len <= vma->vm_start)) { + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + } + + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = (vma->vm_start-len) & HPAGE_MASK; + } while (likely(len < vma->vm_start)); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + + return addr; +} + +unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long task_size = TASK_SIZE; + + if (test_thread_flag(TIF_32BIT)) + task_size = STACK_TOP32; + + if (len & ~HPAGE_MASK) + return -EINVAL; + if (len > task_size) + return -ENOMEM; + + if (addr) { + addr = ALIGN(addr, HPAGE_SIZE); + vma = find_vma(mm, addr); + if (task_size - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + if (mm->get_unmapped_area == arch_get_unmapped_area) + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); +} + pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -30,13 +199,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) pte_t *pte = NULL; pgd = pgd_offset(mm, addr); - if (pgd) { - pud = pud_offset(pgd, addr); - if (pud) { - pmd = pmd_alloc(mm, pud, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); - } + pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd = pmd_alloc(mm, pud, addr); + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); } return pte; } @@ -48,25 +215,28 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pmd_t *pmd; pte_t *pte = NULL; + addr &= HPAGE_MASK; + pgd = pgd_offset(mm, addr); - if (pgd) { + if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); - if (pud) { + if (!pud_none(*pud)) { pmd = pmd_offset(pud, addr); - if (pmd) + if (!pmd_none(*pmd)) pte = pte_offset_map(pmd, addr); } } return pte; } -#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { int i; + if (!pte_present(*ptep) && pte_present(entry)) + mm->context.huge_pte_count++; + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { set_pte_at(mm, addr, ptep, entry); ptep++; @@ -82,6 +252,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, int i; entry = *ptep; + if (pte_present(entry)) + mm->context.huge_pte_count--; for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { pte_clear(mm, addr, ptep); @@ -92,18 +264,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -131,6 +291,15 @@ static void context_reload(void *__data) void hugetlb_prefault_arch_hook(struct mm_struct *mm) { + struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; + + if (likely(tp->tsb != NULL)) + return; + + tsb_grow(mm, MM_TSB_HUGE, 0); + tsb_context_switch(mm); + smp_tsb_sync(mm); + /* On UltraSPARC-III+ and later, configure the second half of * the Data-TLB for huge pages. */ diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 1e44ee26cee..1539a8362b6 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -6,6 +6,7 @@ */ #include <linux/config.h> +#include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> @@ -39,9 +40,27 @@ #include <asm/tlb.h> #include <asm/spitfire.h> #include <asm/sections.h> +#include <asm/tsb.h> +#include <asm/hypervisor.h> extern void device_scan(void); +#define MAX_PHYS_ADDRESS (1UL << 42UL) +#define KPTE_BITMAP_CHUNK_SZ (256UL * 1024UL * 1024UL) +#define KPTE_BITMAP_BYTES \ + ((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 8) + +unsigned long kern_linear_pte_xor[2] __read_mostly; + +/* A bitmap, one bit for every 256MB of physical memory. If the bit + * is clear, we should use a 4MB page (via kern_linear_pte_xor[0]) else + * if set we should use a 256MB page (via kern_linear_pte_xor[1]). + */ +unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)]; + +/* A special kernel TSB for 4MB and 256MB linear mappings. */ +struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES]; + #define MAX_BANKS 32 static struct linux_prom64_registers pavail[MAX_BANKS] __initdata; @@ -111,11 +130,9 @@ static void __init read_obp_memory(const char *property, unsigned long *sparc64_valid_addr_bitmap __read_mostly; -/* Ugly, but necessary... -DaveM */ -unsigned long phys_base __read_mostly; +/* Kernel physical address base and size in bytes. */ unsigned long kern_base __read_mostly; unsigned long kern_size __read_mostly; -unsigned long pfn_base __read_mostly; /* get_new_mmu_context() uses "cache + 1". */ DEFINE_SPINLOCK(ctx_alloc_lock); @@ -141,24 +158,28 @@ unsigned long sparc64_kern_sec_context __read_mostly; int bigkernel = 0; -/* XXX Tune this... */ -#define PGT_CACHE_LOW 25 -#define PGT_CACHE_HIGH 50 +kmem_cache_t *pgtable_cache __read_mostly; + +static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) +{ + clear_page(addr); +} + +extern void tsb_cache_init(void); -void check_pgt_cache(void) +void pgtable_cache_init(void) { - preempt_disable(); - if (pgtable_cache_size > PGT_CACHE_HIGH) { - do { - if (pgd_quicklist) - free_pgd_slow(get_pgd_fast()); - if (pte_quicklist[0]) - free_pte_slow(pte_alloc_one_fast(NULL, 0)); - if (pte_quicklist[1]) - free_pte_slow(pte_alloc_one_fast(NULL, 1 << (PAGE_SHIFT + 10))); - } while (pgtable_cache_size > PGT_CACHE_LOW); + pgtable_cache = kmem_cache_create("pgtable_cache", + PAGE_SIZE, PAGE_SIZE, + SLAB_HWCACHE_ALIGN | + SLAB_MUST_HWCACHE_ALIGN, + zero_ctor, + NULL); + if (!pgtable_cache) { + prom_printf("Could not create pgtable_cache\n"); + prom_halt(); } - preempt_enable(); + tsb_cache_init(); } #ifdef CONFIG_DEBUG_DCFLUSH @@ -168,8 +189,9 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); #endif #endif -__inline__ void flush_dcache_page_impl(struct page *page) +inline void flush_dcache_page_impl(struct page *page) { + BUG_ON(tlb_type == hypervisor); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif @@ -186,8 +208,8 @@ __inline__ void flush_dcache_page_impl(struct page *page) } #define PG_dcache_dirty PG_arch_1 -#define PG_dcache_cpu_shift 24 -#define PG_dcache_cpu_mask (256 - 1) +#define PG_dcache_cpu_shift 24UL +#define PG_dcache_cpu_mask (256UL - 1UL) #if NR_CPUS > 256 #error D-cache dirty tracking and thread_info->cpu need fixing for > 256 cpus @@ -243,32 +265,78 @@ static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long c : "g1", "g7"); } +static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte) +{ + unsigned long tsb_addr = (unsigned long) ent; + + if (tlb_type == cheetah_plus || tlb_type == hypervisor) + tsb_addr = __pa(tsb_addr); + + __tsb_insert(tsb_addr, tag, pte); +} + +unsigned long _PAGE_ALL_SZ_BITS __read_mostly; +unsigned long _PAGE_SZBITS __read_mostly; + void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) { - struct page *page; - unsigned long pfn; - unsigned long pg_flags; - - pfn = pte_pfn(pte); - if (pfn_valid(pfn) && - (page = pfn_to_page(pfn), page_mapping(page)) && - ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { - int cpu = ((pg_flags >> PG_dcache_cpu_shift) & - PG_dcache_cpu_mask); - int this_cpu = get_cpu(); - - /* This is just to optimize away some function calls - * in the SMP case. - */ - if (cpu == this_cpu) - flush_dcache_page_impl(page); - else - smp_flush_dcache_page_impl(page, cpu); + struct mm_struct *mm; + struct tsb *tsb; + unsigned long tag, flags; + unsigned long tsb_index, tsb_hash_shift; + + if (tlb_type != hypervisor) { + unsigned long pfn = pte_pfn(pte); + unsigned long pg_flags; + struct page *page; + + if (pfn_valid(pfn) && + (page = pfn_to_page(pfn), page_mapping(page)) && + ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { + int cpu = ((pg_flags >> PG_dcache_cpu_shift) & + PG_dcache_cpu_mask); + int this_cpu = get_cpu(); + + /* This is just to optimize away some function calls + * in the SMP case. + */ + if (cpu == this_cpu) + flush_dcache_page_impl(page); + else + smp_flush_dcache_page_impl(page, cpu); + + clear_dcache_dirty_cpu(page, cpu); - clear_dcache_dirty_cpu(page, cpu); + put_cpu(); + } + } - put_cpu(); + mm = vma->vm_mm; + + tsb_index = MM_TSB_BASE; + tsb_hash_shift = PAGE_SHIFT; + + spin_lock_irqsave(&mm->context.lock, flags); + +#ifdef CONFIG_HUGETLB_PAGE + if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { + if ((tlb_type == hypervisor && + (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || + (tlb_type != hypervisor && + (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U)) { + tsb_index = MM_TSB_HUGE; + tsb_hash_shift = HPAGE_SHIFT; + } } +#endif + + tsb = mm->context.tsb_block[tsb_index].tsb; + tsb += ((address >> tsb_hash_shift) & + (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); + tag = (address >> 22UL); + tsb_insert(tsb, tag, pte_val(pte)); + + spin_unlock_irqrestore(&mm->context.lock, flags); } void flush_dcache_page(struct page *page) @@ -276,6 +344,9 @@ void flush_dcache_page(struct page *page) struct address_space *mapping; int this_cpu; + if (tlb_type == hypervisor) + return; + /* Do not bother with the expensive D-cache flush if it * is merely the zero page. The 'bigcore' testcase in GDB * causes this case to run millions of times. @@ -311,7 +382,7 @@ out: void __kprobes flush_icache_range(unsigned long start, unsigned long end) { - /* Cheetah has coherent I-cache. */ + /* Cheetah and Hypervisor platform cpus have coherent I-cache. */ if (tlb_type == spitfire) { unsigned long kaddr; @@ -320,16 +391,6 @@ void __kprobes flush_icache_range(unsigned long start, unsigned long end) } } -unsigned long page_to_pfn(struct page *page) -{ - return (unsigned long) ((page - mem_map) + pfn_base); -} - -struct page *pfn_to_page(unsigned long pfn) -{ - return (mem_map + (pfn - pfn_base)); -} - void show_mem(void) { printk("Mem-info:\n"); @@ -338,7 +399,6 @@ void show_mem(void) nr_swap_pages << (PAGE_SHIFT-10)); printk("%ld pages of RAM\n", num_physpages); printk("%d free pages\n", nr_free_pages()); - printk("%d pages in page table cache\n",pgtable_cache_size); } void mmu_info(struct seq_file *m) @@ -349,6 +409,8 @@ void mmu_info(struct seq_file *m) seq_printf(m, "MMU Type\t: Cheetah+\n"); else if (tlb_type == spitfire) seq_printf(m, "MMU Type\t: Spitfire\n"); + else if (tlb_type == hypervisor) + seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n"); else seq_printf(m, "MMU Type\t: ???\n"); @@ -371,45 +433,13 @@ struct linux_prom_translation { /* Exported for kernel TLB miss handling in ktlb.S */ struct linux_prom_translation prom_trans[512] __read_mostly; unsigned int prom_trans_ents __read_mostly; -unsigned int swapper_pgd_zero __read_mostly; - -extern unsigned long prom_boot_page; -extern void prom_remap(unsigned long physpage, unsigned long virtpage, int mmu_ihandle); -extern int prom_get_mmu_ihandle(void); -extern void register_prom_callbacks(void); /* Exported for SMP bootup purposes. */ unsigned long kern_locked_tte_data; -/* - * Translate PROM's mapping we capture at boot time into physical address. - * The second parameter is only set from prom_callback() invocations. - */ -unsigned long prom_virt_to_phys(unsigned long promva, int *error) -{ - int i; - - for (i = 0; i < prom_trans_ents; i++) { - struct linux_prom_translation *p = &prom_trans[i]; - - if (promva >= p->virt && - promva < (p->virt + p->size)) { - unsigned long base = p->data & _PAGE_PADDR; - - if (error) - *error = 0; - return base + (promva & (8192 - 1)); - } - } - if (error) - *error = 1; - return 0UL; -} - /* The obp translations are saved based on 8k pagesize, since obp can * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS -> - * HI_OBP_ADDRESS range are handled in ktlb.S and do not use the vpte - * scheme (also, see rant in inherit_locked_prom_mappings()). + * HI_OBP_ADDRESS range are handled in ktlb.S. */ static inline int in_obp_range(unsigned long vaddr) { @@ -490,6 +520,36 @@ static void __init read_obp_translations(void) } } +static void __init hypervisor_tlb_lock(unsigned long vaddr, + unsigned long pte, + unsigned long mmu) +{ + register unsigned long func asm("%o5"); + register unsigned long arg0 asm("%o0"); + register unsigned long arg1 asm("%o1"); + register unsigned long arg2 asm("%o2"); + register unsigned long arg3 asm("%o3"); + + func = HV_FAST_MMU_MAP_PERM_ADDR; + arg0 = vaddr; + arg1 = 0; + arg2 = pte; + arg3 = mmu; + __asm__ __volatile__("ta 0x80" + : "=&r" (func), "=&r" (arg0), + "=&r" (arg1), "=&r" (arg2), + "=&r" (arg3) + : "0" (func), "1" (arg0), "2" (arg1), + "3" (arg2), "4" (arg3)); + if (arg0 != 0) { + prom_printf("hypervisor_tlb_lock[%lx:%lx:%lx:%lx]: " + "errors with %lx\n", vaddr, 0, pte, mmu, arg0); + prom_halt(); + } +} + +static unsigned long kern_large_tte(unsigned long paddr); + static void __init remap_kernel(void) { unsigned long phys_page, tte_vaddr, tte_data; @@ -497,25 +557,34 @@ static void __init remap_kernel(void) tte_vaddr = (unsigned long) KERNBASE; phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL; - tte_data = (phys_page | (_PAGE_VALID | _PAGE_SZ4MB | - _PAGE_CP | _PAGE_CV | _PAGE_P | - _PAGE_L | _PAGE_W)); + tte_data = kern_large_tte(phys_page); kern_locked_tte_data = tte_data; - /* Now lock us into the TLBs via OBP. */ - prom_dtlb_load(tlb_ent, tte_data, tte_vaddr); - prom_itlb_load(tlb_ent, tte_data, tte_vaddr); - if (bigkernel) { - tlb_ent -= 1; - prom_dtlb_load(tlb_ent, - tte_data + 0x400000, - tte_vaddr + 0x400000); - prom_itlb_load(tlb_ent, - tte_data + 0x400000, - tte_vaddr + 0x400000); + /* Now lock us into the TLBs via Hypervisor or OBP. */ + if (tlb_type == hypervisor) { + hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU); + hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU); + if (bigkernel) { + tte_vaddr += 0x400000; + tte_data += 0x400000; + hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU); + hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU); + } + } else { + prom_dtlb_load(tlb_ent, tte_data, tte_vaddr); + prom_itlb_load(tlb_ent, tte_data, tte_vaddr); + if (bigkernel) { + tlb_ent -= 1; + prom_dtlb_load(tlb_ent, + tte_data + 0x400000, + tte_vaddr + 0x400000); + prom_itlb_load(tlb_ent, + tte_data + 0x400000, + tte_vaddr + 0x400000); + } + sparc64_highest_unlocked_tlb_ent = tlb_ent - 1; } - sparc64_highest_unlocked_tlb_ent = tlb_ent - 1; if (tlb_type == cheetah_plus) { sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 | CTX_CHEETAH_PLUS_NUC); @@ -533,372 +602,14 @@ static void __init inherit_prom_mappings(void) prom_printf("Remapping the kernel... "); remap_kernel(); prom_printf("done.\n"); - - prom_printf("Registering callbacks... "); - register_prom_callbacks(); - prom_printf("done.\n"); } -/* The OBP specifications for sun4u mark 0xfffffffc00000000 and - * upwards as reserved for use by the firmware (I wonder if this - * will be the same on Cheetah...). We use this virtual address - * range for the VPTE table mappings of the nucleus so we need - * to zap them when we enter the PROM. -DaveM - */ -static void __flush_nucleus_vptes(void) -{ - unsigned long prom_reserved_base = 0xfffffffc00000000UL; - int i; - - /* Only DTLB must be checked for VPTE entries. */ - if (tlb_type == spitfire) { - for (i = 0; i < 63; i++) { - unsigned long tag; - - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no cheetah+ - * page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - tag = spitfire_get_dtlb_tag(i); - if (((tag & ~(PAGE_MASK)) == 0) && - ((tag & (PAGE_MASK)) >= prom_reserved_base)) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : /* no outputs */ - : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - spitfire_put_dtlb_data(i, 0x0UL); - } - } - } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { - for (i = 0; i < 512; i++) { - unsigned long tag = cheetah_get_dtlb_tag(i, 2); - - if ((tag & ~PAGE_MASK) == 0 && - (tag & PAGE_MASK) >= prom_reserved_base) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : /* no outputs */ - : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - cheetah_put_dtlb_data(i, 0x0UL, 2); - } - - if (tlb_type != cheetah_plus) - continue; - - tag = cheetah_get_dtlb_tag(i, 3); - - if ((tag & ~PAGE_MASK) == 0 && - (tag & PAGE_MASK) >= prom_reserved_base) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : /* no outputs */ - : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - cheetah_put_dtlb_data(i, 0x0UL, 3); - } - } - } else { - /* Implement me :-) */ - BUG(); - } -} - -static int prom_ditlb_set; -struct prom_tlb_entry { - int tlb_ent; - unsigned long tlb_tag; - unsigned long tlb_data; -}; -struct prom_tlb_entry prom_itlb[16], prom_dtlb[16]; - void prom_world(int enter) { - unsigned long pstate; - int i; - if (!enter) set_fs((mm_segment_t) { get_thread_current_ds() }); - if (!prom_ditlb_set) - return; - - /* Make sure the following runs atomically. */ - __asm__ __volatile__("flushw\n\t" - "rdpr %%pstate, %0\n\t" - "wrpr %0, %1, %%pstate" - : "=r" (pstate) - : "i" (PSTATE_IE)); - - if (enter) { - /* Kick out nucleus VPTEs. */ - __flush_nucleus_vptes(); - - /* Install PROM world. */ - for (i = 0; i < 16; i++) { - if (prom_dtlb[i].tlb_ent != -1) { - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "membar #Sync" - : : "r" (prom_dtlb[i].tlb_tag), "r" (TLB_TAG_ACCESS), - "i" (ASI_DMMU)); - if (tlb_type == spitfire) - spitfire_put_dtlb_data(prom_dtlb[i].tlb_ent, - prom_dtlb[i].tlb_data); - else if (tlb_type == cheetah || tlb_type == cheetah_plus) - cheetah_put_ldtlb_data(prom_dtlb[i].tlb_ent, - prom_dtlb[i].tlb_data); - } - if (prom_itlb[i].tlb_ent != -1) { - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "membar #Sync" - : : "r" (prom_itlb[i].tlb_tag), - "r" (TLB_TAG_ACCESS), - "i" (ASI_IMMU)); - if (tlb_type == spitfire) - spitfire_put_itlb_data(prom_itlb[i].tlb_ent, - prom_itlb[i].tlb_data); - else if (tlb_type == cheetah || tlb_type == cheetah_plus) - cheetah_put_litlb_data(prom_itlb[i].tlb_ent, - prom_itlb[i].tlb_data); - } - } - } else { - for (i = 0; i < 16; i++) { - if (prom_dtlb[i].tlb_ent != -1) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - if (tlb_type == spitfire) - spitfire_put_dtlb_data(prom_dtlb[i].tlb_ent, 0x0UL); - else - cheetah_put_ldtlb_data(prom_dtlb[i].tlb_ent, 0x0UL); - } - if (prom_itlb[i].tlb_ent != -1) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : : "r" (TLB_TAG_ACCESS), - "i" (ASI_IMMU)); - if (tlb_type == spitfire) - spitfire_put_itlb_data(prom_itlb[i].tlb_ent, 0x0UL); - else - cheetah_put_litlb_data(prom_itlb[i].tlb_ent, 0x0UL); - } - } - } - __asm__ __volatile__("wrpr %0, 0, %%pstate" - : : "r" (pstate)); -} - -void inherit_locked_prom_mappings(int save_p) -{ - int i; - int dtlb_seen = 0; - int itlb_seen = 0; - - /* Fucking losing PROM has more mappings in the TLB, but - * it (conveniently) fails to mention any of these in the - * translations property. The only ones that matter are - * the locked PROM tlb entries, so we impose the following - * irrecovable rule on the PROM, it is allowed 8 locked - * entries in the ITLB and 8 in the DTLB. - * - * Supposedly the upper 16GB of the address space is - * reserved for OBP, BUT I WISH THIS WAS DOCUMENTED - * SOMEWHERE!!!!!!!!!!!!!!!!! Furthermore the entire interface - * used between the client program and the firmware on sun5 - * systems to coordinate mmu mappings is also COMPLETELY - * UNDOCUMENTED!!!!!! Thanks S(t)un! - */ - if (save_p) { - for (i = 0; i < 16; i++) { - prom_itlb[i].tlb_ent = -1; - prom_dtlb[i].tlb_ent = -1; - } - } - if (tlb_type == spitfire) { - int high = sparc64_highest_unlocked_tlb_ent; - for (i = 0; i <= high; i++) { - unsigned long data; - - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no cheetah+ - * page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - data = spitfire_get_dtlb_data(i); - if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) { - unsigned long tag; - - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no - * cheetah+ page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - tag = spitfire_get_dtlb_tag(i); - if (save_p) { - prom_dtlb[dtlb_seen].tlb_ent = i; - prom_dtlb[dtlb_seen].tlb_tag = tag; - prom_dtlb[dtlb_seen].tlb_data = data; - } - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - spitfire_put_dtlb_data(i, 0x0UL); - - dtlb_seen++; - if (dtlb_seen > 15) - break; - } - } - - for (i = 0; i < high; i++) { - unsigned long data; - - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no - * cheetah+ page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - data = spitfire_get_itlb_data(i); - if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) { - unsigned long tag; - - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no - * cheetah+ page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - tag = spitfire_get_itlb_tag(i); - if (save_p) { - prom_itlb[itlb_seen].tlb_ent = i; - prom_itlb[itlb_seen].tlb_tag = tag; - prom_itlb[itlb_seen].tlb_data = data; - } - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); - spitfire_put_itlb_data(i, 0x0UL); - - itlb_seen++; - if (itlb_seen > 15) - break; - } - } - } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { - int high = sparc64_highest_unlocked_tlb_ent; - - for (i = 0; i <= high; i++) { - unsigned long data; - - data = cheetah_get_ldtlb_data(i); - if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) { - unsigned long tag; - - tag = cheetah_get_ldtlb_tag(i); - if (save_p) { - prom_dtlb[dtlb_seen].tlb_ent = i; - prom_dtlb[dtlb_seen].tlb_tag = tag; - prom_dtlb[dtlb_seen].tlb_data = data; - } - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - cheetah_put_ldtlb_data(i, 0x0UL); - - dtlb_seen++; - if (dtlb_seen > 15) - break; - } - } - - for (i = 0; i < high; i++) { - unsigned long data; - - data = cheetah_get_litlb_data(i); - if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) { - unsigned long tag; - - tag = cheetah_get_litlb_tag(i); - if (save_p) { - prom_itlb[itlb_seen].tlb_ent = i; - prom_itlb[itlb_seen].tlb_tag = tag; - prom_itlb[itlb_seen].tlb_data = data; - } - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); - cheetah_put_litlb_data(i, 0x0UL); - - itlb_seen++; - if (itlb_seen > 15) - break; - } - } - } else { - /* Implement me :-) */ - BUG(); - } - if (save_p) - prom_ditlb_set = 1; -} - -/* Give PROM back his world, done during reboots... */ -void prom_reload_locked(void) -{ - int i; - - for (i = 0; i < 16; i++) { - if (prom_dtlb[i].tlb_ent != -1) { - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "membar #Sync" - : : "r" (prom_dtlb[i].tlb_tag), "r" (TLB_TAG_ACCESS), - "i" (ASI_DMMU)); - if (tlb_type == spitfire) - spitfire_put_dtlb_data(prom_dtlb[i].tlb_ent, - prom_dtlb[i].tlb_data); - else if (tlb_type == cheetah || tlb_type == cheetah_plus) - cheetah_put_ldtlb_data(prom_dtlb[i].tlb_ent, - prom_dtlb[i].tlb_data); - } - - if (prom_itlb[i].tlb_ent != -1) { - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "membar #Sync" - : : "r" (prom_itlb[i].tlb_tag), - "r" (TLB_TAG_ACCESS), - "i" (ASI_IMMU)); - if (tlb_type == spitfire) - spitfire_put_itlb_data(prom_itlb[i].tlb_ent, - prom_itlb[i].tlb_data); - else - cheetah_put_litlb_data(prom_itlb[i].tlb_ent, - prom_itlb[i].tlb_data); - } - } + __asm__ __volatile__("flushw"); } #ifdef DCACHE_ALIASING_POSSIBLE @@ -914,7 +625,7 @@ void __flush_dcache_range(unsigned long start, unsigned long end) if (++n >= 512) break; } - } else { + } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { start = __pa(start); end = __pa(end); for (va = start; va < end; va += 32) @@ -927,63 +638,6 @@ void __flush_dcache_range(unsigned long start, unsigned long end) } #endif /* DCACHE_ALIASING_POSSIBLE */ -/* If not locked, zap it. */ -void __flush_tlb_all(void) -{ - unsigned long pstate; - int i; - - __asm__ __volatile__("flushw\n\t" - "rdpr %%pstate, %0\n\t" - "wrpr %0, %1, %%pstate" - : "=r" (pstate) - : "i" (PSTATE_IE)); - if (tlb_type == spitfire) { - for (i = 0; i < 64; i++) { - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no - * cheetah+ page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - if (!(spitfire_get_dtlb_data(i) & _PAGE_L)) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : /* no outputs */ - : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); - spitfire_put_dtlb_data(i, 0x0UL); - } - - /* Spitfire Errata #32 workaround */ - /* NOTE: Always runs on spitfire, so no - * cheetah+ page size encodings. - */ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "flush %%g6" - : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - - if (!(spitfire_get_itlb_data(i) & _PAGE_L)) { - __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" - "membar #Sync" - : /* no outputs */ - : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); - spitfire_put_itlb_data(i, 0x0UL); - } - } - } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { - cheetah_flush_dtlb_all(); - cheetah_flush_itlb_all(); - } - __asm__ __volatile__("wrpr %0, 0, %%pstate" - : : "r" (pstate)); -} - /* Caller does TLB context flushing on local CPU if necessary. * The caller also ensures that CTX_VALID(mm->context) is false. * @@ -991,17 +645,21 @@ void __flush_tlb_all(void) * let the user have CTX 0 (nucleus) or we ever use a CTX * version of zero (and thus NO_CONTEXT would not be caught * by version mis-match tests in mmu_context.h). + * + * Always invoked with interrupts disabled. */ void get_new_mmu_context(struct mm_struct *mm) { unsigned long ctx, new_ctx; unsigned long orig_pgsz_bits; - + unsigned long flags; + int new_version; - spin_lock(&ctx_alloc_lock); + spin_lock_irqsave(&ctx_alloc_lock, flags); orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); ctx = (tlb_context_cache + 1) & CTX_NR_MASK; new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); + new_version = 0; if (new_ctx >= (1 << CTX_NR_BITS)) { new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); if (new_ctx >= ctx) { @@ -1024,6 +682,7 @@ void get_new_mmu_context(struct mm_struct *mm) mmu_context_bmap[i + 2] = 0; mmu_context_bmap[i + 3] = 0; } + new_version = 1; goto out; } } @@ -1032,79 +691,10 @@ void get_new_mmu_context(struct mm_struct *mm) out: tlb_context_cache = new_ctx; mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; - spin_unlock(&ctx_alloc_lock); -} - -#ifndef CONFIG_SMP -struct pgtable_cache_struct pgt_quicklists; -#endif + spin_unlock_irqrestore(&ctx_alloc_lock, flags); -/* OK, we have to color these pages. The page tables are accessed - * by non-Dcache enabled mapping in the VPTE area by the dtlb_backend.S - * code, as well as by PAGE_OFFSET range direct-mapped addresses by - * other parts of the kernel. By coloring, we make sure that the tlbmiss - * fast handlers do not get data from old/garbage dcache lines that - * correspond to an old/stale virtual address (user/kernel) that - * previously mapped the pagetable page while accessing vpte range - * addresses. The idea is that if the vpte color and PAGE_OFFSET range - * color is the same, then when the kernel initializes the pagetable - * using the later address range, accesses with the first address - * range will see the newly initialized data rather than the garbage. - */ -#ifdef DCACHE_ALIASING_POSSIBLE -#define DC_ALIAS_SHIFT 1 -#else -#define DC_ALIAS_SHIFT 0 -#endif -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - struct page *page; - unsigned long color; - - { - pte_t *ptep = pte_alloc_one_fast(mm, address); - - if (ptep) - return ptep; - } - - color = VPTE_COLOR(address); - page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, DC_ALIAS_SHIFT); - if (page) { - unsigned long *to_free; - unsigned long paddr; - pte_t *pte; - -#ifdef DCACHE_ALIASING_POSSIBLE - set_page_count(page, 1); - ClearPageCompound(page); - - set_page_count((page + 1), 1); - ClearPageCompound(page + 1); -#endif - paddr = (unsigned long) page_address(page); - memset((char *)paddr, 0, (PAGE_SIZE << DC_ALIAS_SHIFT)); - - if (!color) { - pte = (pte_t *) paddr; - to_free = (unsigned long *) (paddr + PAGE_SIZE); - } else { - pte = (pte_t *) (paddr + PAGE_SIZE); - to_free = (unsigned long *) paddr; - } - -#ifdef DCACHE_ALIASING_POSSIBLE - /* Now free the other one up, adjust cache size. */ - preempt_disable(); - *to_free = (unsigned long) pte_quicklist[color ^ 0x1]; - pte_quicklist[color ^ 0x1] = to_free; - pgtable_cache_size++; - preempt_enable(); -#endif - - return pte; - } - return NULL; + if (unlikely(new_version)) + smp_new_mmu_context_version(); } void sparc_ultra_dump_itlb(void) @@ -1196,9 +786,78 @@ void sparc_ultra_dump_dtlb(void) extern unsigned long cmdline_memory_size; -unsigned long __init bootmem_init(unsigned long *pages_avail) +/* Find a free area for the bootmem map, avoiding the kernel image + * and the initial ramdisk. + */ +static unsigned long __init choose_bootmap_pfn(unsigned long start_pfn, + unsigned long end_pfn) { - unsigned long bootmap_size, start_pfn, end_pfn; + unsigned long avoid_start, avoid_end, bootmap_size; + int i; + + bootmap_size = ((end_pfn - start_pfn) + 7) / 8; + bootmap_size = ALIGN(bootmap_size, sizeof(long)); + + avoid_start = avoid_end = 0; +#ifdef CONFIG_BLK_DEV_INITRD + avoid_start = initrd_start; + avoid_end = PAGE_ALIGN(initrd_end); +#endif + +#ifdef CONFIG_DEBUG_BOOTMEM + prom_printf("choose_bootmap_pfn: kern[%lx:%lx] avoid[%lx:%lx]\n", + kern_base, PAGE_ALIGN(kern_base + kern_size), + avoid_start, avoid_end); +#endif + for (i = 0; i < pavail_ents; i++) { + unsigned long start, end; + + start = pavail[i].phys_addr; + end = start + pavail[i].reg_size; + + while (start < end) { + if (start >= kern_base && + start < PAGE_ALIGN(kern_base + kern_size)) { + start = PAGE_ALIGN(kern_base + kern_size); + continue; + } + if (start >= avoid_start && start < avoid_end) { + start = avoid_end; + continue; + } + + if ((end - start) < bootmap_size) + break; + + if (start < kern_base && + (start + bootmap_size) > kern_base) { + start = PAGE_ALIGN(kern_base + kern_size); + continue; + } + + if (start < avoid_start && + (start + bootmap_size) > avoid_start) { + start = avoid_end; + continue; + } + + /* OK, it doesn't overlap anything, use it. */ +#ifdef CONFIG_DEBUG_BOOTMEM + prom_printf("choose_bootmap_pfn: Using %lx [%lx]\n", + start >> PAGE_SHIFT, start); +#endif + return start >> PAGE_SHIFT; + } + } + + prom_printf("Cannot find free area for bootmap, aborting.\n"); + prom_halt(); +} + +static unsigned long __init bootmem_init(unsigned long *pages_avail, + unsigned long phys_base) +{ + unsigned long bootmap_size, end_pfn; unsigned long end_of_phys_memory = 0UL; unsigned long bootmap_pfn, bytes_avail, size; int i; @@ -1236,14 +895,6 @@ unsigned long __init bootmem_init(unsigned long *pages_avail) *pages_avail = bytes_avail >> PAGE_SHIFT; - /* Start with page aligned address of last symbol in kernel - * image. The kernel is hard mapped below PAGE_OFFSET in a - * 4MB locked TLB translation. - */ - start_pfn = PAGE_ALIGN(kern_base + kern_size) >> PAGE_SHIFT; - - bootmap_pfn = start_pfn; - end_pfn = end_of_phys_memory >> PAGE_SHIFT; #ifdef CONFIG_BLK_DEV_INITRD @@ -1260,23 +911,22 @@ unsigned long __init bootmem_init(unsigned long *pages_avail) "(0x%016lx > 0x%016lx)\ndisabling initrd\n", initrd_end, end_of_phys_memory); initrd_start = 0; - } - if (initrd_start) { - if (initrd_start >= (start_pfn << PAGE_SHIFT) && - initrd_start < (start_pfn << PAGE_SHIFT) + 2 * PAGE_SIZE) - bootmap_pfn = PAGE_ALIGN (initrd_end) >> PAGE_SHIFT; + initrd_end = 0; } } #endif /* Initialize the boot-time allocator. */ max_pfn = max_low_pfn = end_pfn; - min_low_pfn = pfn_base; + min_low_pfn = (phys_base >> PAGE_SHIFT); + + bootmap_pfn = choose_bootmap_pfn(min_low_pfn, end_pfn); #ifdef CONFIG_DEBUG_BOOTMEM prom_printf("init_bootmem(min[%lx], bootmap[%lx], max[%lx])\n", min_low_pfn, bootmap_pfn, max_low_pfn); #endif - bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn, pfn_base, end_pfn); + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn, + min_low_pfn, end_pfn); /* Now register the available physical memory with the * allocator. @@ -1324,9 +974,26 @@ unsigned long __init bootmem_init(unsigned long *pages_avail) reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size); *pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT; + for (i = 0; i < pavail_ents; i++) { + unsigned long start_pfn, end_pfn; + + start_pfn = pavail[i].phys_addr >> PAGE_SHIFT; + end_pfn = (start_pfn + (pavail[i].reg_size >> PAGE_SHIFT)); +#ifdef CONFIG_DEBUG_BOOTMEM + prom_printf("memory_present(0, %lx, %lx)\n", + start_pfn, end_pfn); +#endif + memory_present(0, start_pfn, end_pfn); + } + + sparse_init(); + return end_pfn; } +static struct linux_prom64_registers pall[MAX_BANKS] __initdata; +static int pall_ents __initdata; + #ifdef CONFIG_DEBUG_PAGEALLOC static unsigned long kernel_map_range(unsigned long pstart, unsigned long pend, pgprot_t prot) { @@ -1382,14 +1049,44 @@ static unsigned long kernel_map_range(unsigned long pstart, unsigned long pend, return alloc_bytes; } -static struct linux_prom64_registers pall[MAX_BANKS] __initdata; -static int pall_ents __initdata; - extern unsigned int kvmap_linear_patch[1]; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +static void __init mark_kpte_bitmap(unsigned long start, unsigned long end) +{ + const unsigned long shift_256MB = 28; + const unsigned long mask_256MB = ((1UL << shift_256MB) - 1UL); + const unsigned long size_256MB = (1UL << shift_256MB); + + while (start < end) { + long remains; + + remains = end - start; + if (remains < size_256MB) + break; + + if (start & mask_256MB) { + start = (start + size_256MB) & ~mask_256MB; + continue; + } + + while (remains >= size_256MB) { + unsigned long index = start >> shift_256MB; + + __set_bit(index, kpte_linear_bitmap); + + start += size_256MB; + remains -= size_256MB; + } + } +} static void __init kernel_physical_mapping_init(void) { - unsigned long i, mem_alloced = 0UL; + unsigned long i; +#ifdef CONFIG_DEBUG_PAGEALLOC + unsigned long mem_alloced = 0UL; +#endif read_obp_memory("reg", &pall[0], &pall_ents); @@ -1398,10 +1095,16 @@ static void __init kernel_physical_mapping_init(void) phys_start = pall[i].phys_addr; phys_end = phys_start + pall[i].reg_size; + + mark_kpte_bitmap(phys_start, phys_end); + +#ifdef CONFIG_DEBUG_PAGEALLOC mem_alloced += kernel_map_range(phys_start, phys_end, PAGE_KERNEL); +#endif } +#ifdef CONFIG_DEBUG_PAGEALLOC printk("Allocated %ld bytes for kernel page tables.\n", mem_alloced); @@ -1409,8 +1112,10 @@ static void __init kernel_physical_mapping_init(void) flushi(&kvmap_linear_patch[0]); __flush_tlb_all(); +#endif } +#ifdef CONFIG_DEBUG_PAGEALLOC void kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; @@ -1419,6 +1124,9 @@ void kernel_map_pages(struct page *page, int numpages, int enable) kernel_map_range(phys_start, phys_end, (enable ? PAGE_KERNEL : __pgprot(0))); + flush_tsb_kernel_range(PAGE_OFFSET + phys_start, + PAGE_OFFSET + phys_end); + /* we should perform an IPI and flush all tlbs, * but that can deadlock->flush only current cpu. */ @@ -1439,18 +1147,150 @@ unsigned long __init find_ecache_flush_span(unsigned long size) return ~0UL; } +static void __init tsb_phys_patch(void) +{ + struct tsb_ldquad_phys_patch_entry *pquad; + struct tsb_phys_patch_entry *p; + + pquad = &__tsb_ldquad_phys_patch; + while (pquad < &__tsb_ldquad_phys_patch_end) { + unsigned long addr = pquad->addr; + + if (tlb_type == hypervisor) + *(unsigned int *) addr = pquad->sun4v_insn; + else + *(unsigned int *) addr = pquad->sun4u_insn; + wmb(); + __asm__ __volatile__("flush %0" + : /* no outputs */ + : "r" (addr)); + + pquad++; + } + + p = &__tsb_phys_patch; + while (p < &__tsb_phys_patch_end) { + unsigned long addr = p->addr; + + *(unsigned int *) addr = p->insn; + wmb(); + __asm__ __volatile__("flush %0" + : /* no outputs */ + : "r" (addr)); + + p++; + } +} + +/* Don't mark as init, we give this to the Hypervisor. */ +static struct hv_tsb_descr ktsb_descr[2]; +extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; + +static void __init sun4v_ktsb_init(void) +{ + unsigned long ktsb_pa; + + /* First KTSB for PAGE_SIZE mappings. */ + ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); + + switch (PAGE_SIZE) { + case 8 * 1024: + default: + ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K; + ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K; + break; + + case 64 * 1024: + ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K; + ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K; + break; + + case 512 * 1024: + ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K; + ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K; + break; + + case 4 * 1024 * 1024: + ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB; + ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB; + break; + }; + + ktsb_descr[0].assoc = 1; + ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES; + ktsb_descr[0].ctx_idx = 0; + ktsb_descr[0].tsb_base = ktsb_pa; + ktsb_descr[0].resv = 0; + + /* Second KTSB for 4MB/256MB mappings. */ + ktsb_pa = (kern_base + + ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); + + ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB; + ktsb_descr[1].pgsz_mask = (HV_PGSZ_MASK_4MB | + HV_PGSZ_MASK_256MB); + ktsb_descr[1].assoc = 1; + ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES; + ktsb_descr[1].ctx_idx = 0; + ktsb_descr[1].tsb_base = ktsb_pa; + ktsb_descr[1].resv = 0; +} + +void __cpuinit sun4v_ktsb_register(void) +{ + register unsigned long func asm("%o5"); + register unsigned long arg0 asm("%o0"); + register unsigned long arg1 asm("%o1"); + unsigned long pa; + + pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE); + + func = HV_FAST_MMU_TSB_CTX0; + arg0 = 2; + arg1 = pa; + __asm__ __volatile__("ta %6" + : "=&r" (func), "=&r" (arg0), "=&r" (arg1) + : "0" (func), "1" (arg0), "2" (arg1), + "i" (HV_FAST_TRAP)); +} + /* paging_init() sets up the page tables */ extern void cheetah_ecache_flush_init(void); +extern void sun4v_patch_tlb_handlers(void); static unsigned long last_valid_pfn; pgd_t swapper_pg_dir[2048]; +static void sun4u_pgprot_init(void); +static void sun4v_pgprot_init(void); + void __init paging_init(void) { - unsigned long end_pfn, pages_avail, shift; + unsigned long end_pfn, pages_avail, shift, phys_base; unsigned long real_end, i; + kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL; + kern_size = (unsigned long)&_end - (unsigned long)KERNBASE; + + /* Invalidate both kernel TSBs. */ + memset(swapper_tsb, 0x40, sizeof(swapper_tsb)); + memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); + + if (tlb_type == hypervisor) + sun4v_pgprot_init(); + else + sun4u_pgprot_init(); + + if (tlb_type == cheetah_plus || + tlb_type == hypervisor) + tsb_phys_patch(); + + if (tlb_type == hypervisor) { + sun4v_patch_tlb_handlers(); + sun4v_ktsb_init(); + } + /* Find available physical memory... */ read_obp_memory("available", &pavail[0], &pavail_ents); @@ -1458,11 +1298,6 @@ void __init paging_init(void) for (i = 0; i < pavail_ents; i++) phys_base = min(phys_base, pavail[i].phys_addr); - pfn_base = phys_base >> PAGE_SHIFT; - - kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL; - kern_size = (unsigned long)&_end - (unsigned long)KERNBASE; - set_bit(0, mmu_context_bmap); shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE); @@ -1486,47 +1321,38 @@ void __init paging_init(void) pud_set(pud_offset(&swapper_pg_dir[0], 0), swapper_low_pmd_dir + (shift / sizeof(pgd_t))); - swapper_pgd_zero = pgd_val(swapper_pg_dir[0]); - inherit_prom_mappings(); - /* Ok, we can use our TLB miss and window trap handlers safely. - * We need to do a quick peek here to see if we are on StarFire - * or not, so setup_tba can setup the IRQ globals correctly (it - * needs to get the hard smp processor id correctly). - */ - { - extern void setup_tba(int); - setup_tba(this_is_starfire); - } - - inherit_locked_prom_mappings(1); + /* Ok, we can use our TLB miss and window trap handlers safely. */ + setup_tba(); __flush_tlb_all(); + if (tlb_type == hypervisor) + sun4v_ktsb_register(); + /* Setup bootmem... */ pages_avail = 0; - last_valid_pfn = end_pfn = bootmem_init(&pages_avail); + last_valid_pfn = end_pfn = bootmem_init(&pages_avail, phys_base); + + max_mapnr = last_valid_pfn; -#ifdef CONFIG_DEBUG_PAGEALLOC kernel_physical_mapping_init(); -#endif { unsigned long zones_size[MAX_NR_ZONES]; unsigned long zholes_size[MAX_NR_ZONES]; - unsigned long npages; int znum; for (znum = 0; znum < MAX_NR_ZONES; znum++) zones_size[znum] = zholes_size[znum] = 0; - npages = end_pfn - pfn_base; - zones_size[ZONE_DMA] = npages; - zholes_size[ZONE_DMA] = npages - pages_avail; + zones_size[ZONE_DMA] = end_pfn; + zholes_size[ZONE_DMA] = end_pfn - pages_avail; free_area_init_node(0, &contig_page_data, zones_size, - phys_base >> PAGE_SHIFT, zholes_size); + __pa(PAGE_OFFSET) >> PAGE_SHIFT, + zholes_size); } device_scan(); @@ -1596,7 +1422,6 @@ void __init mem_init(void) taint_real_pages(); - max_mapnr = last_valid_pfn - pfn_base; high_memory = __va(last_valid_pfn << PAGE_SHIFT); #ifdef CONFIG_DEBUG_BOOTMEM @@ -1653,7 +1478,7 @@ void free_initmem(void) p = virt_to_page(page); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; @@ -1669,10 +1494,349 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; } } #endif + +#define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) +#define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) +#define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) +#define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V) +#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R) +#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R) + +pgprot_t PAGE_KERNEL __read_mostly; +EXPORT_SYMBOL(PAGE_KERNEL); + +pgprot_t PAGE_KERNEL_LOCKED __read_mostly; +pgprot_t PAGE_COPY __read_mostly; + +pgprot_t PAGE_SHARED __read_mostly; +EXPORT_SYMBOL(PAGE_SHARED); + +pgprot_t PAGE_EXEC __read_mostly; +unsigned long pg_iobits __read_mostly; + +unsigned long _PAGE_IE __read_mostly; + +unsigned long _PAGE_E __read_mostly; +EXPORT_SYMBOL(_PAGE_E); + +unsigned long _PAGE_CACHE __read_mostly; +EXPORT_SYMBOL(_PAGE_CACHE); + +static void prot_init_common(unsigned long page_none, + unsigned long page_shared, + unsigned long page_copy, + unsigned long page_readonly, + unsigned long page_exec_bit) +{ + PAGE_COPY = __pgprot(page_copy); + PAGE_SHARED = __pgprot(page_shared); + + protection_map[0x0] = __pgprot(page_none); + protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit); + protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit); + protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit); + protection_map[0x4] = __pgprot(page_readonly); + protection_map[0x5] = __pgprot(page_readonly); + protection_map[0x6] = __pgprot(page_copy); + protection_map[0x7] = __pgprot(page_copy); + protection_map[0x8] = __pgprot(page_none); + protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit); + protection_map[0xa] = __pgprot(page_shared & ~page_exec_bit); + protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit); + protection_map[0xc] = __pgprot(page_readonly); + protection_map[0xd] = __pgprot(page_readonly); + protection_map[0xe] = __pgprot(page_shared); + protection_map[0xf] = __pgprot(page_shared); +} + +static void __init sun4u_pgprot_init(void) +{ + unsigned long page_none, page_shared, page_copy, page_readonly; + unsigned long page_exec_bit; + + PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | + _PAGE_CACHE_4U | _PAGE_P_4U | + __ACCESS_BITS_4U | __DIRTY_BITS_4U | + _PAGE_EXEC_4U); + PAGE_KERNEL_LOCKED = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | + _PAGE_CACHE_4U | _PAGE_P_4U | + __ACCESS_BITS_4U | __DIRTY_BITS_4U | + _PAGE_EXEC_4U | _PAGE_L_4U); + PAGE_EXEC = __pgprot(_PAGE_EXEC_4U); + + _PAGE_IE = _PAGE_IE_4U; + _PAGE_E = _PAGE_E_4U; + _PAGE_CACHE = _PAGE_CACHE_4U; + + pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U | + __ACCESS_BITS_4U | _PAGE_E_4U); + + kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ + 0xfffff80000000000; + kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | + _PAGE_P_4U | _PAGE_W_4U); + + /* XXX Should use 256MB on Panther. XXX */ + kern_linear_pte_xor[1] = kern_linear_pte_xor[0]; + + _PAGE_SZBITS = _PAGE_SZBITS_4U; + _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | + _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | + _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); + + + page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U; + page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | + __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U); + page_copy = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | + __ACCESS_BITS_4U | _PAGE_EXEC_4U); + page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | + __ACCESS_BITS_4U | _PAGE_EXEC_4U); + + page_exec_bit = _PAGE_EXEC_4U; + + prot_init_common(page_none, page_shared, page_copy, page_readonly, + page_exec_bit); +} + +static void __init sun4v_pgprot_init(void) +{ + unsigned long page_none, page_shared, page_copy, page_readonly; + unsigned long page_exec_bit; + + PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID | + _PAGE_CACHE_4V | _PAGE_P_4V | + __ACCESS_BITS_4V | __DIRTY_BITS_4V | + _PAGE_EXEC_4V); + PAGE_KERNEL_LOCKED = PAGE_KERNEL; + PAGE_EXEC = __pgprot(_PAGE_EXEC_4V); + + _PAGE_IE = _PAGE_IE_4V; + _PAGE_E = _PAGE_E_4V; + _PAGE_CACHE = _PAGE_CACHE_4V; + + kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ + 0xfffff80000000000; + kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V | + _PAGE_P_4V | _PAGE_W_4V); + + kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ + 0xfffff80000000000; + kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V | + _PAGE_P_4V | _PAGE_W_4V); + + pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | + __ACCESS_BITS_4V | _PAGE_E_4V); + + _PAGE_SZBITS = _PAGE_SZBITS_4V; + _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | + _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | + _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | + _PAGE_SZ64K_4V | _PAGE_SZ8K_4V); + + page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V; + page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V | + __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V); + page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V | + __ACCESS_BITS_4V | _PAGE_EXEC_4V); + page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V | + __ACCESS_BITS_4V | _PAGE_EXEC_4V); + + page_exec_bit = _PAGE_EXEC_4V; + + prot_init_common(page_none, page_shared, page_copy, page_readonly, + page_exec_bit); +} + +unsigned long pte_sz_bits(unsigned long sz) +{ + if (tlb_type == hypervisor) { + switch (sz) { + case 8 * 1024: + default: + return _PAGE_SZ8K_4V; + case 64 * 1024: + return _PAGE_SZ64K_4V; + case 512 * 1024: + return _PAGE_SZ512K_4V; + case 4 * 1024 * 1024: + return _PAGE_SZ4MB_4V; + }; + } else { + switch (sz) { + case 8 * 1024: + default: + return _PAGE_SZ8K_4U; + case 64 * 1024: + return _PAGE_SZ64K_4U; + case 512 * 1024: + return _PAGE_SZ512K_4U; + case 4 * 1024 * 1024: + return _PAGE_SZ4MB_4U; + }; + } +} + +pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size) +{ + pte_t pte; + + pte_val(pte) = page | pgprot_val(pgprot_noncached(prot)); + pte_val(pte) |= (((unsigned long)space) << 32); + pte_val(pte) |= pte_sz_bits(page_size); + + return pte; +} + +static unsigned long kern_large_tte(unsigned long paddr) +{ + unsigned long val; + + val = (_PAGE_VALID | _PAGE_SZ4MB_4U | + _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | + _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U); + if (tlb_type == hypervisor) + val = (_PAGE_VALID | _PAGE_SZ4MB_4V | + _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | + _PAGE_EXEC_4V | _PAGE_W_4V); + + return val | paddr; +} + +/* + * Translate PROM's mapping we capture at boot time into physical address. + * The second parameter is only set from prom_callback() invocations. + */ +unsigned long prom_virt_to_phys(unsigned long promva, int *error) +{ + unsigned long mask; + int i; + + mask = _PAGE_PADDR_4U; + if (tlb_type == hypervisor) + mask = _PAGE_PADDR_4V; + + for (i = 0; i < prom_trans_ents; i++) { + struct linux_prom_translation *p = &prom_trans[i]; + + if (promva >= p->virt && + promva < (p->virt + p->size)) { + unsigned long base = p->data & mask; + + if (error) + *error = 0; + return base + (promva & (8192 - 1)); + } + } + if (error) + *error = 1; + return 0UL; +} + +/* XXX We should kill off this ugly thing at so me point. XXX */ +unsigned long sun4u_get_pte(unsigned long addr) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + unsigned long mask = _PAGE_PADDR_4U; + + if (tlb_type == hypervisor) + mask = _PAGE_PADDR_4V; + + if (addr >= PAGE_OFFSET) + return addr & mask; + + if ((addr >= LOW_OBP_ADDRESS) && (addr < HI_OBP_ADDRESS)) + return prom_virt_to_phys(addr, NULL); + + pgdp = pgd_offset_k(addr); + pudp = pud_offset(pgdp, addr); + pmdp = pmd_offset(pudp, addr); + ptep = pte_offset_kernel(pmdp, addr); + + return pte_val(*ptep) & mask; +} + +/* If not locked, zap it. */ +void __flush_tlb_all(void) +{ + unsigned long pstate; + int i; + + __asm__ __volatile__("flushw\n\t" + "rdpr %%pstate, %0\n\t" + "wrpr %0, %1, %%pstate" + : "=r" (pstate) + : "i" (PSTATE_IE)); + if (tlb_type == spitfire) { + for (i = 0; i < 64; i++) { + /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. + */ + __asm__ __volatile__("stxa %0, [%1] %2\n\t" + "flush %%g6" + : /* No outputs */ + : "r" (0), + "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); + + if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) { + __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" + "membar #Sync" + : /* no outputs */ + : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); + spitfire_put_dtlb_data(i, 0x0UL); + } + + /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. + */ + __asm__ __volatile__("stxa %0, [%1] %2\n\t" + "flush %%g6" + : /* No outputs */ + : "r" (0), + "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); + + if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) { + __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" + "membar #Sync" + : /* no outputs */ + : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); + spitfire_put_itlb_data(i, 0x0UL); + } + } + } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { + cheetah_flush_dtlb_all(); + cheetah_flush_itlb_all(); + } + __asm__ __volatile__("wrpr %0, 0, %%pstate" + : : "r" (pstate)); +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalram_pages++; + num_physpages++; +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} + +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c index 8b104be4662..a079cf42505 100644 --- a/arch/sparc64/mm/tlb.c +++ b/arch/sparc64/mm/tlb.c @@ -25,6 +25,8 @@ void flush_tlb_pending(void) struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); if (mp->tlb_nr) { + flush_tsb_user(mp); + if (CTX_VALID(mp->mm->context)) { #ifdef CONFIG_SMP smp_flush_tlb_pending(mp->mm, mp->tlb_nr, @@ -47,7 +49,8 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t if (pte_exec(orig)) vaddr |= 0x1UL; - if (pte_dirty(orig)) { + if (tlb_type != hypervisor && + pte_dirty(orig)) { unsigned long paddr, pfn = pte_pfn(orig); struct address_space *mapping; struct page *page; @@ -89,62 +92,3 @@ no_cache_flush: if (nr >= TLB_BATCH_NR) flush_tlb_pending(); } - -void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) -{ - struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); - unsigned long nr = mp->tlb_nr; - long s = start, e = end, vpte_base; - - if (mp->fullmm) - return; - - /* If start is greater than end, that is a real problem. */ - BUG_ON(start > end); - - /* However, straddling the VA space hole is quite normal. */ - s &= PMD_MASK; - e = (e + PMD_SIZE - 1) & PMD_MASK; - - vpte_base = (tlb_type == spitfire ? - VPTE_BASE_SPITFIRE : - VPTE_BASE_CHEETAH); - - if (unlikely(nr != 0 && mm != mp->mm)) { - flush_tlb_pending(); - nr = 0; - } - - if (nr == 0) - mp->mm = mm; - - start = vpte_base + (s >> (PAGE_SHIFT - 3)); - end = vpte_base + (e >> (PAGE_SHIFT - 3)); - - /* If the request straddles the VA space hole, we - * need to swap start and end. The reason this - * occurs is that "vpte_base" is the center of - * the linear page table mapping area. Thus, - * high addresses with the sign bit set map to - * addresses below vpte_base and non-sign bit - * addresses map to addresses above vpte_base. - */ - if (end < start) { - unsigned long tmp = start; - - start = end; - end = tmp; - } - - while (start < end) { - mp->vaddrs[nr] = start; - mp->tlb_nr = ++nr; - if (nr >= TLB_BATCH_NR) { - flush_tlb_pending(); - nr = 0; - } - start += PAGE_SIZE; - } - if (nr) - flush_tlb_pending(); -} diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c new file mode 100644 index 00000000000..beaa02810f0 --- /dev/null +++ b/arch/sparc64/mm/tsb.c @@ -0,0 +1,500 @@ +/* arch/sparc64/mm/tsb.c + * + * Copyright (C) 2006 David S. Miller <davem@davemloft.net> + */ + +#include <linux/kernel.h> +#include <asm/system.h> +#include <asm/page.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tsb.h> +#include <asm/oplib.h> + +extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; + +static inline unsigned long tsb_hash(unsigned long vaddr, unsigned long hash_shift, unsigned long nentries) +{ + vaddr >>= hash_shift; + return vaddr & (nentries - 1); +} + +static inline int tag_compare(unsigned long tag, unsigned long vaddr) +{ + return (tag == (vaddr >> 22)); +} + +/* TSB flushes need only occur on the processor initiating the address + * space modification, not on each cpu the address space has run on. + * Only the TLB flush needs that treatment. + */ + +void flush_tsb_kernel_range(unsigned long start, unsigned long end) +{ + unsigned long v; + + for (v = start; v < end; v += PAGE_SIZE) { + unsigned long hash = tsb_hash(v, PAGE_SHIFT, + KERNEL_TSB_NENTRIES); + struct tsb *ent = &swapper_tsb[hash]; + + if (tag_compare(ent->tag, v)) { + ent->tag = (1UL << TSB_TAG_INVALID_BIT); + membar_storeload_storestore(); + } + } +} + +static void __flush_tsb_one(struct mmu_gather *mp, unsigned long hash_shift, unsigned long tsb, unsigned long nentries) +{ + unsigned long i; + + for (i = 0; i < mp->tlb_nr; i++) { + unsigned long v = mp->vaddrs[i]; + unsigned long tag, ent, hash; + + v &= ~0x1UL; + + hash = tsb_hash(v, hash_shift, nentries); + ent = tsb + (hash * sizeof(struct tsb)); + tag = (v >> 22UL); + + tsb_flush(ent, tag); + } +} + +void flush_tsb_user(struct mmu_gather *mp) +{ + struct mm_struct *mm = mp->mm; + unsigned long nentries, base, flags; + + spin_lock_irqsave(&mm->context.lock, flags); + + base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb; + nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; + if (tlb_type == cheetah_plus || tlb_type == hypervisor) + base = __pa(base); + __flush_tsb_one(mp, PAGE_SHIFT, base, nentries); + +#ifdef CONFIG_HUGETLB_PAGE + if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { + base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; + nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; + if (tlb_type == cheetah_plus || tlb_type == hypervisor) + base = __pa(base); + __flush_tsb_one(mp, HPAGE_SHIFT, base, nentries); + } +#endif + spin_unlock_irqrestore(&mm->context.lock, flags); +} + +#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K +#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_64K +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_64K +#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_512K +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_512K +#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_4MB +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_4MB +#else +#error Broken base page size setting... +#endif + +#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) +#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_64K +#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_64K +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) +#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_512K +#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_512K +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) +#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB +#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB +#else +#error Broken huge page size setting... +#endif +#endif + +static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) +{ + unsigned long tsb_reg, base, tsb_paddr; + unsigned long page_sz, tte; + + mm->context.tsb_block[tsb_idx].tsb_nentries = + tsb_bytes / sizeof(struct tsb); + + base = TSBMAP_BASE; + tte = pgprot_val(PAGE_KERNEL_LOCKED); + tsb_paddr = __pa(mm->context.tsb_block[tsb_idx].tsb); + BUG_ON(tsb_paddr & (tsb_bytes - 1UL)); + + /* Use the smallest page size that can map the whole TSB + * in one TLB entry. + */ + switch (tsb_bytes) { + case 8192 << 0: + tsb_reg = 0x0UL; +#ifdef DCACHE_ALIASING_POSSIBLE + base += (tsb_paddr & 8192); +#endif + page_sz = 8192; + break; + + case 8192 << 1: + tsb_reg = 0x1UL; + page_sz = 64 * 1024; + break; + + case 8192 << 2: + tsb_reg = 0x2UL; + page_sz = 64 * 1024; + break; + + case 8192 << 3: + tsb_reg = 0x3UL; + page_sz = 64 * 1024; + break; + + case 8192 << 4: + tsb_reg = 0x4UL; + page_sz = 512 * 1024; + break; + + case 8192 << 5: + tsb_reg = 0x5UL; + page_sz = 512 * 1024; + break; + + case 8192 << 6: + tsb_reg = 0x6UL; + page_sz = 512 * 1024; + break; + + case 8192 << 7: + tsb_reg = 0x7UL; + page_sz = 4 * 1024 * 1024; + break; + + default: + BUG(); + }; + tte |= pte_sz_bits(page_sz); + + if (tlb_type == cheetah_plus || tlb_type == hypervisor) { + /* Physical mapping, no locked TLB entry for TSB. */ + tsb_reg |= tsb_paddr; + + mm->context.tsb_block[tsb_idx].tsb_reg_val = tsb_reg; + mm->context.tsb_block[tsb_idx].tsb_map_vaddr = 0; + mm->context.tsb_block[tsb_idx].tsb_map_pte = 0; + } else { + tsb_reg |= base; + tsb_reg |= (tsb_paddr & (page_sz - 1UL)); + tte |= (tsb_paddr & ~(page_sz - 1UL)); + + mm->context.tsb_block[tsb_idx].tsb_reg_val = tsb_reg; + mm->context.tsb_block[tsb_idx].tsb_map_vaddr = base; + mm->context.tsb_block[tsb_idx].tsb_map_pte = tte; + } + + /* Setup the Hypervisor TSB descriptor. */ + if (tlb_type == hypervisor) { + struct hv_tsb_descr *hp = &mm->context.tsb_descr[tsb_idx]; + + switch (tsb_idx) { + case MM_TSB_BASE: + hp->pgsz_idx = HV_PGSZ_IDX_BASE; + break; +#ifdef CONFIG_HUGETLB_PAGE + case MM_TSB_HUGE: + hp->pgsz_idx = HV_PGSZ_IDX_HUGE; + break; +#endif + default: + BUG(); + }; + hp->assoc = 1; + hp->num_ttes = tsb_bytes / 16; + hp->ctx_idx = 0; + switch (tsb_idx) { + case MM_TSB_BASE: + hp->pgsz_mask = HV_PGSZ_MASK_BASE; + break; +#ifdef CONFIG_HUGETLB_PAGE + case MM_TSB_HUGE: + hp->pgsz_mask = HV_PGSZ_MASK_HUGE; + break; +#endif + default: + BUG(); + }; + hp->tsb_base = tsb_paddr; + hp->resv = 0; + } +} + +static kmem_cache_t *tsb_caches[8] __read_mostly; + +static const char *tsb_cache_names[8] = { + "tsb_8KB", + "tsb_16KB", + "tsb_32KB", + "tsb_64KB", + "tsb_128KB", + "tsb_256KB", + "tsb_512KB", + "tsb_1MB", +}; + +void __init tsb_cache_init(void) +{ + unsigned long i; + + for (i = 0; i < 8; i++) { + unsigned long size = 8192 << i; + const char *name = tsb_cache_names[i]; + + tsb_caches[i] = kmem_cache_create(name, + size, size, + SLAB_HWCACHE_ALIGN | + SLAB_MUST_HWCACHE_ALIGN, + NULL, NULL); + if (!tsb_caches[i]) { + prom_printf("Could not create %s cache\n", name); + prom_halt(); + } + } +} + +/* When the RSS of an address space exceeds tsb_rss_limit for a TSB, + * do_sparc64_fault() invokes this routine to try and grow it. + * + * When we reach the maximum TSB size supported, we stick ~0UL into + * tsb_rss_limit for that TSB so the grow checks in do_sparc64_fault() + * will not trigger any longer. + * + * The TSB can be anywhere from 8K to 1MB in size, in increasing powers + * of two. The TSB must be aligned to it's size, so f.e. a 512K TSB + * must be 512K aligned. It also must be physically contiguous, so we + * cannot use vmalloc(). + * + * The idea here is to grow the TSB when the RSS of the process approaches + * the number of entries that the current TSB can hold at once. Currently, + * we trigger when the RSS hits 3/4 of the TSB capacity. + */ +void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) +{ + unsigned long max_tsb_size = 1 * 1024 * 1024; + unsigned long new_size, old_size, flags; + struct tsb *old_tsb, *new_tsb; + unsigned long new_cache_index, old_cache_index; + unsigned long new_rss_limit; + gfp_t gfp_flags; + + if (max_tsb_size > (PAGE_SIZE << MAX_ORDER)) + max_tsb_size = (PAGE_SIZE << MAX_ORDER); + + new_cache_index = 0; + for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) { + unsigned long n_entries = new_size / sizeof(struct tsb); + + n_entries = (n_entries * 3) / 4; + if (n_entries > rss) + break; + + new_cache_index++; + } + + if (new_size == max_tsb_size) + new_rss_limit = ~0UL; + else + new_rss_limit = ((new_size / sizeof(struct tsb)) * 3) / 4; + +retry_tsb_alloc: + gfp_flags = GFP_KERNEL; + if (new_size > (PAGE_SIZE * 2)) + gfp_flags = __GFP_NOWARN | __GFP_NORETRY; + + new_tsb = kmem_cache_alloc(tsb_caches[new_cache_index], gfp_flags); + if (unlikely(!new_tsb)) { + /* Not being able to fork due to a high-order TSB + * allocation failure is very bad behavior. Just back + * down to a 0-order allocation and force no TSB + * growing for this address space. + */ + if (mm->context.tsb_block[tsb_index].tsb == NULL && + new_cache_index > 0) { + new_cache_index = 0; + new_size = 8192; + new_rss_limit = ~0UL; + goto retry_tsb_alloc; + } + + /* If we failed on a TSB grow, we are under serious + * memory pressure so don't try to grow any more. + */ + if (mm->context.tsb_block[tsb_index].tsb != NULL) + mm->context.tsb_block[tsb_index].tsb_rss_limit = ~0UL; + return; + } + + /* Mark all tags as invalid. */ + tsb_init(new_tsb, new_size); + + /* Ok, we are about to commit the changes. If we are + * growing an existing TSB the locking is very tricky, + * so WATCH OUT! + * + * We have to hold mm->context.lock while committing to the + * new TSB, this synchronizes us with processors in + * flush_tsb_user() and switch_mm() for this address space. + * + * But even with that lock held, processors run asynchronously + * accessing the old TSB via TLB miss handling. This is OK + * because those actions are just propagating state from the + * Linux page tables into the TSB, page table mappings are not + * being changed. If a real fault occurs, the processor will + * synchronize with us when it hits flush_tsb_user(), this is + * also true for the case where vmscan is modifying the page + * tables. The only thing we need to be careful with is to + * skip any locked TSB entries during copy_tsb(). + * + * When we finish committing to the new TSB, we have to drop + * the lock and ask all other cpus running this address space + * to run tsb_context_switch() to see the new TSB table. + */ + spin_lock_irqsave(&mm->context.lock, flags); + + old_tsb = mm->context.tsb_block[tsb_index].tsb; + old_cache_index = + (mm->context.tsb_block[tsb_index].tsb_reg_val & 0x7UL); + old_size = (mm->context.tsb_block[tsb_index].tsb_nentries * + sizeof(struct tsb)); + + + /* Handle multiple threads trying to grow the TSB at the same time. + * One will get in here first, and bump the size and the RSS limit. + * The others will get in here next and hit this check. + */ + if (unlikely(old_tsb && + (rss < mm->context.tsb_block[tsb_index].tsb_rss_limit))) { + spin_unlock_irqrestore(&mm->context.lock, flags); + + kmem_cache_free(tsb_caches[new_cache_index], new_tsb); + return; + } + + mm->context.tsb_block[tsb_index].tsb_rss_limit = new_rss_limit; + + if (old_tsb) { + extern void copy_tsb(unsigned long old_tsb_base, + unsigned long old_tsb_size, + unsigned long new_tsb_base, + unsigned long new_tsb_size); + unsigned long old_tsb_base = (unsigned long) old_tsb; + unsigned long new_tsb_base = (unsigned long) new_tsb; + + if (tlb_type == cheetah_plus || tlb_type == hypervisor) { + old_tsb_base = __pa(old_tsb_base); + new_tsb_base = __pa(new_tsb_base); + } + copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size); + } + + mm->context.tsb_block[tsb_index].tsb = new_tsb; + setup_tsb_params(mm, tsb_index, new_size); + + spin_unlock_irqrestore(&mm->context.lock, flags); + + /* If old_tsb is NULL, we're being invoked for the first time + * from init_new_context(). + */ + if (old_tsb) { + /* Reload it on the local cpu. */ + tsb_context_switch(mm); + + /* Now force other processors to do the same. */ + smp_tsb_sync(mm); + + /* Now it is safe to free the old tsb. */ + kmem_cache_free(tsb_caches[old_cache_index], old_tsb); + } +} + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long huge_pte_count; +#endif + unsigned int i; + + spin_lock_init(&mm->context.lock); + + mm->context.sparc64_ctx_val = 0UL; + +#ifdef CONFIG_HUGETLB_PAGE + /* We reset it to zero because the fork() page copying + * will re-increment the counters as the parent PTEs are + * copied into the child address space. + */ + huge_pte_count = mm->context.huge_pte_count; + mm->context.huge_pte_count = 0; +#endif + + /* copy_mm() copies over the parent's mm_struct before calling + * us, so we need to zero out the TSB pointer or else tsb_grow() + * will be confused and think there is an older TSB to free up. + */ + for (i = 0; i < MM_NUM_TSBS; i++) + mm->context.tsb_block[i].tsb = NULL; + + /* If this is fork, inherit the parent's TSB size. We would + * grow it to that size on the first page fault anyways. + */ + tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); + +#ifdef CONFIG_HUGETLB_PAGE + if (unlikely(huge_pte_count)) + tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); +#endif + + if (unlikely(!mm->context.tsb_block[MM_TSB_BASE].tsb)) + return -ENOMEM; + + return 0; +} + +static void tsb_destroy_one(struct tsb_config *tp) +{ + unsigned long cache_index; + + if (!tp->tsb) + return; + cache_index = tp->tsb_reg_val & 0x7UL; + kmem_cache_free(tsb_caches[cache_index], tp->tsb); + tp->tsb = NULL; + tp->tsb_reg_val = 0UL; +} + +void destroy_context(struct mm_struct *mm) +{ + unsigned long flags, i; + + for (i = 0; i < MM_NUM_TSBS; i++) + tsb_destroy_one(&mm->context.tsb_block[i]); + + spin_lock_irqsave(&ctx_alloc_lock, flags); + + if (CTX_VALID(mm->context)) { + unsigned long nr = CTX_NRBITS(mm->context); + mmu_context_bmap[nr>>6] &= ~(1UL << (nr & 63)); + } + + spin_unlock_irqrestore(&ctx_alloc_lock, flags); +} diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S index e4c9151fa11..f8479fad404 100644 --- a/arch/sparc64/mm/ultra.S +++ b/arch/sparc64/mm/ultra.S @@ -15,6 +15,7 @@ #include <asm/head.h> #include <asm/thread_info.h> #include <asm/cacheflush.h> +#include <asm/hypervisor.h> /* Basically, most of the Spitfire vs. Cheetah madness * has to do with the fact that Cheetah does not support @@ -29,16 +30,18 @@ .text .align 32 .globl __flush_tlb_mm -__flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */ +__flush_tlb_mm: /* 18 insns */ + /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */ ldxa [%o1] ASI_DMMU, %g2 cmp %g2, %o0 bne,pn %icc, __spitfire_flush_tlb_mm_slow mov 0x50, %g3 stxa %g0, [%g3] ASI_DMMU_DEMAP stxa %g0, [%g3] ASI_IMMU_DEMAP + sethi %hi(KERNBASE), %g3 + flush %g3 retl - flush %g6 - nop + nop nop nop nop @@ -51,7 +54,7 @@ __flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */ .align 32 .globl __flush_tlb_pending -__flush_tlb_pending: +__flush_tlb_pending: /* 26 insns */ /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */ rdpr %pstate, %g7 sllx %o1, 3, %o1 @@ -72,7 +75,8 @@ __flush_tlb_pending: brnz,pt %o1, 1b nop stxa %g2, [%o4] ASI_DMMU - flush %g6 + sethi %hi(KERNBASE), %o4 + flush %o4 retl wrpr %g7, 0x0, %pstate nop @@ -82,7 +86,8 @@ __flush_tlb_pending: .align 32 .globl __flush_tlb_kernel_range -__flush_tlb_kernel_range: /* %o0=start, %o1=end */ +__flush_tlb_kernel_range: /* 16 insns */ + /* %o0=start, %o1=end */ cmp %o0, %o1 be,pn %xcc, 2f sethi %hi(PAGE_SIZE), %o4 @@ -94,8 +99,11 @@ __flush_tlb_kernel_range: /* %o0=start, %o1=end */ membar #Sync brnz,pt %o3, 1b sub %o3, %o4, %o3 -2: retl - flush %g6 +2: sethi %hi(KERNBASE), %o3 + flush %o3 + retl + nop + nop __spitfire_flush_tlb_mm_slow: rdpr %pstate, %g1 @@ -105,7 +113,8 @@ __spitfire_flush_tlb_mm_slow: stxa %g0, [%g3] ASI_IMMU_DEMAP flush %g6 stxa %g2, [%o1] ASI_DMMU - flush %g6 + sethi %hi(KERNBASE), %o1 + flush %o1 retl wrpr %g1, 0, %pstate @@ -181,7 +190,7 @@ __flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */ .previous /* Cheetah specific versions, patched at boot time. */ -__cheetah_flush_tlb_mm: /* 18 insns */ +__cheetah_flush_tlb_mm: /* 19 insns */ rdpr %pstate, %g7 andn %g7, PSTATE_IE, %g2 wrpr %g2, 0x0, %pstate @@ -196,12 +205,13 @@ __cheetah_flush_tlb_mm: /* 18 insns */ stxa %g0, [%g3] ASI_DMMU_DEMAP stxa %g0, [%g3] ASI_IMMU_DEMAP stxa %g2, [%o2] ASI_DMMU - flush %g6 + sethi %hi(KERNBASE), %o2 + flush %o2 wrpr %g0, 0, %tl retl wrpr %g7, 0x0, %pstate -__cheetah_flush_tlb_pending: /* 26 insns */ +__cheetah_flush_tlb_pending: /* 27 insns */ /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */ rdpr %pstate, %g7 sllx %o1, 3, %o1 @@ -225,7 +235,8 @@ __cheetah_flush_tlb_pending: /* 26 insns */ brnz,pt %o1, 1b nop stxa %g2, [%o4] ASI_DMMU - flush %g6 + sethi %hi(KERNBASE), %o4 + flush %o4 wrpr %g0, 0, %tl retl wrpr %g7, 0x0, %pstate @@ -245,7 +256,76 @@ __cheetah_flush_dcache_page: /* 11 insns */ nop #endif /* DCACHE_ALIASING_POSSIBLE */ -cheetah_patch_one: + /* Hypervisor specific versions, patched at boot time. */ +__hypervisor_tlb_tl0_error: + save %sp, -192, %sp + mov %i0, %o0 + call hypervisor_tlbop_error + mov %i1, %o1 + ret + restore + +__hypervisor_flush_tlb_mm: /* 10 insns */ + mov %o0, %o2 /* ARG2: mmu context */ + mov 0, %o0 /* ARG0: CPU lists unimplemented */ + mov 0, %o1 /* ARG1: CPU lists unimplemented */ + mov HV_MMU_ALL, %o3 /* ARG3: flags */ + mov HV_FAST_MMU_DEMAP_CTX, %o5 + ta HV_FAST_TRAP + brnz,pn %o0, __hypervisor_tlb_tl0_error + mov HV_FAST_MMU_DEMAP_CTX, %o1 + retl + nop + +__hypervisor_flush_tlb_pending: /* 16 insns */ + /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */ + sllx %o1, 3, %g1 + mov %o2, %g2 + mov %o0, %g3 +1: sub %g1, (1 << 3), %g1 + ldx [%g2 + %g1], %o0 /* ARG0: vaddr + IMMU-bit */ + mov %g3, %o1 /* ARG1: mmu context */ + mov HV_MMU_ALL, %o2 /* ARG2: flags */ + srlx %o0, PAGE_SHIFT, %o0 + sllx %o0, PAGE_SHIFT, %o0 + ta HV_MMU_UNMAP_ADDR_TRAP + brnz,pn %o0, __hypervisor_tlb_tl0_error + mov HV_MMU_UNMAP_ADDR_TRAP, %o1 + brnz,pt %g1, 1b + nop + retl + nop + +__hypervisor_flush_tlb_kernel_range: /* 16 insns */ + /* %o0=start, %o1=end */ + cmp %o0, %o1 + be,pn %xcc, 2f + sethi %hi(PAGE_SIZE), %g3 + mov %o0, %g1 + sub %o1, %g1, %g2 + sub %g2, %g3, %g2 +1: add %g1, %g2, %o0 /* ARG0: virtual address */ + mov 0, %o1 /* ARG1: mmu context */ + mov HV_MMU_ALL, %o2 /* ARG2: flags */ + ta HV_MMU_UNMAP_ADDR_TRAP + brnz,pn %o0, __hypervisor_tlb_tl0_error + mov HV_MMU_UNMAP_ADDR_TRAP, %o1 + brnz,pt %g2, 1b + sub %g2, %g3, %g2 +2: retl + nop + +#ifdef DCACHE_ALIASING_POSSIBLE + /* XXX Niagara and friends have an 8K cache, so no aliasing is + * XXX possible, but nothing explicit in the Hypervisor API + * XXX guarantees this. + */ +__hypervisor_flush_dcache_page: /* 2 insns */ + retl + nop +#endif + +tlb_patch_one: 1: lduw [%o1], %g1 stw %g1, [%o0] flush %o0 @@ -264,22 +344,22 @@ cheetah_patch_cachetlbops: or %o0, %lo(__flush_tlb_mm), %o0 sethi %hi(__cheetah_flush_tlb_mm), %o1 or %o1, %lo(__cheetah_flush_tlb_mm), %o1 - call cheetah_patch_one - mov 18, %o2 + call tlb_patch_one + mov 19, %o2 sethi %hi(__flush_tlb_pending), %o0 or %o0, %lo(__flush_tlb_pending), %o0 sethi %hi(__cheetah_flush_tlb_pending), %o1 or %o1, %lo(__cheetah_flush_tlb_pending), %o1 - call cheetah_patch_one - mov 26, %o2 + call tlb_patch_one + mov 27, %o2 #ifdef DCACHE_ALIASING_POSSIBLE sethi %hi(__flush_dcache_page), %o0 or %o0, %lo(__flush_dcache_page), %o0 sethi %hi(__cheetah_flush_dcache_page), %o1 or %o1, %lo(__cheetah_flush_dcache_page), %o1 - call cheetah_patch_one + call tlb_patch_one mov 11, %o2 #endif /* DCACHE_ALIASING_POSSIBLE */ @@ -295,16 +375,14 @@ cheetah_patch_cachetlbops: * %g1 address arg 1 (tlb page and range flushes) * %g7 address arg 2 (tlb range flush only) * - * %g6 ivector table, don't touch - * %g2 scratch 1 - * %g3 scratch 2 - * %g4 scratch 3 - * - * TODO: Make xcall TLB range flushes use the tricks above... -DaveM + * %g6 scratch 1 + * %g2 scratch 2 + * %g3 scratch 3 + * %g4 scratch 4 */ .align 32 .globl xcall_flush_tlb_mm -xcall_flush_tlb_mm: +xcall_flush_tlb_mm: /* 21 insns */ mov PRIMARY_CONTEXT, %g2 ldxa [%g2] ASI_DMMU, %g3 srlx %g3, CTX_PGSZ1_NUC_SHIFT, %g4 @@ -316,9 +394,19 @@ xcall_flush_tlb_mm: stxa %g0, [%g4] ASI_IMMU_DEMAP stxa %g3, [%g2] ASI_DMMU retry + nop + nop + nop + nop + nop + nop + nop + nop + nop + nop .globl xcall_flush_tlb_pending -xcall_flush_tlb_pending: +xcall_flush_tlb_pending: /* 21 insns */ /* %g5=context, %g1=nr, %g7=vaddrs[] */ sllx %g1, 3, %g1 mov PRIMARY_CONTEXT, %g4 @@ -341,9 +429,10 @@ xcall_flush_tlb_pending: nop stxa %g2, [%g4] ASI_DMMU retry + nop .globl xcall_flush_tlb_kernel_range -xcall_flush_tlb_kernel_range: +xcall_flush_tlb_kernel_range: /* 25 insns */ sethi %hi(PAGE_SIZE - 1), %g2 or %g2, %lo(PAGE_SIZE - 1), %g2 andn %g1, %g2, %g1 @@ -360,14 +449,30 @@ xcall_flush_tlb_kernel_range: retry nop nop + nop + nop + nop + nop + nop + nop + nop + nop + nop /* This runs in a very controlled environment, so we do * not need to worry about BH races etc. */ .globl xcall_sync_tick xcall_sync_tick: - rdpr %pstate, %g2 + +661: rdpr %pstate, %g2 wrpr %g2, PSTATE_IG | PSTATE_AG, %pstate + .section .sun4v_2insn_patch, "ax" + .word 661b + nop + nop + .previous + rdpr %pil, %g2 wrpr %g0, 15, %pil sethi %hi(109f), %g7 @@ -390,8 +495,15 @@ xcall_sync_tick: */ .globl xcall_report_regs xcall_report_regs: - rdpr %pstate, %g2 + +661: rdpr %pstate, %g2 wrpr %g2, PSTATE_IG | PSTATE_AG, %pstate + .section .sun4v_2insn_patch, "ax" + .word 661b + nop + nop + .previous + rdpr %pil, %g2 wrpr %g0, 15, %pil sethi %hi(109f), %g7 @@ -453,62 +565,96 @@ xcall_flush_dcache_page_spitfire: /* %g1 == physical page address nop nop - .data - -errata32_hwbug: - .xword 0 - - .text - - /* These two are not performance critical... */ - .globl xcall_flush_tlb_all_spitfire -xcall_flush_tlb_all_spitfire: - /* Spitfire Errata #32 workaround. */ - sethi %hi(errata32_hwbug), %g4 - stx %g0, [%g4 + %lo(errata32_hwbug)] - - clr %g2 - clr %g3 -1: ldxa [%g3] ASI_DTLB_DATA_ACCESS, %g4 - and %g4, _PAGE_L, %g5 - brnz,pn %g5, 2f - mov TLB_TAG_ACCESS, %g7 - - stxa %g0, [%g7] ASI_DMMU - membar #Sync - stxa %g0, [%g3] ASI_DTLB_DATA_ACCESS + /* %g5: error + * %g6: tlb op + */ +__hypervisor_tlb_xcall_error: + mov %g5, %g4 + mov %g6, %g5 + ba,pt %xcc, etrap + rd %pc, %g7 + mov %l4, %o0 + call hypervisor_tlbop_error_xcall + mov %l5, %o1 + ba,a,pt %xcc, rtrap_clr_l6 + + .globl __hypervisor_xcall_flush_tlb_mm +__hypervisor_xcall_flush_tlb_mm: /* 21 insns */ + /* %g5=ctx, g1,g2,g3,g4,g7=scratch, %g6=unusable */ + mov %o0, %g2 + mov %o1, %g3 + mov %o2, %g4 + mov %o3, %g1 + mov %o5, %g7 + clr %o0 /* ARG0: CPU lists unimplemented */ + clr %o1 /* ARG1: CPU lists unimplemented */ + mov %g5, %o2 /* ARG2: mmu context */ + mov HV_MMU_ALL, %o3 /* ARG3: flags */ + mov HV_FAST_MMU_DEMAP_CTX, %o5 + ta HV_FAST_TRAP + mov HV_FAST_MMU_DEMAP_CTX, %g6 + brnz,pn %o0, __hypervisor_tlb_xcall_error + mov %o0, %g5 + mov %g2, %o0 + mov %g3, %o1 + mov %g4, %o2 + mov %g1, %o3 + mov %g7, %o5 membar #Sync + retry - /* Spitfire Errata #32 workaround. */ - sethi %hi(errata32_hwbug), %g4 - stx %g0, [%g4 + %lo(errata32_hwbug)] - -2: ldxa [%g3] ASI_ITLB_DATA_ACCESS, %g4 - and %g4, _PAGE_L, %g5 - brnz,pn %g5, 2f - mov TLB_TAG_ACCESS, %g7 - - stxa %g0, [%g7] ASI_IMMU - membar #Sync - stxa %g0, [%g3] ASI_ITLB_DATA_ACCESS + .globl __hypervisor_xcall_flush_tlb_pending +__hypervisor_xcall_flush_tlb_pending: /* 21 insns */ + /* %g5=ctx, %g1=nr, %g7=vaddrs[], %g2,%g3,%g4,g6=scratch */ + sllx %g1, 3, %g1 + mov %o0, %g2 + mov %o1, %g3 + mov %o2, %g4 +1: sub %g1, (1 << 3), %g1 + ldx [%g7 + %g1], %o0 /* ARG0: virtual address */ + mov %g5, %o1 /* ARG1: mmu context */ + mov HV_MMU_ALL, %o2 /* ARG2: flags */ + srlx %o0, PAGE_SHIFT, %o0 + sllx %o0, PAGE_SHIFT, %o0 + ta HV_MMU_UNMAP_ADDR_TRAP + mov HV_MMU_UNMAP_ADDR_TRAP, %g6 + brnz,a,pn %o0, __hypervisor_tlb_xcall_error + mov %o0, %g5 + brnz,pt %g1, 1b + nop + mov %g2, %o0 + mov %g3, %o1 + mov %g4, %o2 membar #Sync - - /* Spitfire Errata #32 workaround. */ - sethi %hi(errata32_hwbug), %g4 - stx %g0, [%g4 + %lo(errata32_hwbug)] - -2: add %g2, 1, %g2 - cmp %g2, SPITFIRE_HIGHEST_LOCKED_TLBENT - ble,pt %icc, 1b - sll %g2, 3, %g3 - flush %g6 retry - .globl xcall_flush_tlb_all_cheetah -xcall_flush_tlb_all_cheetah: - mov 0x80, %g2 - stxa %g0, [%g2] ASI_DMMU_DEMAP - stxa %g0, [%g2] ASI_IMMU_DEMAP + .globl __hypervisor_xcall_flush_tlb_kernel_range +__hypervisor_xcall_flush_tlb_kernel_range: /* 25 insns */ + /* %g1=start, %g7=end, g2,g3,g4,g5,g6=scratch */ + sethi %hi(PAGE_SIZE - 1), %g2 + or %g2, %lo(PAGE_SIZE - 1), %g2 + andn %g1, %g2, %g1 + andn %g7, %g2, %g7 + sub %g7, %g1, %g3 + add %g2, 1, %g2 + sub %g3, %g2, %g3 + mov %o0, %g2 + mov %o1, %g4 + mov %o2, %g7 +1: add %g1, %g3, %o0 /* ARG0: virtual address */ + mov 0, %o1 /* ARG1: mmu context */ + mov HV_MMU_ALL, %o2 /* ARG2: flags */ + ta HV_MMU_UNMAP_ADDR_TRAP + mov HV_MMU_UNMAP_ADDR_TRAP, %g6 + brnz,pn %o0, __hypervisor_tlb_xcall_error + mov %o0, %g5 + sethi %hi(PAGE_SIZE), %o2 + brnz,pt %g3, 1b + sub %g3, %o2, %g3 + mov %g2, %o0 + mov %g4, %o1 + mov %g7, %o2 + membar #Sync retry /* These just get rescheduled to PIL vectors. */ @@ -527,4 +673,70 @@ xcall_capture: wr %g0, (1 << PIL_SMP_CAPTURE), %set_softint retry + .globl xcall_new_mmu_context_version +xcall_new_mmu_context_version: + wr %g0, (1 << PIL_SMP_CTX_NEW_VERSION), %set_softint + retry + #endif /* CONFIG_SMP */ + + + .globl hypervisor_patch_cachetlbops +hypervisor_patch_cachetlbops: + save %sp, -128, %sp + + sethi %hi(__flush_tlb_mm), %o0 + or %o0, %lo(__flush_tlb_mm), %o0 + sethi %hi(__hypervisor_flush_tlb_mm), %o1 + or %o1, %lo(__hypervisor_flush_tlb_mm), %o1 + call tlb_patch_one + mov 10, %o2 + + sethi %hi(__flush_tlb_pending), %o0 + or %o0, %lo(__flush_tlb_pending), %o0 + sethi %hi(__hypervisor_flush_tlb_pending), %o1 + or %o1, %lo(__hypervisor_flush_tlb_pending), %o1 + call tlb_patch_one + mov 16, %o2 + + sethi %hi(__flush_tlb_kernel_range), %o0 + or %o0, %lo(__flush_tlb_kernel_range), %o0 + sethi %hi(__hypervisor_flush_tlb_kernel_range), %o1 + or %o1, %lo(__hypervisor_flush_tlb_kernel_range), %o1 + call tlb_patch_one + mov 16, %o2 + +#ifdef DCACHE_ALIASING_POSSIBLE + sethi %hi(__flush_dcache_page), %o0 + or %o0, %lo(__flush_dcache_page), %o0 + sethi %hi(__hypervisor_flush_dcache_page), %o1 + or %o1, %lo(__hypervisor_flush_dcache_page), %o1 + call tlb_patch_one + mov 2, %o2 +#endif /* DCACHE_ALIASING_POSSIBLE */ + +#ifdef CONFIG_SMP + sethi %hi(xcall_flush_tlb_mm), %o0 + or %o0, %lo(xcall_flush_tlb_mm), %o0 + sethi %hi(__hypervisor_xcall_flush_tlb_mm), %o1 + or %o1, %lo(__hypervisor_xcall_flush_tlb_mm), %o1 + call tlb_patch_one + mov 21, %o2 + + sethi %hi(xcall_flush_tlb_pending), %o0 + or %o0, %lo(xcall_flush_tlb_pending), %o0 + sethi %hi(__hypervisor_xcall_flush_tlb_pending), %o1 + or %o1, %lo(__hypervisor_xcall_flush_tlb_pending), %o1 + call tlb_patch_one + mov 21, %o2 + + sethi %hi(xcall_flush_tlb_kernel_range), %o0 + or %o0, %lo(xcall_flush_tlb_kernel_range), %o0 + sethi %hi(__hypervisor_xcall_flush_tlb_kernel_range), %o1 + or %o1, %lo(__hypervisor_xcall_flush_tlb_kernel_range), %o1 + call tlb_patch_one + mov 25, %o2 +#endif /* CONFIG_SMP */ + + ret + restore |