Diffstat (limited to 'arch')
36 files changed, 1713 insertions, 770 deletions
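A note on the SIGTRAP si_code changes threaded through this diff: the first two hunks drop ia64's and powerpc's private TRAP_BRANCH/TRAP_HWBKPT definitions (the generic si_codes now cover them), and the x86 hunks further down make send_sigtrap() and do_debug() pass a computed si_code instead of hardcoding TRAP_BRKPT. The get_si_code() helper they call is introduced elsewhere in this series; the sketch below only illustrates the selection its callers imply, assuming the standard DR6 bit macros — it is not the verbatim kernel helper.

/* Hypothetical sketch of the si_code selection the do_debug() hunks
 * below rely on; get_si_code() itself is not part of this diff. */
static int get_si_code_sketch(unsigned long condition)
{
	if (condition & DR_STEP)	/* single-step (TF) trap */
		return TRAP_TRACE;
	if (condition & (DR_TRAP0 | DR_TRAP1 | DR_TRAP2 | DR_TRAP3))
		return TRAP_HWBKPT;	/* hardware break-/watchpoint */
	return TRAP_BRKPT;		/* plain breakpoint */
}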
diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h index 9294e4b0c8b..118d4297900 100644 --- a/arch/ia64/include/asm/siginfo.h +++ b/arch/ia64/include/asm/siginfo.h @@ -113,11 +113,6 @@ typedef struct siginfo { #undef NSIGSEGV #define NSIGSEGV 3 -/* - * SIGTRAP si_codes - */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */ #undef NSIGTRAP #define NSIGTRAP 4 diff --git a/arch/powerpc/include/asm/siginfo.h b/arch/powerpc/include/asm/siginfo.h index 12f1bce037b..49495b0534e 100644 --- a/arch/powerpc/include/asm/siginfo.h +++ b/arch/powerpc/include/asm/siginfo.h @@ -15,11 +15,6 @@ #include <asm-generic/siginfo.h> -/* - * SIGTRAP si_codes - */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */ #undef NSIGTRAP #define NSIGTRAP 4 diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 8d64c1bc847..4bc02b23674 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -351,31 +351,28 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, savesegment(es, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->di, &sc->di); - err |= __put_user((u32)regs->si, &sc->si); - err |= __put_user((u32)regs->bp, &sc->bp); - err |= __put_user((u32)regs->sp, &sc->sp); - err |= __put_user((u32)regs->bx, &sc->bx); - err |= __put_user((u32)regs->dx, &sc->dx); - err |= __put_user((u32)regs->cx, &sc->cx); - err |= __put_user((u32)regs->ax, &sc->ax); - err |= __put_user((u32)regs->cs, &sc->cs); - err |= __put_user((u32)regs->ss, &sc->ss); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->ip, &sc->ip); - err |= __put_user((u32)regs->flags, &sc->flags); - err |= __put_user((u32)regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) err = -EFAULT; - else { - clear_used_math(); - stts(); + else err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), &sc->fpstate); - } /* non-iBCS2 extensions.. 
*/ err |= __put_user(mask, &sc->oldmask); @@ -444,21 +441,18 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - err |= __put_user(sig, &frame->sig); - if (err) - goto give_sigsegv; + if (__put_user(sig, &frame->sig)) + return -EFAULT; - err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); - if (err) - goto give_sigsegv; + if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; if (_COMPAT_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) - goto give_sigsegv; + if (__copy_to_user(frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; } if (ka->sa.sa_flags & SA_RESTORER) { @@ -479,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -502,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -533,14 +523,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. 
*/ if (cpu_has_xsave) @@ -556,7 +546,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -571,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -599,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c9be69fedb7..7b655b5bb9a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg -CFLAGS_REMOVE_paravirt.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg endif # @@ -90,7 +90,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7581b62df18..fb789dd9e69 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1121,16 +1121,5 @@ void __cpuinit cpu_init(void) xsave_init(); } -#ifdef CONFIG_HOTPLUG_CPU -void __cpuinit cpu_uninit(void) -{ - int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); - - /* lazy TLB state */ - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} -#endif #endif diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 0ed5f939b90..eee32b43fee 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -52,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, (mincount - oldsize) * LDT_ENTRY_SIZE); + paravirt_alloc_ldt(newldt, mincount); + #ifdef CONFIG_X86_64 /* CHECKME: Do we really need this ? 
*/ wmb(); @@ -74,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { + paravirt_free_ldt(oldldt, oldsize); if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else @@ -85,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + int i; if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + + for(i = 0; i < old->size; i++) + write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } @@ -125,6 +131,7 @@ void destroy_context(struct mm_struct *mm) if (mm == current->active_mm) clear_LDT(); #endif + paravirt_free_ldt(mm->context.ldt, mm->context.size); if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 00000000000..0e9f1982b1d --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,37 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include <linux/spinlock.h> +#include <linux/module.h> + +#include <asm/paravirt.h> + +static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __raw_spin_lock(lock); +} + +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_lock_flags = default_spin_lock_flags, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +#endif +}; +EXPORT_SYMBOL(pv_lock_ops); + +void __init paravirt_use_bytelocks(void) +{ +#ifdef CONFIG_SMP + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6b0bb73998d..e4c8fb60887 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } -void __init paravirt_use_bytelocks(void) -{ -#ifdef CONFIG_SMP - pv_lock_ops.spin_is_locked = __byte_spin_is_locked; - pv_lock_ops.spin_is_contended = __byte_spin_is_contended; - pv_lock_ops.spin_lock = __byte_spin_lock; - pv_lock_ops.spin_trylock = __byte_spin_trylock; - pv_lock_ops.spin_unlock = __byte_spin_unlock; -#endif -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -349,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = { .write_ldt_entry = native_write_ldt_entry, .write_gdt_entry = native_write_gdt_entry, .write_idt_entry = native_write_idt_entry, + + .alloc_ldt = paravirt_nop, + .free_ldt = paravirt_nop, + .load_sp0 = native_load_sp0, #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -460,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; -struct pv_lock_ops pv_lock_ops = { -#ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, -#endif -}; -EXPORT_SYMBOL(pv_lock_ops); - EXPORT_SYMBOL_GPL(pv_time_ops); 
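Aside: the new arch/x86/kernel/paravirt-spinlocks.c above exists so the spinlock pv-ops can be built without -pg (see the Makefile hunk switching CFLAGS_REMOVE_paravirt.o to CFLAGS_REMOVE_paravirt-spinlocks.o); ftrace must not instrument code the tracer itself may take locks in. The mechanism is plain function-pointer dispatch. A minimal, self-contained sketch of the pattern, with hypothetical reduced names rather than the kernel's real types:

/* Reduced stand-in for struct pv_lock_ops; illustration only. */
struct raw_spinlock_sketch { volatile unsigned int slock; };

struct pv_lock_ops_sketch {
	void (*spin_lock)(struct raw_spinlock_sketch *lock);
	void (*spin_unlock)(struct raw_spinlock_sketch *lock);
};

static void ticket_spin_lock(struct raw_spinlock_sketch *lock)
{
	/* native ticket-lock body elided */
}

static void ticket_spin_unlock(struct raw_spinlock_sketch *lock)
{
	/* elided */
}

/* Defaults point at the native implementation, as in the new file above. */
static struct pv_lock_ops_sketch pv_lock_ops_sketch = {
	.spin_lock	= ticket_spin_lock,
	.spin_unlock	= ticket_spin_unlock,
};

/* All lock traffic funnels through the table, so a boot-time hook such as
 * paravirt_use_bytelocks() above can repoint the operations exactly once. */
static inline void spin_lock_sketch(struct raw_spinlock_sketch *lock)
{
	pv_lock_ops_sketch.spin_lock(lock);
}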
EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 205188db962..922c14058f9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -76,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifdef CONFIG_HOTPLUG_CPU -#include <asm/nmi.h> - -static void cpu_exit_clear(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - cpu_uninit(); - irq_ctx_exit(cpu); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); - c1e_remove_cpu(cpu); -} - -/* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) -{ - /* This must be done before dead CPU ack */ - cpu_exit_clear(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* - * With physical CPU hotplug, we should halt the cpu - */ - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2a8ccb9238b..ca80394ef5b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -86,30 +86,12 @@ void exit_idle(void) __exit_idle(); } -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include <linux/nmi.h> -/* We halt the CPU with physical CPU hotplug */ -static inline void play_dead(void) -{ - idle_task_exit(); - c1e_remove_cpu(raw_smp_processor_id()); - - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be @@ -754,12 +736,12 @@ unsigned long get_wchan(struct task_struct *p) if (!p || p == current || p->state == TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || - fp > (unsigned long)stack+THREAD_SIZE) + fp >= (unsigned long)stack+THREAD_SIZE) return 0; ip = *(u64 *)(fp+8); if (!in_sched_functions(ip)) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e375b658efc..42ec4421e10 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1452,7 +1452,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) { struct siginfo info; @@ -1461,7 +1462,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) memset(&info, 0, sizeof(info)); info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; + info.si_code = si_code; /* User-mode ip? */ info.si_addr = user_mode_vm(regs) ? 
(void __user *) regs->ip : NULL; @@ -1548,5 +1549,5 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) */ if (test_thread_flag(TIF_SINGLESTEP) && tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) - send_sigtrap(current, regs, 0); + send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b21070ea33a..d6dd057d0f2 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -27,6 +27,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/vdso.h> +#include <asm/syscall.h> #include <asm/syscalls.h> #include "sigframe.h" @@ -112,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx) return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} /* * Do a signal return; undo the signal stack. @@ -120,28 +142,13 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); } - GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); @@ -151,21 +158,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); - { - unsigned int tmpflags; - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | - (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } - - { - void __user *buf; + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; @@ -214,9 +212,8 @@ badframe: return 0; } -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +static long do_rt_sigreturn(struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -242,10 +239,17 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. 
*/ @@ -337,39 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } static int -setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) { struct sigframe __user *frame; void __user *restorer; int err = 0; - int usig; void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; + if (__put_user(sig, &frame->sig)) + return -EFAULT; - err = __put_user(usig, &frame->sig); - if (err) - goto give_sigsegv; - - err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); - if (err) - goto give_sigsegv; + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; if (_NSIG_WORDS > 1) { - err = __copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) - goto give_sigsegv; + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; } if (current->mm->context.vdso) @@ -394,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -409,38 +403,27 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - int usig; void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; + return -EFAULT; - err |= __put_user(usig, &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. */ if (cpu_has_xsave) @@ -456,7 +439,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up to return from userspace. 
*/ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -476,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)usig; + regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -491,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* * OK, we're invoking a handler: */ +static int signr_convert(int sig) +{ + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; + return sig; +} + +#define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame + +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} + static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) @@ -507,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -536,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = setup_frame(sig, ka, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return ret; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif + /* * Clear the direction flag as per the ABI for function entry. */ @@ -571,6 +592,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return 0; } +#define NR_restart_syscall __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? 
*/ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: @@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } @@ -657,6 +679,12 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); @@ -666,5 +694,23 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } +#ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 823a55bf8c3..a5c9627f4db 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -52,6 +52,16 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + /* * Do a signal return; undo the signal stack. */ @@ -59,13 +69,13 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) (err |= __get_user(regs->x, &sc->x)) - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(r8); @@ -80,34 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. 
*/ - { - unsigned cs; - err |= __get_user(cs, &sc->cs); - regs->cs = cs | 3; /* Force into user mode */ - } + COPY_SEG_STRICT(cs); - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - { - struct _fpstate __user *buf; - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; } -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +static long do_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; - sigset_t set; unsigned long ax; + sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -130,10 +130,15 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return ax; badframe: - signal_fault(regs, frame, "sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ @@ -195,8 +200,8 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(sp - size, 64); } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *fp = NULL; @@ -209,17 +214,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; if (save_i387_xstate(fp) < 0) - err |= -1; + return -EFAULT; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; + if (copy_siginfo_to_user(&frame->info, info)) + return -EFAULT; } /* Create the ucontext. 
*/ @@ -247,11 +251,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ - goto give_sigsegv; + return -EFAULT; } if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->di = sig; @@ -271,15 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* * OK, we're invoking a handler */ +static int signr_convert(int sig) +{ + return sig; +} + +#ifdef CONFIG_IA32_EMULATION +#define is_ia32 test_thread_flag(TIF_IA32) +#else +#define is_ia32 0 +#endif + +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, @@ -317,51 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); + if (ret) + return ret; - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; - spin_lock_irq(&current->sighand->siglock); - sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&current->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. 
+ */ + regs->flags &= ~X86_EFLAGS_TF; - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + spin_lock_irq(&current->sighand->siglock); + sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&current->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); - return ret; + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + + return 0; } +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -391,7 +422,8 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Re-enable any watchpoints before delivering the + /* + * Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. @@ -399,7 +431,7 @@ static void do_signal(struct pt_regs *regs) if (current->thread.debugreg7) set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ + /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* * A signal was successfully delivered; the saved @@ -422,10 +454,9 @@ static void do_signal(struct pt_regs *regs) regs->ax = regs->orig_ax; regs->ip -= 2; break; + case -ERESTART_RESTARTBLOCK: - regs->ax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } @@ -441,14 +472,18 @@ static void do_signal(struct pt_regs *regs) } } -void do_notify_resume(struct pt_regs *regs, void *unused, - __u32 thread_info_flags) +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_MCE +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); -#endif /* CONFIG_X86_MCE */ +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) @@ -458,17 +493,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused, clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } + +#ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; + if (show_unhandled_signals && printk_ratelimit()) { - printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm, me->pid, where, frame, regs->ip, - regs->sp, regs->orig_ax); + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk("\n"); + printk(KERN_CONT "\n"); } force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4c640..18f9b19f5f8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = 
native_smp_prepare_cpus, - .cpu_up = native_cpu_up, .smp_cpus_done = native_smp_cpus_done, .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9056f7e272c..76b6f50978f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -52,6 +52,7 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> +#include <asm/idle.h> #include <asm/smp.h> #include <asm/trampoline.h> #include <asm/cpu.h> @@ -1344,25 +1345,9 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -int __cpu_disable(void) +void cpu_disable_common(void) { int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); - clear_local_APIC(); - /* * HACK: * Allow any queued timer interrupts to get serviced @@ -1380,10 +1365,32 @@ int __cpu_disable(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(cpu_online_map); +} + +int native_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); + + cpu_disable_common(); return 0; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ unsigned int i; @@ -1400,15 +1407,45 @@ void __cpu_die(unsigned int cpu) } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } + +void play_dead_common(void) +{ + idle_task_exit(); + reset_lazy_tlbstate(); + irq_ctx_exit(raw_smp_processor_id()); + c1e_remove_cpu(raw_smp_processor_id()); + + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +void native_play_dead(void) +{ + play_dead_common(); + wbinvd_halt(); +} + #else /* ... 
!CONFIG_HOTPLUG_CPU */ -int __cpu_disable(void) +int native_cpu_disable(void) { return -ENOSYS; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We said "no" in __cpu_disable */ BUG(); } + +void native_play_dead(void) +{ + BUG(); +} + #endif diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ecedc9b..e00534b3353 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -241,3 +241,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } +void reset_lazy_tlbstate(void) +{ + int cpu = raw_smp_processor_id(); + + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} + diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index da5a5964fcc..0429c5de5ea 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -891,6 +891,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned int condition; + int si_code; trace_hardirqs_fixup(); @@ -935,8 +936,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_TF_reenable; } + si_code = get_si_code((unsigned long)condition); /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); + send_sigtrap(tsk, regs, error_code, si_code); /* * Disable additional traps. They'll be re-enabled when diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 2887a789e38..9c0ac0cab01 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -940,7 +940,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs, tsk->thread.error_code = error_code; info.si_signo = SIGTRAP; info.si_errno = 0; - info.si_code = TRAP_BRKPT; + info.si_code = get_si_code(condition); info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 201e81a91a9..46e05447405 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -172,8 +172,8 @@ SECTIONS .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { *(.x86_cpu_dev.init) } - SECURITY_INIT __x86_cpu_dev_end = .; + SECURITY_INIT . 
= ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8f92cac4e6d..a742d753d5b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -914,15 +914,15 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { -#ifdef CONFIG_X86_32 - unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; +#ifdef CONFIG_X86_32 if (SHARED_KERNEL_PMD) return; - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { unsigned long flags; struct page *page; @@ -935,10 +935,8 @@ void vmalloc_sync_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6ab3196d12b..10b52309aef 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -614,7 +614,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) */ offset = phys_addr & ~PAGE_MASK; phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 3815e425f47..87b9ab16642 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on PM - default y
\ No newline at end of file + depends on XEN && PM + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 59c1e539aed..313947940a1 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,12 @@ -obj-y := enlighten.o setup.o multicalls.o mmu.o \ +ifdef CONFIG_FTRACE +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg +endif + +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
\ No newline at end of file diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 00000000000..b53225d2cac --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,123 @@ +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + +struct array_data +{ + void *array; + unsigned elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, unsigned array_size) +{ + size_t ret = 0; + unsigned i; + + for(i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, file->private_data, size); +} + +static int xen_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release= xen_array_release, + .read = u32_array_read, +}; + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 00000000000..e2813208483 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,10 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a27d562a974..0013a729b41 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,7 +30,6 @@ #include <xen/interface/xen.h> #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> -#include <xen/interface/sched.h> #include <xen/features.h> 
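Aside on the new xen/debugfs.c above: format_array() is called twice — first with a NULL buffer so snprintf() only measures, then again to fill the buffer sized by the first pass — and each iteration deliberately overwrites snprintf()'s terminating NUL with a separator. A standalone userspace rendering of the same two-pass idiom (illustrative names, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Pass 1: buf == NULL, only count bytes. Pass 2: fill the sized buffer. */
static size_t format_u32s(char *buf, size_t bufsize,
			  const unsigned int *array, unsigned int n)
{
	size_t ret = 0;
	unsigned int i;

	for (i = 0; i < n; i++) {
		int len = snprintf(buf, bufsize, "%u", array[i]);
		len++;			/* room for the ' ' or '\n' separator */
		ret += len;
		if (buf) {
			buf += len;
			bufsize -= len;
			buf[-1] = (i == n - 1) ? '\n' : ' ';	/* clobber the NUL */
		}
	}
	ret++;				/* trailing NUL */
	if (buf)
		*buf = '\0';
	return ret;
}

int main(void)
{
	unsigned int hist[] = { 3, 1, 4, 1, 5 };
	size_t need = format_u32s(NULL, 0, hist, 5);	/* measuring pass */
	char *s = malloc(need);
	if (!s)
		return 1;
	format_u32s(s, need, hist, 5);			/* filling pass */
	fputs(s, stdout);	/* prints "3 1 4 1 5" and a newline */
	free(s);
	return 0;
}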
#include <xen/page.h> #include <xen/hvc-console.h> @@ -58,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -111,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -227,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) +static void xen_leave_lazy(void) { - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; + paravirt_leave_lazy(paravirt_get_lazy_mode()); + xen_mc_flush(); } -static void xen_restore_fl(unsigned long flags) +static unsigned long xen_store_tr(void) { - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } + return 0; } -static void xen_irq_disable(void) +/* + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. + */ +static void set_aliased_prot(void *v, pgprot_t prot) { - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; + pte = pfn_pte(pfn, prot); - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. 
*/ + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); } -static void xen_halt(void) +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; -static void xen_leave_lazy(void) -{ - paravirt_leave_lazy(paravirt_get_lazy_mode()); - xen_mc_flush(); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } -static unsigned long xen_store_tr(void) +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - return 0; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -426,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx) static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -560,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); @@ -835,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = -EFAULT; break; #endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. 
*/ + break; + default: ret = native_write_msr_safe(msr, low, high); } @@ -878,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - if (level == PT_PTE) + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of @@ -947,7 +933,7 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -994,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -1012,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -1078,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ -#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -1088,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } -#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1109,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { -#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); -#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1252,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gs_index = xen_load_gs_index, #endif + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, @@ -1273,36 +1260,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; -static void __init __xen_init_IRQ(void) -{ -#ifdef CONFIG_X86_64 - int i; - - /* Create identity vector->irq map */ - for(i = 0; i < NR_VECTORS; i++) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[i] = i; - } -#endif /* CONFIG_X86_64 */ - - xen_init_IRQ(); -} - -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .setup_boot_clock = paravirt_nop, @@ -1443,7 +1400,7 @@ static void __init xen_reserve_top(void) if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); #endif /* CONFIG_X86_32 */ } @@ -1477,48 +1434,11 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } -#ifdef CONFIG_X86_64 -static void walk(pgd_t *pgd, 
unsigned long addr) -{ - unsigned l4idx = pgd_index(addr); - unsigned l3idx = pud_index(addr); - unsigned l2idx = pmd_index(addr); - unsigned l1idx = pte_index(addr); - pgd_t l4; - pud_t l3; - pmd_t l2; - pte_t l1; - - xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", - pgd, addr, l4idx, l3idx, l2idx, l1idx); - - l4 = pgd[l4idx]; - xen_raw_printk(" l4: %016lx\n", l4.pgd); - xen_raw_printk(" %016lx\n", pgd_val(l4)); - - l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; - xen_raw_printk(" l3: %016lx\n", l3.pud); - xen_raw_printk(" %016lx\n", pud_val(l3)); - - l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; - xen_raw_printk(" l2: %016lx\n", l2.pmd); - xen_raw_printk(" %016lx\n", pmd_val(l2)); - - l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; - xen_raw_printk(" l1: %016lx\n", l1.pte); - xen_raw_printk(" %016lx\n", pte_val(l1)); -} -#endif - static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", - addr, pfn, get_phys_to_machine(pfn), - pgprot_val(prot), pte.pte); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1694,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1703,10 +1625,11 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + #ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. @@ -1737,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) + if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a @@ -1772,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); @@ -1780,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); -#if 0 - xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", - &boot_params, __pa_symbol(&boot_params), - __va(__pa_symbol(&boot_params))); - - walk(pgd, &boot_params); - walk(pgd, __va(__pa(&boot_params))); -#endif - /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 00000000000..28b85ab8422 --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,143 @@ +#include <linux/hardirq.h> + +#include <xen/interface/xen.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. 
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 00000000000..28b85ab8422
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
+#include <linux/hardirq.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void xen_force_evtchn_callback(void)
+{
+	(void)HYPERVISOR_xen_version(0, NULL);
+}
+
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+	int i;
+
+	/* Create identity vector->irq map */
+	for(i = 0; i < NR_VECTORS; i++) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			per_cpu(vector_irq, cpu)[i] = i;
+	}
+#endif /* CONFIG_X86_64 */
+
+	xen_init_IRQ();
+}
+
+static unsigned long xen_save_fl(void)
+{
+	struct vcpu_info *vcpu;
+	unsigned long flags;
+
+	vcpu = x86_read_percpu(xen_vcpu);
+
+	/* flag has opposite sense of mask */
+	flags = !vcpu->evtchn_upcall_mask;
+
+	/* convert to IF type flag
+	   -0 -> 0x00000000
+	   -1 -> 0xffffffff
+	*/
+	return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+	struct vcpu_info *vcpu;
+
+	/* convert from IF type flag */
+	flags = !(flags & X86_EFLAGS_IF);
+
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = flags;
+	preempt_enable_no_resched();
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	if (flags == 0) {
+		preempt_check_resched();
+		barrier(); /* unmask then check (avoid races) */
+		if (unlikely(vcpu->evtchn_upcall_pending))
+			xen_force_evtchn_callback();
+	}
+}
+
+static void xen_irq_disable(void)
+{
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+	preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+	struct vcpu_info *vcpu;
+
+	/* We don't need to worry about being preempted here, since
+	   either a) interrupts are disabled, so no preemption, or b)
+	   the caller is confused and is trying to re-enable interrupts
+	   on an indeterminate processor. */
+
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = 0;
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	barrier(); /* unmask then check (avoid races) */
+	if (unlikely(vcpu->evtchn_upcall_pending))
+		xen_force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+	/* Blocking includes an implicit local_irq_enable(). */
+	if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
+		BUG();
+}
+
+static void xen_halt(void)
+{
+	if (irqs_disabled())
+		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+	else
+		xen_safe_halt();
+}
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+	.init_IRQ = __xen_init_IRQ,
+	.save_fl = xen_save_fl,
+	.restore_fl = xen_restore_fl,
+	.irq_disable = xen_irq_disable,
+	.irq_enable = xen_irq_enable,
+	.safe_halt = xen_safe_halt,
+	.halt = xen_halt,
+#ifdef CONFIG_X86_64
+	.adjust_exception_frame = xen_adjust_exception_frame,
+#endif
+};
+
+void __init xen_init_irq_ops(void)
+{
+	pv_irq_ops = xen_irq_ops;
+}
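The save_fl/restore_fl pair above inverts and widens a one-byte event mask into an x86 IF-style flags word. A small stand-alone C check of that conversion (the X86_EFLAGS_IF value is the real one; everything else is scaffolding for the demo):

    #include <assert.h>
    #include <stdint.h>

    #define X86_EFLAGS_IF 0x200             /* interrupt-enable flag */

    /* mask has the opposite sense of IF; (-1 & IF) selects the bit */
    static unsigned long save_fl(uint8_t evtchn_upcall_mask)
    {
            unsigned long flags = !evtchn_upcall_mask;
            return (-flags) & X86_EFLAGS_IF; /* -0 -> 0, -1 -> all ones */
    }

    /* restore_fl direction: recover the mask from a flags word */
    static uint8_t mask_from_flags(unsigned long flags)
    {
            return !(flags & X86_EFLAGS_IF);
    }

    int main(void)
    {
            assert(save_fl(1) == 0);             /* masked: "interrupts off" */
            assert(save_fl(0) == X86_EFLAGS_IF); /* unmasked: "interrupts on" */
            assert(mask_from_flags(save_fl(0)) == 0);
            assert(mask_from_flags(save_fl(1)) == 1);
            return 0;
    }
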
*/ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = __xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops() +{ + pv_irq_ops = xen_irq_ops; +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da69..ae173f6edd8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,6 +40,7 @@ */ #include <linux/sched.h> #include <linux/highmem.h> +#include <linux/debugfs.h> #include <linux/bug.h> #include <asm/pgtable.h> @@ -57,6 +58,61 @@ #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +#define MMU_UPDATE_HISTO 30 + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct { + u32 pgd_update; + u32 pgd_update_pinned; + u32 pgd_update_batched; + + u32 pud_update; + u32 pud_update_pinned; + u32 pud_update_batched; + + u32 pmd_update; + u32 pmd_update_pinned; + u32 pmd_update_batched; + + u32 pte_update; + u32 pte_update_pinned; + u32 pte_update_batched; + + u32 mmu_update; + u32 mmu_update_extended; + u32 mmu_update_histo[MMU_UPDATE_HISTO]; + + u32 prot_commit; + u32 prot_commit_batched; + + u32 set_pte_at; + u32 set_pte_at_batched; + u32 set_pte_at_pinned; + u32 set_pte_at_current; + u32 set_pte_at_kernel; +} mmu_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mmu_stats, 0, sizeof(mmu_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); mmu_stats.elem += (val); } while(0) + +#else /* !CONFIG_XEN_DEBUG_FS */ + +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +#endif /* CONFIG_XEN_DEBUG_FS */ /* * Just beyond the highest usermode address. 
STACK_TOP_MAX has a @@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr) } -static bool page_pinned(void *ptr) +static bool xen_page_pinned(void *ptr) { struct page *page = virt_to_page(ptr); return PagePinned(page); } -static void extend_mmu_update(const struct mmu_update *update) +static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - if (mcs.mc != NULL) + if (mcs.mc != NULL) { + ADD_STATS(mmu_update_extended, 1); + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); + mcs.mc->args[1]++; - else { + + if (mcs.mc->args[1] < MMU_UPDATE_HISTO) + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); + else + ADD_STATS(mmu_update_histo[0], 1); + } else { + ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; @@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); + + ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) void xen_set_pmd(pmd_t *ptr, pmd_t val) { + ADD_STATS(pmd_update, 1); + /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } + ADD_STATS(pmd_update_pinned, 1); + xen_set_pmd_hyper(ptr, val); } @@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) preempt_disable(); + ADD_STATS(set_pte_at, 1); +// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); + ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + ADD_STATS(set_pte_at_batched, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); goto out; } else @@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); + + ADD_STATS(prot_commit, 1); + ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); + + ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) void xen_set_pud(pud_t *ptr, pud_t val) { + ADD_STATS(pud_update, 1); + /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } + ADD_STATS(pud_update_pinned, 1); + xen_set_pud_hyper(ptr, val); } void xen_set_pte(pte_t *ptep, pte_t pte) { + ADD_STATS(pte_update, 1); +// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, 
paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + #ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); @@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); } /* @@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); + ADD_STATS(pgd_update, 1); + /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; if (user_ptr) { - WARN_ON(page_pinned(user_ptr)); + WARN_ON(xen_page_pinned(user_ptr)); *user_ptr = val; } return; } + ADD_STATS(pgd_update_pinned, 1); + ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + /* If it's pinned, then we can at least batch the kernel and user updates together. */ xen_mc_batch(); @@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), - unsigned long limit) +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) { + pgd_t *pgd = mm->pgd; int flush = 0; unsigned hole_low, hole_high; unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; @@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmdidx_limit = 0; #endif - flush |= (*func)(virt_to_page(pgd), PT_PGD); - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; @@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; @@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { struct page *pte; @@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), continue; pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(pte, PT_PTE); + flush |= (*func)(mm, pte, PT_PTE); } } } + out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } -static spinlock_t *lock_pte(struct page *page) +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. 
*/ +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); - spin_lock(ptl); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, USER_LIMIT)) { + if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ xen_mc_issue(0); } +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + /* * On save, we need to pin all pagetables to make sure they get their * mfns turned into pfns. 
Search the list for any unpinned pgds and pin * them (unpinned pgds are not currently in use, probably because the * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. */ void xen_mm_pin_all(void) { @@ -762,7 +895,7 @@ void xen_mm_pin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { - xen_pgd_pin((pgd_t *)page_address(page)); + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); SetPageSavePinned(page); } } @@ -775,7 +908,8 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; @@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page, mm); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); @@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif - pgd_walk(pgd, unpin_page, USER_LIMIT); + xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + /* * On resume, undo any pinning done at save, so that the rest of the * kernel doesn't see any unexpected pinned pagetables. 
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - xen_pgd_unpin((pgd_t *)page_address(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); ClearPageSavePinned(page); } } @@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info) } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_t mask; unsigned cpu; @@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm) smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm) void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (page_pinned(mm->pgd)) - xen_pgd_unpin(mm->pgd); + if (xen_page_pinned(mm->pgd)) + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mmu_debug; + +static int __init xen_mmu_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); + + debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); + debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + + debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); + debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + + debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); + debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + + debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); +// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, +// &mmu_stats.pte_update_pinned); + debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, + &mmu_stats.pte_update_pinned); + + debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); + debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, + &mmu_stats.mmu_update_extended); + xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, + mmu_stats.mmu_update_histo, 20); + + debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); + debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_batched); + debugfs_create_u32("set_pte_at_current", 0444, 
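The pinning comment in xen_pin_page() is the constraint behind all the USE_SPLIT_PTLOCKS tests added in this series: the preempt counter reserves only 8 bits for held locks, so a walk that locked every PTE page at once could overflow it. A back-of-the-envelope sketch (the constants below are illustrative assumptions, not values from the patch):

    #include <stdio.h>

    int main(void)
    {
            int preempt_lock_bits = 8;                 /* per the comment above */
            int max_held = (1 << preempt_lock_bits) - 1; /* 255 nested locks */
            long span = 1L << 32;                      /* 4GB of address space */
            long per_pte_page = 4096L * 512;           /* one PTE page of 64-bit
                                                          entries maps 2MB */
            long pte_pages = span / per_pte_page;      /* locks if all held */

            printf("locks needed: %ld, safe maximum: %d\n", pte_pages, max_held);
            /* 2048 > 255, hence lock, pin and unlock one PTE page at a time */
            return 0;
    }
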
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e..98d71659da5 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
 void xen_exit_mmap(struct mm_struct *mm);

-void xen_pgd_pin(pgd_t *pgd);
-//void xen_pgd_unpin(pgd_t *pgd);
-
 pteval_t xen_pte_val(pte_t);
 pmdval_t xen_pmd_val(pmd_t);
 pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c977..8ea8a0d0b0d 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
  */
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/debugfs.h>

 #include <asm/xen/hypercall.h>

 #include "multicalls.h"
+#include "debugfs.h"
+
+#define MC_BATCH	32

 #define MC_DEBUG	1

-#define MC_BATCH	32
 #define MC_ARGS		(MC_BATCH * 16)

+
 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
 #if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
 DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);

+/* flush reasons 0- slots, 1- args, 2- callbacks */
+enum flush_reasons
+{
+	FL_SLOTS,
+	FL_ARGS,
+	FL_CALLBACKS,
+
+	FL_N_REASONS
+};
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#define NHYPERCALLS	40		/* not really */
+
+static struct {
+	unsigned histo[MC_BATCH+1];
+
+	unsigned issued;
+	unsigned arg_total;
+	unsigned hypercalls;
+	unsigned histo_hypercalls[NHYPERCALLS];
+
+	unsigned flush[FL_N_REASONS];
+} mc_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+	if (unlikely(zero_stats)) {
+		memset(&mc_stats, 0, sizeof(mc_stats));
+		zero_stats = 0;
+	}
+}
+
+static void mc_add_stats(const struct mc_buffer *mc)
+{
+	int i;
+
+	check_zero();
+
+	mc_stats.issued++;
+	mc_stats.hypercalls += mc->mcidx;
+	mc_stats.arg_total += mc->argidx;
+
+	mc_stats.histo[mc->mcidx]++;
+	for(i = 0; i < mc->mcidx; i++) {
+		unsigned op = mc->entries[i].op;
+		if (op < NHYPERCALLS)
+			mc_stats.histo_hypercalls[op]++;
+	}
+}
+
+static void mc_stats_flush(enum flush_reasons idx)
+{
+	check_zero();
+
+	mc_stats.flush[idx]++;
+}
+
+#else  /* !CONFIG_XEN_DEBUG_FS */
+
+static inline void mc_add_stats(const struct mc_buffer *mc)
+{
+}
+
+static inline void mc_stats_flush(enum flush_reasons idx)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
 void xen_mc_flush(void)
 {
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
 	   something in the middle */
 	local_irq_save(flags);

+	mc_add_stats(b);
+
 	if (b->mcidx) {
 #if MC_DEBUG
 		memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)

 	if (b->mcidx == MC_BATCH ||
 	    (argidx + args) > MC_ARGS) {
+		mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
 		xen_mc_flush();
 		argidx = roundup(b->argidx, sizeof(u64));
 	}
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	struct callback *cb;

-	if (b->cbidx == MC_BATCH)
+	if (b->cbidx == MC_BATCH) {
+		mc_stats_flush(FL_CALLBACKS);
 		xen_mc_flush();
+	}

 	cb = &b->callbacks[b->cbidx++];
 	cb->fn = fn;
 	cb->data = data;
 }
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_mc_debug;
+
+static int __init xen_mc_debugfs(void)
+{
+	struct dentry *d_xen = xen_init_debugfs();
+
+	if (d_xen == NULL)
+		return -ENOMEM;
+
+	d_mc_debug = debugfs_create_dir("multicalls", d_xen);
+
+	debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
+
+	debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
+	debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
+	debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
+
+	xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
+				     mc_stats.histo, MC_BATCH);
+	xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
+				     mc_stats.histo_hypercalls, NHYPERCALLS);
+	xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
+				     mc_stats.flush, FL_N_REASONS);
+
+	return 0;
+}
+fs_initcall(xen_mc_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1..d77da613b1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
 * useful topology information for the kernel to make use of.  As a
 * result, all CPUs are treated as if they're single-core and
 * single-threaded.
- *
- * This does not handle HOTPLUG_CPU yet.
 */
 #include <linux/sched.h>
-#include <linux/kernel_stat.h>
 #include <linux/err.h>
 #include <linux/smp.h>

@@ -36,8 +33,6 @@
 #include "xen-ops.h"
 #include "mmu.h"

-static void __cpuinit xen_init_lock_cpu(int cpu);
-
 cpumask_t xen_cpu_initialized_map;

 static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }

-static __cpuinit void cpu_bringup_and_idle(void)
+static __cpuinit void cpu_bringup(void)
 {
 	int cpu = smp_processor_id();

 	cpu_init();
+	touch_softlockup_watchdog();
 	preempt_disable();

 	xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
 	local_irq_enable();

 	wmb();			/* make sure everything is out */
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+	cpu_bringup();
 	cpu_idle();
 }

@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)

 		cpu_set(cpu, cpu_present_map);
 	}
-
-	//init_xenbus_allowed_cpumask();
 }

 static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	struct task_struct *idle = idle_task(cpu);
 	int rc;

-#if 0
-	rc = cpu_up_check(cpu);
-	if (rc)
-		return rc;
-#endif
-
 #ifdef CONFIG_X86_64
 	/* Allocate node local memory for AP pdas */
 	WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
 {
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static int xen_cpu_disable(void)
+{
+	unsigned int cpu = smp_processor_id();
+	if (cpu == 0)
+		return -EBUSY;
+
+	cpu_disable_common();
+
+	load_cr3(swapper_pg_dir);
+	return 0;
+}
+
+static void xen_cpu_die(unsigned int cpu)
+{
+	while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+		current->state = TASK_UNINTERRUPTIBLE;
+		schedule_timeout(HZ/10);
+	}
+	unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+	xen_uninit_lock_cpu(cpu);
+	xen_teardown_timer(cpu);
+
+	if (num_online_cpus() == 1)
+		alternatives_smp_switch(0);
+}
+
+static void xen_play_dead(void)
+{
+	play_dead_common();
+	HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+	cpu_bringup();
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static int xen_cpu_disable(void)
+{
+	return -ENOSYS;
+}
+
+static void xen_cpu_die(unsigned int cpu)
+{
+	BUG();
+}
+
+static void xen_play_dead(void)
+{
+	BUG();
+}
+
+#endif
 static void stop_self(void *v)
 {
 	int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }

-struct xen_spinlock {
-	unsigned char lock;		/* 0 -> free; 1 -> locked */
-	unsigned short spinners;	/* count of waiting cpus */
-};
-
-static int xen_spin_is_locked(struct raw_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-	return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct raw_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-	/* Not strictly true; this is only the count of contended
-	   lock-takers entering the slow path. */
-	return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct raw_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-	u8 old = 1;
-
-	asm("xchgb %b0,%1"
-	    : "+q" (old), "+m" (xl->lock) : : "memory");
-
-	return old == 0;
-}
-
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-static inline void spinning_lock(struct xen_spinlock *xl)
-{
-	__get_cpu_var(lock_spinners) = xl;
-	wmb();			/* set lock of interest before count */
-	asm(LOCK_PREFIX " incw %0"
-	    : "+m" (xl->spinners) : : "memory");
-}
-
-static inline void unspinning_lock(struct xen_spinlock *xl)
-{
-	asm(LOCK_PREFIX " decw %0"
-	    : "+m" (xl->spinners) : : "memory");
-	wmb();			/* decrement count before clearing lock */
-	__get_cpu_var(lock_spinners) = NULL;
-}
-
-static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-	int irq = __get_cpu_var(lock_kicker_irq);
-	int ret;
-
-	/* If kicker interrupts not initialized yet, just spin */
-	if (irq == -1)
-		return 0;
-
-	/* announce we're spinning */
-	spinning_lock(xl);
-
-	/* clear pending */
-	xen_clear_irq_pending(irq);
-
-	/* check again make sure it didn't become free while
-	   we weren't looking  */
-	ret = xen_spin_trylock(lock);
-	if (ret)
-		goto out;
-
-	/* block until irq becomes pending */
-	xen_poll_irq(irq);
-	kstat_this_cpu.irqs[irq]++;
-
-out:
-	unspinning_lock(xl);
-	return ret;
-}
-
-static void xen_spin_lock(struct raw_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-	int timeout;
-	u8 oldval;
-
-	do {
-		timeout = 1 << 10;
-
-		asm("1: xchgb %1,%0\n"
-		    "   testb %1,%1\n"
-		    "   jz 3f\n"
-		    "2: rep;nop\n"
-		    "   cmpb $0,%0\n"
-		    "   je 1b\n"
-		    "   dec %2\n"
-		    "   jnz 2b\n"
-		    "3:\n"
-		    : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
-		    : "1" (1)
-		    : "memory");
-
-	} while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
-}
-
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		/* XXX should mix up next cpu selection */
-		if (per_cpu(lock_spinners, cpu) == xl) {
-			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
-			break;
-		}
-	}
-}
-
-static void xen_spin_unlock(struct raw_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-	smp_wmb();		/* make sure no writes get moved after unlock */
-	xl->lock = 0;		/* release lock */
-
-	/* make sure unlock happens before kick */
-	barrier();
-
-	if (unlikely(xl->spinners))
-		xen_spin_unlock_slow(xl);
-}
-
-static __cpuinit void xen_init_lock_cpu(int cpu)
-{
-	int irq;
-	const char *name;
-
-	name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
-	irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
-				     cpu,
-				     xen_reschedule_interrupt,
-				     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
-				     name,
-				     NULL);
-
-	if (irq >= 0) {
-		disable_irq(irq); /* make sure it's never delivered */
-		per_cpu(lock_kicker_irq, cpu) = irq;
-	}
-
-	printk("cpu %d spinlock event irq %d\n", cpu, irq);
-}
-
-static void __init xen_init_spinlocks(void)
-{
-	pv_lock_ops.spin_is_locked = xen_spin_is_locked;
-	pv_lock_ops.spin_is_contended = xen_spin_is_contended;
-	pv_lock_ops.spin_lock = xen_spin_lock;
-	pv_lock_ops.spin_trylock = xen_spin_trylock;
-	pv_lock_ops.spin_unlock = xen_spin_unlock;
-}
-
 static const struct smp_ops xen_smp_ops __initdata = {
 	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
 	.smp_prepare_cpus = xen_smp_prepare_cpus,
-	.cpu_up = xen_cpu_up,
 	.smp_cpus_done = xen_smp_cpus_done,

+	.cpu_up = xen_cpu_up,
+	.cpu_die = xen_cpu_die,
+	.cpu_disable = xen_cpu_disable,
+	.play_dead = xen_play_dead,
+
 	.smp_send_stop = xen_smp_send_stop,
 	.smp_send_reschedule = xen_smp_send_reschedule,
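The subtle point in the hotplug code above is where execution resumes after a dead vcpu is re-plugged. An annotated restatement of xen_play_dead() (the code is the patch's own; only the comments are editorial):

    static void xen_play_dead(void)
    {
            play_dead_common();     /* mark this CPU offline, final housekeeping */

            /* Ask the hypervisor to take the vcpu down.  The call does
             * not return until the vcpu is later brought back up... */
            HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);

            /* ...at which point execution continues here, so the CPU must
             * re-run its bringup steps.  This is why cpu_bringup() was
             * split out of cpu_bringup_and_idle(): the caller is already
             * inside cpu_idle() and must not enter it a second time. */
            cpu_bringup();
    }
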
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 00000000000..dd71e3a021c
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/kernel_stat.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+
+#include <asm/paravirt.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "debugfs.h"
+
+#ifdef CONFIG_XEN_DEBUG_FS
+static struct xen_spinlock_stats
+{
+	u64 taken;
+	u32 taken_slow;
+	u32 taken_slow_nested;
+	u32 taken_slow_pickup;
+	u32 taken_slow_spurious;
+	u32 taken_slow_irqenable;
+
+	u64 released;
+	u32 released_slow;
+	u32 released_slow_kicked;
+
+#define HISTO_BUCKETS	30
+	u32 histo_spin_total[HISTO_BUCKETS+1];
+	u32 histo_spin_spinning[HISTO_BUCKETS+1];
+	u32 histo_spin_blocked[HISTO_BUCKETS+1];
+
+	u64 time_total;
+	u64 time_spinning;
+	u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static unsigned lock_timeout = 1 << 10;
+#define TIMEOUT lock_timeout
+
+static inline void check_zero(void)
+{
+	if (unlikely(zero_stats)) {
+		memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+		zero_stats = 0;
+	}
+}
+
+#define ADD_STATS(elem, val)		\
+	do { check_zero(); spinlock_stats.elem += (val); } while(0)
+
+static inline u64 spin_time_start(void)
+{
+	return xen_clocksource_read();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+	unsigned index = ilog2(delta);
+
+	check_zero();
+
+	if (index < HISTO_BUCKETS)
+		array[index]++;
+	else
+		array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_spinning(u64 start)
+{
+	u32 delta = xen_clocksource_read() - start;
+
+	__spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
+	spinlock_stats.time_spinning += delta;
+}
+
+static inline void spin_time_accum_total(u64 start)
+{
+	u32 delta = xen_clocksource_read() - start;
+
+	__spin_time_accum(delta, spinlock_stats.histo_spin_total);
+	spinlock_stats.time_total += delta;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+	u32 delta = xen_clocksource_read() - start;
+
+	__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+	spinlock_stats.time_blocked += delta;
+}
+#else  /* !CONFIG_XEN_DEBUG_FS */
+#define TIMEOUT			(1 << 10)
+#define ADD_STATS(elem, val)	do { (void)(val); } while(0)
+
+static inline u64 spin_time_start(void)
+{
+	return 0;
+}
+
+static inline void spin_time_accum_total(u64 start)
+{
+}
+static inline void spin_time_accum_spinning(u64 start)
+{
+}
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
+struct xen_spinlock {
+	unsigned char lock;		/* 0 -> free; 1 -> locked */
+	unsigned short spinners;	/* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+	return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+	/* Not strictly true; this is only the count of contended
+	   lock-takers entering the slow path. */
+	return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	u8 old = 1;
+
+	asm("xchgb %b0,%1"
+	    : "+q" (old), "+m" (xl->lock) : : "memory");
+
+	return old == 0;
+}
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+/*
+ * Mark a cpu as interested in a lock.  Returns the CPU's previous
+ * lock of interest, in case we got preempted by an interrupt.
+ */
+static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
+{
+	struct xen_spinlock *prev;
+
+	prev = __get_cpu_var(lock_spinners);
+	__get_cpu_var(lock_spinners) = xl;
+
+	wmb();			/* set lock of interest before count */
+
+	asm(LOCK_PREFIX " incw %0"
+	    : "+m" (xl->spinners) : : "memory");
+
+	return prev;
+}
+
+/*
+ * Mark a cpu as no longer interested in a lock.  Restores previous
+ * lock of interest (NULL for none).
+ */
+static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
+{
+	asm(LOCK_PREFIX " decw %0"
+	    : "+m" (xl->spinners) : : "memory");
+	wmb();			/* decrement count before restoring lock */
+	__get_cpu_var(lock_spinners) = prev;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	struct xen_spinlock *prev;
+	int irq = __get_cpu_var(lock_kicker_irq);
+	int ret;
+	unsigned long flags;
+	u64 start;
+
+	/* If kicker interrupts not initialized yet, just spin */
+	if (irq == -1)
+		return 0;
+
+	start = spin_time_start();
+
+	/* announce we're spinning */
+	prev = spinning_lock(xl);
+
+	flags = __raw_local_save_flags();
+	if (irq_enable) {
+		ADD_STATS(taken_slow_irqenable, 1);
+		raw_local_irq_enable();
+	}
+
+	ADD_STATS(taken_slow, 1);
+	ADD_STATS(taken_slow_nested, prev != NULL);
+
+	do {
+		/* clear pending */
+		xen_clear_irq_pending(irq);
+
+		/* check again to make sure it didn't become free while
+		   we weren't looking  */
+		ret = xen_spin_trylock(lock);
+		if (ret) {
+			ADD_STATS(taken_slow_pickup, 1);
+
+			/*
+			 * If we interrupted another spinlock while it
+			 * was blocking, make sure it doesn't block
+			 * without rechecking the lock.
+			 */
+			if (prev != NULL)
+				xen_set_irq_pending(irq);
+			goto out;
+		}
+
+		/*
+		 * Block until irq becomes pending.  If we're
+		 * interrupted at this point (after the trylock but
+		 * before entering the block), then the nested lock
+		 * handler guarantees that the irq will be left
+		 * pending if there's any chance the lock became free;
+		 * xen_poll_irq() returns immediately if the irq is
+		 * pending.
+		 */
+		xen_poll_irq(irq);
+		ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
+	} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
+
+	kstat_this_cpu.irqs[irq]++;
+
+out:
+	raw_local_irq_restore(flags);
+	unspinning_lock(xl, prev);
+	spin_time_accum_blocked(start);
+
+	return ret;
+}
+
+static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	unsigned timeout;
+	u8 oldval;
+	u64 start_spin;
+
+	ADD_STATS(taken, 1);
+
+	start_spin = spin_time_start();
+
+	do {
+		u64 start_spin_fast = spin_time_start();
+
+		timeout = TIMEOUT;
+
+		asm("1: xchgb %1,%0\n"
+		    "   testb %1,%1\n"
+		    "   jz 3f\n"
+		    "2: rep;nop\n"
+		    "   cmpb $0,%0\n"
+		    "   je 1b\n"
+		    "   dec %2\n"
+		    "   jnz 2b\n"
+		    "3:\n"
+		    : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+		    : "1" (1)
+		    : "memory");
+
+		spin_time_accum_spinning(start_spin_fast);
+
+	} while (unlikely(oldval != 0 &&
+			  (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
+
+	spin_time_accum_total(start_spin);
+}
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+	__xen_spin_lock(lock, false);
+}
+
+static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+{
+	__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+	int cpu;
+
+	ADD_STATS(released_slow, 1);
+
+	for_each_online_cpu(cpu) {
+		/* XXX should mix up next cpu selection */
+		if (per_cpu(lock_spinners, cpu) == xl) {
+			ADD_STATS(released_slow_kicked, 1);
+			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+			break;
+		}
+	}
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+	ADD_STATS(released, 1);
+
+	smp_wmb();		/* make sure no writes get moved after unlock */
+	xl->lock = 0;		/* release lock */
+
+	/* make sure unlock happens before kick */
+	barrier();
+
+	if (unlikely(xl->spinners))
+		xen_spin_unlock_slow(xl);
+}
+
+static irqreturn_t dummy_handler(int irq, void *dev_id)
+{
+	BUG();
+	return IRQ_HANDLED;
+}
+
+void __cpuinit xen_init_lock_cpu(int cpu)
+{
+	int irq;
+	const char *name;
+
+	name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+	irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+				     cpu,
+				     dummy_handler,
+				     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+				     name,
+				     NULL);
+
+	if (irq >= 0) {
+		disable_irq(irq); /* make sure it's never delivered */
+		per_cpu(lock_kicker_irq, cpu) = irq;
+	}
+
+	printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+void xen_uninit_lock_cpu(int cpu)
+{
+	unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+}
+
+void __init xen_init_spinlocks(void)
+{
+	pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+	pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+	pv_lock_ops.spin_lock = xen_spin_lock;
+	pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
+	pv_lock_ops.spin_trylock = xen_spin_trylock;
+	pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_spin_debug;
+
+static int __init xen_spinlock_debugfs(void)
+{
+	struct dentry *d_xen = xen_init_debugfs();
+
+	if (d_xen == NULL)
+		return -ENOMEM;
+
+	d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
+
+	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+	debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
+
+	debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
+	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+			   &spinlock_stats.taken_slow);
+	debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
+			   &spinlock_stats.taken_slow_nested);
+	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+			   &spinlock_stats.taken_slow_pickup);
+	debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
+			   &spinlock_stats.taken_slow_spurious);
+	debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
+			   &spinlock_stats.taken_slow_irqenable);
+
+	debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
+	debugfs_create_u32("released_slow", 0444, d_spin_debug,
+			   &spinlock_stats.released_slow);
+	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+			   &spinlock_stats.released_slow_kicked);
+
+	debugfs_create_u64("time_spinning", 0444, d_spin_debug,
+			   &spinlock_stats.time_spinning);
+	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+			   &spinlock_stats.time_blocked);
+	debugfs_create_u64("time_total", 0444, d_spin_debug,
+			   &spinlock_stats.time_total);
+
+	xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
+				     spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
+	xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
+				     spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
+	xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+				     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+	return 0;
+}
+fs_initcall(xen_spinlock_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc..004ba86326a 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
 #define TIMER_SLOP	100000
 #define NS_PER_TICK	(1000000000LL / HZ)

-static cycle_t xen_clocksource_read(void);
-
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

@@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void)
 	return xen_khz;
 }

-static cycle_t xen_clocksource_read(void)
+cycle_t xen_clocksource_read(void)
 {
 	struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
 	setup_runstate_info(cpu);
 }

+void xen_teardown_timer(int cpu)
+{
+	struct clock_event_device *evt;
+	BUG_ON(cpu == 0);
+	evt = &per_cpu(xen_clock_events, cpu);
+	unbind_from_irqhandler(evt->irq, NULL);
+}
+
 void xen_setup_cpu_clockevents(void)
 {
 	BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41d..42786f59d9c 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
 	push %eax
 	push %ecx
 	push %edx
-	call force_evtchn_callback
+	call xen_force_evtchn_callback
 	pop %edx
 	pop %ecx
 	pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb..05794c566e8 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
 /* Pseudo-flag used for virtual NMI, which we don't implement yet */
 #define XEN_EFLAGS_NMI	0x80000000

-#if 0
-#include <asm/percpu.h>
+#if 1
+/*
+	x86-64 does not yet support direct access to percpu variables
+	via a segment override, so we just need to make sure this code
+	never gets used
+ */
+#define BUG			ud2a
+#define PER_CPU_VAR(var, off)	0xdeadbeef
+#endif

 /* Enable events.  This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
    events, then enter the hypervisor to get them handled. */
 ENTRY(xen_irq_enable_direct)
+	BUG
+
 	/* Unmask events */
 	movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
    non-zero. */
 ENTRY(xen_irq_disable_direct)
+	BUG
+
 	movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
 ENDPATCH(xen_irq_disable_direct)
 	ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
    Xen and x86 use opposite senses (mask vs enable). */
 ENTRY(xen_save_fl_direct)
+	BUG
+
 	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
 	setz %ah
 	addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
    if so. */
 ENTRY(xen_restore_fl_direct)
+	BUG
+
 	testb $X86_EFLAGS_IF>>8, %ah
 	setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
 	/* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
 	push %r9
 	push %r10
 	push %r11
-	call force_evtchn_callback
+	call xen_force_evtchn_callback
 	pop %r11
 	pop %r10
 	pop %r9
@@ -133,7 +148,6 @@ check_events:
 	pop %rcx
 	pop %rax
 	ret
-#endif

 ENTRY(xen_adjust_exception_frame)
 	mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2..d7422dc2a55 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
 #define XEN_OPS_H

 #include <linux/init.h>
+#include <linux/clocksource.h>
 #include <linux/irqreturn.h>
 #include <xen/xen-ops.h>

@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);

 void __init xen_build_dynamic_phys_to_machine(void);

+void xen_init_irq_ops(void);
 void xen_setup_timer(int cpu);
+void xen_teardown_timer(int cpu);
+cycle_t xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
 unsigned long xen_tsc_khz(void);
 void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);

 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_init_spinlocks(void);
+__cpuinit void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+
 extern cpumask_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
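Finally, the __spin_time_accum() helpers above bucket latencies by ilog2 and saturate at HISTO_BUCKETS. A stand-alone sketch of that bucketing (using __builtin_clzll in place of the kernel's ilog2()):

    #include <stdio.h>
    #include <stdint.h>

    #define HISTO_BUCKETS 30

    /* log2-sized buckets; everything past 2^30 lands in the last slot */
    static unsigned bucket(uint64_t delta)
    {
            unsigned index = 63 - __builtin_clzll(delta | 1);  /* ~ilog2 */
            return index < HISTO_BUCKETS ? index : HISTO_BUCKETS;
    }

    int main(void)
    {
            printf("%u %u %u\n", bucket(1), bucket(1000), bucket(1ULL << 40));
            /* prints: 0 9 30 */
            return 0;
    }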