aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/ia32/ia32entry.S3
-rw-r--r--arch/x86/include/asm/atomic_32.h236
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/irq_vectors.h8
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h6
-rw-r--r--arch/x86/include/asm/msr.h23
-rw-r--r--arch/x86/include/asm/perf_counter.h100
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/termios.h1
-rw-r--r--arch/x86/include/asm/unistd_32.h1
-rw-r--r--arch/x86/include/asm/unistd_64.h3
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/kernel/apic/apic.c3
-rw-r--r--arch/x86/kernel/apic/io_apic.c6
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c1
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1704
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/entry_64.S5
-rw-r--r--arch/x86/kernel/irq.c10
-rw-r--r--arch/x86/kernel/irqinit.c15
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/smp.c3
-rw-r--r--arch/x86/kernel/syscall_table_32.S1
-rw-r--r--arch/x86/kernel/traps.c12
-rw-r--r--arch/x86/kvm/Kconfig6
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/i8254.c109
-rw-r--r--arch/x86/kvm/i8254.h12
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c251
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c194
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h16
-rw-r--r--arch/x86/kvm/svm.c415
-rw-r--r--arch/x86/kvm/timer.c46
-rw-r--r--arch/x86/kvm/vmx.c721
-rw-r--r--arch/x86/kvm/x86.c409
-rw-r--r--arch/x86/kvm/x86.h14
-rw-r--r--arch/x86/kvm/x86_emulate.c141
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/msr-on-cpu.c97
-rw-r--r--arch/x86/lib/msr.c183
-rw-r--r--arch/x86/mm/fault.c12
-rw-r--r--arch/x86/mm/memtest.c9
-rw-r--r--arch/x86/oprofile/nmi_int.c7
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c7
60 files changed, 3713 insertions, 1250 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aafae3b140d..68f5578fe38 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -739,6 +739,7 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+ select HAVE_PERF_COUNTERS if (!M386 && !M486)
config X86_IO_APIC
def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index dcef387ddc3..e590261ba05 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,10 +825,11 @@ ia32_sys_call_table:
.quad compat_sys_signalfd4
.quad sys_eventfd2
.quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
+ .quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
.quad compat_sys_preadv
.quad compat_sys_pwritev
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
+ .quad sys_perf_counter_open
ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba422..aff9f1fcdcd 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
#define smp_mb__before_atomic_inc() barrier()
#define smp_mb__after_atomic_inc() barrier()
+/* An 64bit atomic type */
+
+typedef struct {
+ unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr) ((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+ asm volatile(
+
+ LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+ : "=A" (old)
+
+ : [ptr] "D" (ptr),
+ "A" (old),
+ "b" (ll_low(new)),
+ "c" (ll_high(new))
+
+ : "memory");
+
+ return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+ unsigned long long new_val)
+{
+ return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ * @old_val: old value that was there
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
+{
+ unsigned long long old_val;
+
+ do {
+ old_val = atomic_read(ptr);
+ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+ return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+ atomic64_xchg(ptr, new_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+ unsigned long long curr_val;
+
+ do {
+ curr_val = __atomic64_read(ptr);
+ } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+ return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+ unsigned long long old_val, new_val;
+
+ do {
+ old_val = atomic_read(ptr);
+ new_val = old_val + delta;
+
+ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+ return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+ return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+ return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+ return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+ atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+ atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+ unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+ return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+ atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+ atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+ long long old_val = atomic64_add_return(delta, ptr);
+
+ return old_val < 0;
+}
+
#include <asm-generic/atomic.h>
#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 19af42138f7..4a28d22d479 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -116,6 +116,8 @@
#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
#define X86_FEATURE_AES (4*32+25) /* AES instructions */
#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf25..d750a10ccad 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
#ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
#endif
#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f98..9ebc5c25503 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
unsigned int irq_spurious_count;
#endif
unsigned int generic_irqs; /* arch dependent */
+ unsigned int apic_perf_irqs;
+ unsigned int apic_pending_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 3bd1777a4c8..6df45f63966 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,8 @@
extern void apic_timer_interrupt(void);
extern void generic_interrupt(void);
extern void error_interrupt(void);
+extern void perf_pending_interrupt(void);
+
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2..00000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
- (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-union cpuid10_eax {
- struct {
- unsigned int version_id:8;
- unsigned int num_counters:8;
- unsigned int bit_width:8;
- unsigned int mask_length:8;
- } split;
- unsigned int full;
-};
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 910b5a3d675..e997be98c9b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -108,14 +108,14 @@
#define LOCAL_TIMER_VECTOR 0xef
/*
- * Performance monitoring interrupt vector:
+ * Generic system vector for platform specific use
*/
-#define LOCAL_PERF_VECTOR 0xee
+#define GENERIC_INTERRUPT_VECTOR 0xed
/*
- * Generic system vector for platform specific use
+ * Performance monitoring pending work vector:
*/
-#define GENERIC_INTERRUPT_VECTOR 0xed
+#define LOCAL_PENDING_VECTOR 0xec
/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf1170..125be8b1956 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
#define __KVM_HAVE_MSI
#define __KVM_HAVE_USER_NMI
#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044f..eabdc1cfab5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
unsigned access:3;
unsigned invalid:1;
unsigned cr4_pge:1;
+ unsigned nxe:1;
};
};
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
bool unsync;
- bool global;
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
union kvm_mmu_page_role base_role;
u64 *pae_root;
+ u64 rsvd_bits_mask[2][4];
};
struct kvm_vcpu_arch {
u64 host_tsc;
- int interrupt_window_open;
- unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
- DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
u64 shadow_efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
+ int32_t apic_arb_prio;
int mp_state;
int sipi_vector;
u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
struct kvm_pio_request pio;
void *pio_data;
+ u8 event_exit_inst_len;
+
struct kvm_queued_exception {
bool pending;
bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
struct kvm_queued_interrupt {
bool pending;
+ bool soft;
u8 nr;
} interrupt;
struct {
- int active;
+ int vm86_active;
u8 save_iopl;
struct kvm_save_segment {
u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
unsigned int time_offset;
struct page *time_page;
+ bool singlestep; /* guest is single stepped by KVM */
bool nmi_pending;
bool nmi_injected;
- bool nmi_window_open;
struct mtrr_state_type mtrr_state;
u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
*/
struct list_head active_mmu_pages;
struct list_head assigned_dev_head;
- struct list_head oos_global_pages;
struct iommu_domain *iommu_domain;
+ int iommu_flags;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
int vapics_in_nmi_mode;
- int round_robin_prev_vcpu;
unsigned int tss_addr;
struct page *apic_access_page;
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 mmu_unsync;
- u32 mmu_unsync_global;
u32 remote_tlb_flush;
u32 lpages;
};
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
- u32 request_nmi_exits;
u32 irq_exits;
u32 host_state_reload;
u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
- int (*get_irq)(struct kvm_vcpu *vcpu);
- void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+ void (*set_irq)(struct kvm_vcpu *vcpu);
+ void (*set_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code);
- bool (*exception_injected)(struct kvm_vcpu *vcpu);
- void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
- void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
- struct kvm_run *run);
-
+ int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+ int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
- int (*get_mt_mask_shift)(void);
+ u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
@@ -563,6 +565,7 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
unsigned long cr2, u16 error_code, int emulation_type);
void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -769,6 +771,8 @@ enum {
#define HF_GIF_MASK (1 << 0)
#define HF_HIF_MASK (1 << 1)
#define HF_VINTR_MASK (1 << 2)
+#define HF_NMI_MASK (1 << 3)
+#define HF_IRET_MASK (1 << 4)
/*
* Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881..b7ed2c42311 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
struct fetch_cache fetch;
};
+#define X86_SHADOW_INT_MOV_SS 1
+#define X86_SHADOW_INT_STI 2
+
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
int mode;
u32 cs_base;
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
/* decode cache */
struct decode_cache decode;
};
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf624180..22603764e7d 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,6 +12,17 @@
#include <asm/asm.h>
#include <asm/errno.h>
+#include <asm/cpumask.h>
+
+struct msr {
+ union {
+ struct {
+ u32 l;
+ u32 h;
+ };
+ u64 q;
+ };
+};
static inline unsigned long long native_read_tscp(unsigned int *aux)
{
@@ -216,6 +227,8 @@ do { \
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
#else /* CONFIG_SMP */
@@ -229,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
wrmsr(msr_no, l, h);
return 0;
}
+static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+ struct msr *msrs)
+{
+ rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+}
+static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+ struct msr *msrs)
+{
+ wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+}
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
u32 *l, u32 *h)
{
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 00000000000..876ed97147b
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC 8
+#define X86_PMC_MAX_FIXED 3
+
+#define X86_PMC_IDX_GENERIC 0
+#define X86_PMC_IDX_FIXED 32
+#define X86_PMC_IDX_MAX 64
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK 0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_edx {
+ struct {
+ unsigned int num_counters_fixed:4;
+ unsigned int reserved:28;
+ } split;
+ unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
+
+extern void set_perf_counter_pending(void);
+
+#define clear_perf_counter_pending() do { } while (0)
+#define test_perf_counter_pending() (0)
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(void);
+#else
+static inline void init_hw_perf_counters(void) { }
+static inline void perf_counters_lapic_init(void) { }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3eb..85574b7c1bc 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EVTINJ_VALID_ERR (1 << 11)
#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c4..c4ee8056bac 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+ get_user(termios->c_line, &termio->c_line);
return copy_from_user(termios->c_cc, termio->c_cc, NCC);
}
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 708dae61262..732a3070615 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,6 +341,7 @@
#define __NR_preadv 333
#define __NR_pwritev 334
#define __NR_rt_tgsigqueueinfo 335
+#define __NR_perf_counter_open 336
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4e2b0540440..900e1617e67 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,7 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
__SYSCALL(__NR_pwritev, sys_pwritev)
#define __NR_rt_tgsigqueueinfo 297
__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-
+#define __NR_perf_counter_open 298
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b..11be5ad2e0e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_EPT_VIOLATION 48
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a4c9cf0bf70..076d3881f3d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
* Mikael Pettersson : PM converted to driver model.
*/
+#include <linux/perf_counter.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/perf_counter.h>
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
@@ -1187,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
+ perf_counters_lapic_init();
preempt_disable();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac42ab..94605e7f6a5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
struct irq_cfg *cfg;
struct irq_desc *desc;
int count;
+ int node;
int i;
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
+ node= cpu_to_node(boot_cpu_id);
for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
- alloc_bootmem_cpumask_var(&cfg[i].domain);
- alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+ alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e..3efcb2b96a1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
#
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
-obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b0517aa2bd3..3ffdcfa9abd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
#include <linux/io.h>
#include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
#include <asm/mmu_context.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -874,6 +875,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
+ init_hw_perf_counters();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 09dd1d414fc..289cc481502 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
out2:
atomic_dec(&mce_entry);
}
+EXPORT_SYMBOL_GPL(do_machine_check);
#ifdef CONFIG_X86_MCE_INTEL
/***
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 00000000000..895c82e7845
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1704 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_counter_mask __read_mostly;
+
+struct cpu_hw_counters {
+ struct perf_counter *counters[X86_PMC_IDX_MAX];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long interrupts;
+ int enabled;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+ const char *name;
+ int version;
+ int (*handle_irq)(struct pt_regs *);
+ void (*disable_all)(void);
+ void (*enable_all)(void);
+ void (*enable)(struct hw_perf_counter *, int);
+ void (*disable)(struct hw_perf_counter *, int);
+ unsigned eventsel;
+ unsigned perfctr;
+ u64 (*event_map)(int);
+ u64 (*raw_event)(u64);
+ int max_events;
+ int num_counters;
+ int num_counters_fixed;
+ int counter_bits;
+ u64 counter_mask;
+ u64 max_period;
+ u64 intel_ctrl;
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+ .enabled = 1,
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+};
+
+static u64 intel_pmu_event_map(int event)
+{
+ return intel_perfmon_event_map[event];
+}
+
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
+#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK \
+ (CORE_EVNTSEL_EVENT_MASK | \
+ CORE_EVNTSEL_UNIT_MASK | \
+ CORE_EVNTSEL_EDGE_MASK | \
+ CORE_EVNTSEL_INV_MASK | \
+ CORE_EVNTSEL_COUNTER_MASK)
+
+ return event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_0f_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+ return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
+#define K7_EVNTSEL_INV_MASK 0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK \
+ (K7_EVNTSEL_EVENT_MASK | \
+ K7_EVNTSEL_UNIT_MASK | \
+ K7_EVNTSEL_EDGE_MASK | \
+ K7_EVNTSEL_INV_MASK | \
+ K7_EVNTSEL_COUNTER_MASK)
+
+ return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ int shift = 64 - x86_pmu.counter_bits;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous counter value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic counter atomically:
+ */
+again:
+ prev_raw_count = atomic64_read(&hwc->prev_count);
+ rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+ if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (counter-)time and add that to the generic counter.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ atomic64_add(delta, &counter->count);
+ atomic64_sub(delta, &hwc->period_left);
+
+ return new_raw_count;
+}
+
+static atomic_t active_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+ int i;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ disable_lapic_nmi_watchdog();
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+ goto perfctr_fail;
+ }
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+ goto eventsel_fail;
+ }
+
+ return true;
+
+eventsel_fail:
+ for (i--; i >= 0; i--)
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+
+ i = x86_pmu.num_counters;
+
+perfctr_fail:
+ for (i--; i >= 0; i--)
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+
+ return false;
+}
+
+static void release_pmc_hardware(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+ if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+static inline int x86_pmu_initialized(void)
+{
+ return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ unsigned int cache_type, cache_op, cache_result;
+ u64 config, val;
+
+ config = attr->config;
+
+ cache_type = (config >> 0) & 0xff;
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+ return -EINVAL;
+
+ cache_op = (config >> 8) & 0xff;
+ if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+ return -EINVAL;
+
+ cache_result = (config >> 16) & 0xff;
+ if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+
+ val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+ if (val == 0)
+ return -ENOENT;
+
+ if (val == -1)
+ return -EINVAL;
+
+ hwc->config |= val;
+
+ return 0;
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+ struct perf_counter_attr *attr = &counter->attr;
+ struct hw_perf_counter *hwc = &counter->hw;
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ atomic_inc(&active_counters);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ if (err)
+ return err;
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to.
+ */
+ if (!attr->exclude_user)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!attr->exclude_kernel)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (!hwc->sample_period) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ counter->destroy = hw_perf_counter_destroy;
+
+ /*
+ * Raw event type provide the config in the event structure
+ */
+ if (attr->type == PERF_TYPE_RAW) {
+ hwc->config |= x86_pmu.raw_event(attr->config);
+ return 0;
+ }
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, attr);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+ /*
+ * The generic map:
+ */
+ hwc->config |= x86_pmu.event_map(attr->config);
+
+ return 0;
+}
+
+static void intel_pmu_disable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ /*
+ * ensure we write the disable before we start disabling the
+ * counters proper, so that amd_pmu_enable_counter() does the
+ * right thing.
+ */
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_disable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ return x86_pmu.disable_all();
+}
+
+static void intel_pmu_enable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static void amd_pmu_enable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+ continue;
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_enable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ int err;
+ err = checking_wrmsrl(hwc->config_base + idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ int err;
+ err = checking_wrmsrl(hwc->config_base + idx,
+ hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+ int err;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_disable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static int
+x86_perf_counter_set_period(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ s64 left = atomic64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int err, ret = 0;
+
+ /*
+ * If we are way outside a reasoable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+ * Quirk: certain CPUs dont like it if just 1 event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw counter starts counting from this counter offset,
+ * mark it to be able to extra future deltas:
+ */
+ atomic64_set(&hwc->prev_count, (u64)-left);
+
+ err = checking_wrmsrl(hwc->counter_base + idx,
+ (u64)(-left) & x86_pmu.counter_mask);
+
+ return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+ int err;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ x86_pmu_enable_counter(hwc, idx);
+ else
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+{
+ unsigned int event;
+
+ if (!x86_pmu.num_counters_fixed)
+ return -1;
+
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+ return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+ return X86_PMC_IDX_FIXED_CPU_CYCLES;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+ return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+ return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
+static int x86_pmu_enable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx;
+
+ idx = fixed_mode_idx(counter, hwc);
+ if (idx >= 0) {
+ /*
+ * Try to get the fixed counter, if that is already taken
+ * then try to get a generic counter:
+ */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ goto try_generic;
+
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->counter_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ hwc->idx = idx;
+ } else {
+ idx = hwc->idx;
+ /* Try to get the previous generic counter again */
+ if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+ idx = find_first_zero_bit(cpuc->used_mask,
+ x86_pmu.num_counters);
+ if (idx == x86_pmu.num_counters)
+ return -EAGAIN;
+
+ set_bit(idx, cpuc->used_mask);
+ hwc->idx = idx;
+ }
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->counter_base = x86_pmu.perfctr;
+ }
+
+ perf_counters_lapic_init();
+
+ x86_pmu.disable(hwc, idx);
+
+ cpuc->counters[idx] = counter;
+ set_bit(idx, cpuc->active_mask);
+
+ x86_perf_counter_set_period(counter, hwc, idx);
+ x86_pmu.enable(hwc, idx);
+
+ return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+ cpuc->counters[hwc->idx] != counter))
+ return;
+
+ x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_counter_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ struct cpu_hw_counters *cpuc;
+ unsigned long flags;
+ int cpu, idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ if (x86_pmu.version >= 2) {
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ }
+ pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+ rdmsrl(x86_pmu.perfctr + idx, pmc_count);
+
+ prev_left = per_cpu(prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+ local_irq_restore(flags);
+}
+
+static void x86_pmu_disable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+
+ /*
+ * Must be done before we disable, otherwise the nmi handler
+ * could reenable again:
+ */
+ clear_bit(idx, cpuc->active_mask);
+ x86_pmu.disable(hwc, idx);
+
+ /*
+ * Make sure the cleared pointer becomes visible before we
+ * (potentially) free the counter:
+ */
+ barrier();
+
+ /*
+ * Drain the remaining delta count out of a counter
+ * that we are disabling:
+ */
+ x86_perf_counter_update(counter, hwc, idx);
+ cpuc->counters[idx] = NULL;
+ clear_bit(idx, cpuc->used_mask);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+ int ret;
+
+ x86_perf_counter_update(counter, hwc, idx);
+ ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ intel_pmu_enable_counter(hwc, idx);
+
+ return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+ checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
+
+ local_irq_restore(flags);
+}
+
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ int bit, cpu, loops;
+ u64 ack, status;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ perf_disable();
+ status = intel_pmu_get_status();
+ if (!status) {
+ perf_enable();
+ return 0;
+ }
+
+ loops = 0;
+again:
+ if (++loops > 100) {
+ WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+ perf_counter_print_debug();
+ intel_pmu_reset();
+ perf_enable();
+ return 1;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+ ack = status;
+ for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_counter *counter = cpuc->counters[bit];
+
+ clear_bit(bit, (unsigned long *) &status);
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(counter))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ intel_pmu_disable_counter(&counter->hw, bit);
+ }
+
+ intel_pmu_ack_status(ack);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+ perf_enable();
+
+ return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ int cpu, idx, handled = 0;
+ u64 val;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ counter = cpuc->counters[idx];
+ hwc = &counter->hw;
+
+ val = x86_perf_counter_update(counter, hwc, idx);
+ if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ continue;
+
+ /*
+ * counter overflow
+ */
+ handled = 1;
+ data.period = counter->hw.last_period;
+
+ if (!x86_perf_counter_set_period(counter, hwc, idx))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ amd_pmu_disable_counter(hwc, idx);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_pending_irqs);
+ perf_counter_do_pending();
+ irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+ apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
+void perf_counters_lapic_init(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+
+ if (!atomic_read(&active_counters))
+ return NOTIFY_DONE;
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /*
+ * Can't rely on the handled return value to say it was our NMI, two
+ * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+ *
+ * If the first NMI handles both, the latter will be empty and daze
+ * the CPU.
+ */
+ x86_pmu.handle_irq(regs);
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+ .notifier_call = perf_counter_nmi_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_counter,
+ .disable = intel_pmu_disable_counter,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic counter period:
+ */
+ .max_period = (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_counter,
+ .disable = amd_pmu_disable_counter,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .event_map = amd_pmu_event_map,
+ .raw_event = amd_pmu_raw_event,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_counters = 4,
+ .counter_bits = 48,
+ .counter_mask = (1ULL << 48) - 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+};
+
+static int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+ int version;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return -ENODEV;
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired Event or not.
+ */
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ return -ENODEV;
+
+ x86_pmu = intel_pmu;
+ x86_pmu.version = version;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.counter_bits = eax.split.bit_width;
+ x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose counters, so
+ * assume at least 3 counters:
+ */
+ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 29: /* six-core 45 nm xeon "Dunnington" */
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Core2 events, ");
+ break;
+ default:
+ case 26:
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Nehalem/Corei7 events, ");
+ break;
+ case 28:
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Atom events, ");
+ break;
+ }
+ return 0;
+}
+
+static int amd_pmu_init(void)
+{
+ x86_pmu = amd_pmu;
+
+ switch (boot_cpu_data.x86) {
+ case 0x0f:
+ case 0x10:
+ case 0x11:
+ memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("AMD Family 0f/10/11 events, ");
+ break;
+ }
+ return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+ int err;
+
+ pr_info("Performance Counters: ");
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_pmu_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_pmu_init();
+ break;
+ default:
+ return;
+ }
+ if (err != 0) {
+ pr_cont("no PMU driver, software counters only.\n");
+ return;
+ }
+
+ pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+ if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+ x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+ WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+ x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+ }
+ perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+ perf_max_counters = x86_pmu.num_counters;
+
+ if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+ x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+ WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+ }
+
+ perf_counter_mask |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+
+ perf_counters_lapic_init();
+ register_die_notifier(&perf_counter_nmi_notifier);
+
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.counter_bits);
+ pr_info("... generic counters: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... counter mask: %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+ .enable = x86_pmu_enable,
+ .disable = x86_pmu_disable,
+ .read = x86_pmu_read,
+ .unthrottle = x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ int err;
+
+ err = __hw_perf_counter_init(counter);
+ if (err)
+ return ERR_PTR(err);
+
+ return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+ if (entry->nr < MAX_STACK_DEPTH)
+ entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ /* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+ /* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+ /* Don't bother with IRQ stacks for now */
+ return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+ struct perf_callchain_entry *entry = data;
+
+ if (reliable)
+ callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+ .warning = backtrace_warning,
+ .warning_symbol = backtrace_warning_symbol,
+ .stack = backtrace_stack,
+ .address = backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ unsigned long bp;
+ char *stack;
+ int nr = entry->nr;
+
+ callchain_store(entry, instruction_pointer(regs));
+
+ stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+ bp = frame_pointer(regs);
+#else
+ bp = 0;
+#endif
+
+ dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+ entry->kernel = entry->nr - nr;
+}
+
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ struct stack_frame frame;
+ const void __user *fp;
+ int nr = entry->nr;
+
+ regs = (struct pt_regs *)current->thread.sp0 - 1;
+ fp = (void __user *)regs->bp;
+
+ callchain_store(entry, regs->ip);
+
+ while (entry->nr < MAX_STACK_DEPTH) {
+ frame.next_fp = NULL;
+ frame.return_address = 0;
+
+ if (!copy_stack_frame(fp, &frame))
+ break;
+
+ if ((unsigned long)fp < user_stack_pointer(regs))
+ break;
+
+ callchain_store(entry, frame.return_address);
+ fp = frame.next_fp;
+ }
+
+ entry->user = entry->nr - nr;
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ int is_user;
+
+ if (!regs)
+ return;
+
+ is_user = user_mode(regs);
+
+ if (!current || current->pid == 0)
+ return;
+
+ if (is_user && current->state != TASK_RUNNING)
+ return;
+
+ if (!is_user)
+ perf_callchain_kernel(regs, entry);
+
+ if (current->mm)
+ perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ struct perf_callchain_entry *entry;
+
+ if (in_nmi())
+ entry = &__get_cpu_var(nmi_entry);
+ else
+ entry = &__get_cpu_var(irq_entry);
+
+ entry->nr = 0;
+ entry->hv = 0;
+ entry->kernel = 0;
+ entry->user = 0;
+
+ perf_do_callchain(regs, entry);
+
+ return entry;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e3..d6f5b9fbde3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
#include <linux/nmi.h>
#include <linux/kprobes.h>
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
struct nmi_watchdog_ctlblk {
unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1c17d7c751a..a4742a340d8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1012,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PENDING_VECTOR \
+ perf_pending_interrupt smp_perf_pending_interrupt
+#endif
+
/*
* Exception entry points.
*/
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9a391bbb8ba..38287b5f116 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -62,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "CNT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_printf(p, " Performance counter interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PND");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+ seq_printf(p, " Performance pending work\n");
#endif
if (generic_interrupt_extension) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -165,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
+ sum += irq_stats(cpu)->apic_pending_irqs;
#endif
if (generic_interrupt_extension)
sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 2e08b10ad51..267c6624c77 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -181,10 +181,15 @@ static void __init apic_intr_init(void)
{
smp_intr_init();
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_THERMAL_VECTOR
alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#ifdef CONFIG_X86_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+ alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
@@ -199,18 +204,10 @@ static void __init apic_intr_init(void)
/* Performance monitoring interrupts: */
# ifdef CONFIG_PERF_COUNTERS
- alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
# endif
#endif
-
-#ifdef CONFIG_X86_32
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
- /* thermal monitor LVT interrupt */
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#endif
}
/**
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6551dedee20..a78ecad0c90 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <asm/timer.h>
#define MMU_QUEUE_SIZE 1024
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
}
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
}
void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e..0a813b17b17 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
-
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index f6db48c405b..28f5fb495a6 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ /*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
}
void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 734f92c02dd..d51321ddafd 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,3 +335,4 @@ ENTRY(sys_call_table)
.long sys_preadv
.long sys_pwritev
.long sys_rt_tgsigqueueinfo /* 335 */
+ .long sys_perf_counter_open
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ede024531f8..07d60c870ce 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -942,8 +942,13 @@ void __init trap_init(void)
#endif
set_intr_gate(19, &simd_coprocessor_error);
+ /* Reserve all the builtin and the syscall vector: */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, used_vectors);
+
#ifdef CONFIG_IA32_EMULATION
set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
@@ -960,14 +965,9 @@ void __init trap_init(void)
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+ set_bit(SYSCALL_VECTOR, used_vectors);
#endif
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-
/*
* Should be a barrier for any external CPU state:
*/
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78c..8600a09e0c6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
Provides support for KVM on Intel processors equipped with the VT
extensions.
+ To compile this as a module, choose M here: the module
+ will be called kvm-intel.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
+ To compile this as a module, choose M here: the module
+ will be called kvm-amd.
+
config KVM_TRACE
bool "KVM trace support"
depends on KVM && SYSFS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f..b43c4efafe8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o
+ i8254.o timer.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d315..4d6f0d293ee 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel)
return kvm->arch.vpit->pit_state.channels[channel].gate;
}
+static s64 __kpit_elapsed(struct kvm *kvm)
+{
+ s64 elapsed;
+ ktime_t remaining;
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+ remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
+ elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->pit_timer.period);
+
+ return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(kvm);
+
+ return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
static int pit_get_count(struct kvm *kvm, int channel)
{
struct kvm_kpit_channel_state *c =
@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
- t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
- t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
-static int __pit_timer_fn(struct kvm_kpit_state *ps)
-{
- struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
- struct kvm_kpit_timer *pt = &ps->pit_timer;
-
- if (!atomic_inc_and_test(&pt->pending))
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-
- if (!pt->reinject)
- atomic_set(&pt->pending, 1);
-
- if (vcpu0 && waitqueue_active(&vcpu0->wq))
- wake_up_interruptible(&vcpu0->wq);
-
- hrtimer_add_expires_ns(&pt->timer, pt->period);
- pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
- if (pt->period)
- ps->channels[0].count_load_time = ktime_get();
-
- return (pt->period == 0 ? 0 : 1);
-}
-
int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
spin_unlock(&ps->inject_lock);
}
-static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
-{
- struct kvm_kpit_state *ps;
- int restart_timer = 0;
-
- ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
-
- restart_timer = __pit_timer_fn(ps);
-
- if (restart_timer)
- return HRTIMER_RESTART;
- else
- return HRTIMER_NORESTART;
-}
-
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
-static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+static void destroy_pit_timer(struct kvm_timer *pt)
{
pr_debug("pit: execute del timer!\n");
hrtimer_cancel(&pt->timer);
}
+static bool kpit_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
+ pit_timer);
+ return ps->is_periodic;
+}
+
+static struct kvm_timer_ops kpit_ops = {
+ .is_periodic = kpit_is_periodic,
+};
+
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
- struct kvm_kpit_timer *pt = &ps->pit_timer;
+ struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
- pt->period = (is_period == 0) ? 0 : interval;
- pt->timer.function = pit_timer_fn;
+ pt->period = interval;
+ ps->is_periodic = is_period;
+
+ pt->timer.function = kvm_timer_fn;
+ pt->t_ops = &kpit_ops;
+ pt->kvm = ps->pit->kvm;
+ pt->vcpu_id = 0;
+
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
/*
- * Though spec said the state of 8254 is undefined after power-up,
- * seems some tricky OS like Windows XP depends on IRQ0 interrupt
- * when booting up.
- * So here setting initialize rate for it, and not a specific number
+ * The largest possible initial count is 0; this is equivalent
+ * to 216 for binary counting and 104 for BCD counting.
*/
if (val == 0)
val = 0x10000;
- ps->channels[channel].count_load_time = ktime_get();
ps->channels[channel].count = val;
- if (channel != 0)
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = ktime_get();
return;
+ }
/* Two types of timer
* mode 1 is one shot, mode 2 is period, otherwise del timer */
switch (ps->channels[0].mode) {
+ case 0:
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d..bbd863ff60b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,15 +3,6 @@
#include "iodev.h"
-struct kvm_kpit_timer {
- struct hrtimer timer;
- int irq;
- s64 period; /* unit: ns */
- s64 scheduled;
- atomic_t pending;
- bool reinject;
-};
-
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
- struct kvm_kpit_timer pit_timer;
+ struct kvm_timer pit_timer;
+ bool is_periodic;
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6f..96dfbb6ad2a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
#include "irq.h"
#include "i8254.h"
+#include "x86.h"
/*
* check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
struct kvm_pic *s;
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.pending;
+
if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm); /* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
struct kvm_pic *s;
int vector;
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.nr;
+
vector = kvm_get_apic_interrupt(v); /* APIC */
if (vector == -1) {
if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 00000000000..26bd6ba74e1
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm_timer_ops *t_ops;
+ struct kvm *kvm;
+ int vcpu_id;
+};
+
+struct kvm_timer_ops {
+ bool (*is_periodic)(struct kvm_timer *);
+};
+
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd6..ae99d83f81a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic_test_and_set_irr(vec, apic)) {
- /* a new pending irq is set in IRR */
- if (trig)
- apic_set_vector(vec, apic->regs + APIC_TMR);
- else
- apic_clear_vector(vec, apic->regs + APIC_TMR);
- kvm_vcpu_kick(apic->vcpu);
- return 1;
- }
- return 0;
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode);
}
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
{
- return kvm_apic_id(apic) == dest;
+ return dest == 0xff || kvm_apic_id(apic) == dest;
}
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
return result;
}
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, int dest, int dest_mode)
{
int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
- "dest_mode 0x%x, short_hand 0x%x",
+ "dest_mode 0x%x, short_hand 0x%x\n",
target, source, dest, dest_mode, short_hand);
ASSERT(!target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
- if (dest_mode == 0) {
+ if (dest_mode == 0)
/* Physical mode. */
- if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
- result = 1;
- } else
+ result = kvm_apic_match_physical_addr(target, dest);
+ else
/* Logical mode. */
result = kvm_apic_match_logical_addr(target, dest);
break;
case APIC_DEST_SELF:
- if (target == source)
- result = 1;
+ result = (target == source);
break;
case APIC_DEST_ALLINC:
result = 1;
break;
case APIC_DEST_ALLBUT:
- if (target != source)
- result = 1;
+ result = (target != source);
break;
default:
printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode)
{
- int orig_irr, result = 0;
+ int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
switch (delivery_mode) {
- case APIC_DM_FIXED:
case APIC_DM_LOWEST:
+ vcpu->arch.apic_arb_prio++;
+ case APIC_DM_FIXED:
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
- orig_irr = apic_test_and_set_irr(vector, apic);
- if (orig_irr && trig_mode) {
- apic_debug("level trig mode repeatedly for vector %d",
- vector);
+ result = !apic_test_and_set_irr(vector, apic);
+ if (!result) {
+ if (trig_mode)
+ apic_debug("level trig mode repeatedly for "
+ "vector %d", vector);
break;
}
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic_set_vector(vector, apic->regs + APIC_TMR);
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
-
kvm_vcpu_kick(vcpu);
-
- result = (orig_irr == 0);
break;
case APIC_DM_REMRD:
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_NMI:
+ result = 1;
kvm_inject_nmi(vcpu);
kvm_vcpu_kick(vcpu);
break;
case APIC_DM_INIT:
if (level) {
+ result = 1;
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
printk(KERN_DEBUG
"INIT on a runnable vcpu %d\n",
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic_debug("SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ result = 1;
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
kvm_vcpu_kick(vcpu);
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
-static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
- unsigned long bitmap)
-{
- int last;
- int next;
- struct kvm_lapic *apic = NULL;
-
- last = kvm->arch.round_robin_prev_vcpu;
- next = last;
-
- do {
- if (++next == KVM_MAX_VCPUS)
- next = 0;
- if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
- continue;
- apic = kvm->vcpus[next]->arch.apic;
- if (apic && apic_enabled(apic))
- break;
- apic = NULL;
- } while (next != last);
- kvm->arch.round_robin_prev_vcpu = next;
-
- if (!apic)
- printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-
- return apic;
-}
-
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
- unsigned long bitmap)
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
{
- struct kvm_lapic *apic;
-
- apic = kvm_apic_round_robin(kvm, vector, bitmap);
- if (apic)
- return apic->vcpu;
- return NULL;
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
static void apic_set_eoi(struct kvm_lapic *apic)
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
{
u32 icr_low = apic_get_reg(apic, APIC_ICR);
u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+ struct kvm_lapic_irq irq;
- unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
- unsigned int short_hand = icr_low & APIC_SHORT_MASK;
- unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
- unsigned int level = icr_low & APIC_INT_ASSERT;
- unsigned int dest_mode = icr_low & APIC_DEST_MASK;
- unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
- unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
- struct kvm_vcpu *target;
- struct kvm_vcpu *vcpu;
- unsigned long lpr_map = 0;
- int i;
+ irq.vector = icr_low & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr_low & APIC_MODE_MASK;
+ irq.dest_mode = icr_low & APIC_DEST_MASK;
+ irq.level = icr_low & APIC_INT_ASSERT;
+ irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
"dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
- icr_high, icr_low, short_hand, dest,
- trig_mode, level, dest_mode, delivery_mode, vector);
-
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
- vcpu = apic->vcpu->kvm->vcpus[i];
- if (!vcpu)
- continue;
-
- if (vcpu->arch.apic &&
- apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
- if (delivery_mode == APIC_DM_LOWEST)
- set_bit(vcpu->vcpu_id, &lpr_map);
- else
- __apic_accept_irq(vcpu->arch.apic, delivery_mode,
- vector, level, trig_mode);
- }
- }
+ icr_high, icr_low, irq.shorthand, irq.dest_id,
+ irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+ irq.vector);
- if (delivery_mode == APIC_DM_LOWEST) {
- target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
- if (target != NULL)
- __apic_accept_irq(target->arch.apic, delivery_mode,
- vector, level, trig_mode);
- }
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
}
static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
if (apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
- remaining = hrtimer_expires_remaining(&apic->timer.dev);
+ remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
if (ktime_to_ns(remaining) < 0)
remaining = ktime_set(0, 0);
- ns = mod_64(ktime_to_ns(remaining), apic->timer.period);
- tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+ ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+ tmcct = div64_u64(ns,
+ (APIC_BUS_CYCLE_NS * apic->divide_count));
return tmcct;
}
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic)
tdcr = apic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
- apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+ apic->divide_count = 0x1 << (tmp2 & 0x7);
apic_debug("timer divide count is 0x%x\n",
- apic->timer.divide_count);
+ apic->divide_count);
}
static void start_apic_timer(struct kvm_lapic *apic)
{
- ktime_t now = apic->timer.dev.base->get_time();
+ ktime_t now = apic->lapic_timer.timer.base->get_time();
- apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
- APIC_BUS_CYCLE_NS * apic->timer.divide_count;
- atomic_set(&apic->timer.pending, 0);
+ apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
+ APIC_BUS_CYCLE_NS * apic->divide_count;
+ atomic_set(&apic->lapic_timer.pending, 0);
- if (!apic->timer.period)
+ if (!apic->lapic_timer.period)
return;
- hrtimer_start(&apic->timer.dev,
- ktime_add_ns(now, apic->timer.period),
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, apic->lapic_timer.period),
HRTIMER_MODE_ABS);
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
"expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
apic_get_reg(apic, APIC_TMICT),
- apic->timer.period,
+ apic->lapic_timer.period,
ktime_to_ns(ktime_add_ns(now,
- apic->timer.period)));
+ apic->lapic_timer.period)));
}
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
}
- atomic_set(&apic->timer.pending, 0);
+ atomic_set(&apic->lapic_timer.pending, 0);
}
break;
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_TMICT:
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
return;
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&vcpu->arch.apic->timer.dev);
+ hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
if (vcpu->arch.apic->regs_page)
__free_page(vcpu->arch.apic->regs_page);
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
ASSERT(apic != NULL);
/* Stop the timer in case it's a reset to an active apic */
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
update_divide_count(apic);
- atomic_set(&apic->timer.pending, 0);
+ atomic_set(&apic->lapic_timer.pending, 0);
if (vcpu->vcpu_id == 0)
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
+ vcpu->arch.apic_arb_prio = 0;
+
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_reset);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int ret = 0;
-
- if (!apic)
- return 0;
- ret = apic_enabled(apic);
+ return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
+}
- return ret;
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
*----------------------------------------------------------------------
*/
-/* TODO: make sure __apic_timer_fn runs in current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
+static bool lapic_is_periodic(struct kvm_timer *ktimer)
{
- int result = 0;
- wait_queue_head_t *q = &apic->vcpu->wq;
-
- if(!atomic_inc_and_test(&apic->timer.pending))
- set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (apic_lvtt_period(apic)) {
- result = 1;
- hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
- }
- return result;
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
+ lapic_timer);
+ return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
struct kvm_lapic *lapic = vcpu->arch.apic;
if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->timer.pending);
+ return atomic_read(&lapic->lapic_timer.pending);
return 0;
}
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
- struct kvm_lapic *apic;
- int restart_timer = 0;
-
- apic = container_of(data, struct kvm_lapic, timer.dev);
-
- restart_timer = __apic_timer_fn(apic);
-
- if (restart_timer)
- return HRTIMER_RESTART;
- else
- return HRTIMER_NORESTART;
-}
+static struct kvm_timer_ops lapic_timer_ops = {
+ .is_periodic = lapic_is_periodic,
+};
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
- hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- apic->timer.dev.function = apic_timer_fn;
+ hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ apic->lapic_timer.timer.function = kvm_timer_fn;
+ apic->lapic_timer.t_ops = &lapic_timer_ops;
+ apic->lapic_timer.kvm = vcpu->kvm;
+ apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->timer.pending) > 0) {
+ if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->timer.pending);
+ atomic_dec(&apic->lapic_timer.pending);
}
}
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
MSR_IA32_APICBASE_BASE;
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
start_apic_timer(apic);
}
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
if (!apic)
return;
- timer = &apic->timer.dev;
+ timer = &apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee7120..a587f8349c4 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
#define __KVM_X86_LAPIC_H
#include "iodev.h"
+#include "kvm_timer.h"
#include <linux/kvm_host.h>
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
- struct {
- atomic_t pending;
- s64 period; /* unit: ns */
- u32 divide_count;
- struct hrtimer dev;
- } timer;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
struct kvm_vcpu *vcpu;
struct page *regs_page;
void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e5728..5c3d6e81a7d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
#define PT_DIRECTORY_LEVEL 2
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mt_mask;
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
+}
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
- shadow_mt_mask = mt_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
return vcpu->arch.shadow_efer & EFER_NX;
}
-static int is_present_pte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
static int is_shadow_present_pte(u64 pte)
{
return pte != shadow_trap_nonpresent_pte
@@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
-static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- list_del(&sp->oos_link);
- --kvm->stat.mmu_unsync_global;
-}
-
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
sp->unsync = 0;
- if (sp->global)
- kvm_unlink_unsync_global(kvm, sp);
--kvm->stat.mmu_unsync;
}
@@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
- sp->global = 0;
hlist_add_head(&sp->hash_link, bucket);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
return mtrr_state->def_type;
}
-static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u8 mtrr;
@@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
mtrr = MTRR_TYPE_WRBACK;
return mtrr;
}
+EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
- if (sp->global) {
- list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
- ++vcpu->kvm->stat.mmu_unsync_global;
- } else
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
mmu_convert_notrap(sp);
return 0;
@@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
- int global, gfn_t gfn, pfn_t pfn, bool speculative,
+ gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
u64 spte;
int ret = 0;
- u64 mt_mask = shadow_mt_mask;
- struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
-
- if (!global && sp->global) {
- sp->global = 0;
- if (sp->unsync) {
- kvm_unlink_unsync_global(vcpu->kvm, sp);
- kvm_mmu_mark_parents_unsync(vcpu, sp);
- }
- }
/*
* We don't set the accessed bit, since we sometimes want to see
@@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= shadow_user_mask;
if (largepage)
spte |= PT_PAGE_SIZE_MASK;
- if (mt_mask) {
- if (!kvm_is_mmio_pfn(pfn)) {
- mt_mask = get_memory_type(vcpu, gfn) <<
- kvm_x86_ops->get_mt_mask_shift();
- mt_mask |= VMX_EPT_IGMT_BIT;
- } else
- mt_mask = MTRR_TYPE_UNCACHABLE <<
- kvm_x86_ops->get_mt_mask_shift();
- spte |= mt_mask;
- }
+ if (tdp_enabled)
+ spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
spte |= (u64)pfn << PAGE_SHIFT;
@@ -1765,8 +1735,8 @@ set_pte:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, int global,
- gfn_t gfn, pfn_t pfn, bool speculative)
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
was_rmapped = 1;
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, global, gfn, pfn, speculative, true)) {
+ dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
|| (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- largepage, 0, gfn, pfn, false);
+ largepage, gfn, pfn, false);
++vcpu->stat.pf_fixed;
break;
}
@@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+ int ret = 0;
+
+ if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
int i;
gfn_t root_gfn;
@@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
direct = 1;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
- return;
+ return 0;
}
direct = !is_paging(vcpu);
if (tdp_enabled)
@@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ return 0;
}
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- if (root) {
+ if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
-static void mmu_sync_global(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_page *sp, *n;
-
- list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
- kvm_sync_page(vcpu, sp);
-}
-
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->kvm->mmu_lock);
@@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_global(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
+static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+{
+ int bit7;
+
+ bit7 = (gpte >> 7) & 1;
+ return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu)
#include "paging_tmpl.h"
#undef PTTYPE
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ u64 exb_bit_rsvd = 0;
+
+ if (!is_nx(vcpu))
+ exb_bit_rsvd = rsvd_bits(63, 63);
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ context->rsvd_bits_mask[0][1] = 0;
+ context->rsvd_bits_mask[0][0] = 0;
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ context->rsvd_bits_mask[1][0] = ~0ull;
+ break;
+ case PT32E_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 63) |
+ rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PDE */
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PTE */
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = ~0ull;
+ break;
+ case PT64_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = ~0ull;
+ break;
+ }
+}
+
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}
@@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
@@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT64_ROOT_LEVEL;
} else if (is_pae(vcpu)) {
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT32E_ROOT_LEVEL;
} else {
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->gva_to_gpa = paging32_gva_to_gpa;
context->root_level = PT32_ROOT_LEVEL;
}
@@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
goto out;
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- mmu_alloc_roots(vcpu);
+ r = mmu_alloc_roots(vcpu);
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
+ if (r)
+ goto out;
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
@@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_page *sp;
-
- while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
- cond_resched();
- }
free_page((unsigned long)vcpu->arch.mmu.pae_root);
}
@@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
- spin_lock(&kvm->mmu_lock);
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
u64 *pt;
@@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
- spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_zap_all(struct kvm *kvm)
@@ -3007,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
" in nonleaf level: levels %d gva %lx"
" level %d pte %llx\n", audit_msg,
vcpu->arch.mmu.root_level, va, level, ent);
-
- audit_mappings_page(vcpu, ent, va, level - 1);
+ else
+ audit_mappings_page(vcpu, ent, va, level - 1);
} else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
- hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62..3494a2fb136 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu->arch.cr0 & X86_CR0_PG;
}
+static inline int is_present_pte(unsigned long pte)
+{
+ return pte & PT_PRESENT_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c56..258e4591e1c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
gfn_t table_gfn;
unsigned index, pt_access, pte_access;
gpa_t pte_gpa;
+ int rsvd_fault = 0;
pgprintk("%s: addr %lx\n", __func__, addr);
walk:
@@ -157,6 +158,10 @@ walk:
if (!is_present_pte(pte))
goto not_present;
+ rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
+ if (rsvd_fault)
+ goto access_error;
+
if (write_fault && !is_writeble_pte(pte))
if (user_fault || is_write_protection(vcpu))
goto access_error;
@@ -209,7 +214,6 @@ walk:
if (ret)
goto walk;
pte |= PT_DIRTY_MASK;
- kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
walker->ptes[walker->level - 1] = pte;
}
@@ -233,6 +237,8 @@ err:
walker->error_code |= PFERR_USER_MASK;
if (fetch_fault)
walker->error_code |= PFERR_FETCH_MASK;
+ if (rsvd_fault)
+ walker->error_code |= PFERR_RSVD_MASK;
return 0;
}
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage,
- gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
- pfn, true);
+ gpte_to_gfn(gpte), pfn, true);
}
/*
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
user_fault, write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
ptwrite, largepage,
- gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
gw->gfn, pfn, false);
break;
}
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return r;
/*
- * Look up the shadow pte for the faulting address.
+ * Look up the guest pte for the faulting address.
*/
r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
+ is_dirty_pte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6..71510e07e69 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/module.h>
#include <linux/kernel.h>
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO);
static int nested = 0;
module_param(nested, int, S_IRUGO);
-static void kvm_reput_irq(struct vcpu_svm *svm);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat)
return svm_features & feat;
}
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
- int word_index = __ffs(vcpu->arch.irq_summary);
- int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
- if (!vcpu->arch.irq_pending[word_index])
- clear_bit(word_index, &vcpu->arch.irq_summary);
- return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
- set_bit(irq, vcpu->arch.irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
svm->vmcb->control.event_inj_err = error_code;
}
-static bool svm_exception_injected(struct kvm_vcpu *vcpu)
+static int is_external_interrupt(u32 info)
+{
+ info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+ return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
- return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+ return ret & mask;
}
-static int is_external_interrupt(u32 info)
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
}
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!svm->next_rip) {
- printk(KERN_DEBUG "%s: NOP\n", __func__);
+ if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+ EMULATE_DONE)
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
__func__, kvm_rip_read(vcpu), svm->next_rip);
kvm_rip_write(vcpu, svm->next_rip);
- svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
- vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ svm_set_interrupt_shadow(vcpu, 0);
}
static int has_svm(void)
@@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
if (!var->unusable)
var->type |= 0x1;
break;
+ case VCPU_SREG_SS:
+ /* On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ break;
}
}
@@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
}
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void update_db_intercept(struct kvm_vcpu *vcpu)
{
- int old_debug = vcpu->guest_debug;
struct vcpu_svm *svm = to_svm(vcpu);
- vcpu->guest_debug = dbg->control;
-
svm->vmcb->control.intercept_exceptions &=
~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+
+ if (vcpu->arch.singlestep)
+ svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1 << BP_VECTOR;
} else
vcpu->guest_debug = 0;
+}
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+ int old_debug = vcpu->guest_debug;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+
+ update_db_intercept(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
return 0;
}
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
- if (is_external_interrupt(exit_int_info))
- return exit_int_info & SVM_EVTINJ_VEC_MASK;
- return -1;
-}
-
static void load_host_msrs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- u32 exit_int_info = svm->vmcb->control.exit_int_info;
- struct kvm *kvm = svm->vcpu.kvm;
u64 fault_address;
u32 error_code;
- bool event_injection = false;
-
- if (!irqchip_in_kernel(kvm) &&
- is_external_interrupt(exit_int_info)) {
- event_injection = true;
- push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
- }
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
*/
if (npt_enabled)
svm_flush_tlb(&svm->vcpu);
-
- if (!npt_enabled && event_injection)
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ else {
+ if (kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ }
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
if (!(svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->vcpu.arch.singlestep) {
kvm_queue_exception(&svm->vcpu, DB_VECTOR);
return 1;
}
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- return 0;
+
+ if (svm->vcpu.arch.singlestep) {
+ svm->vcpu.arch.singlestep = false;
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
+ svm->vmcb->save.rflags &=
+ ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(&svm->vcpu);
+ }
+
+ if (svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
}
static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
if (svm->vmcb->control.exit_info_2 &
(1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
- return kvm_task_switch(&svm->vcpu, tss_selector,
- TASK_SWITCH_IRET);
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
- return kvm_task_switch(&svm->vcpu, tss_selector,
- TASK_SWITCH_JMP);
- return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ kvm_clear_exception_queue(&svm->vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_clear_interrupt_queue(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+ skip_emulated_instruction(&svm->vcpu);
+
+ return kvm_task_switch(&svm->vcpu, tss_selector, reason);
}
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ ++svm->vcpu.stat.nmi_window_exits;
+ svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+ svm->vcpu.arch.hflags |= HF_IRET_MASK;
+ return 1;
+}
+
static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm,
static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
+ u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
- if (irqchip_in_kernel(svm->vcpu.kvm))
+ if (irqchip_in_kernel(svm->vcpu.kvm)) {
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ return 1;
+ }
+ if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return 1;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
* If the user space waits to inject interrupts, exit as soon as
* possible
*/
- if (kvm_run->request_interrupt_window &&
- !svm->vcpu.arch.irq_summary) {
+ if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+ kvm_run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(&svm->vcpu)) {
++svm->vcpu.stat.irq_window_exits;
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_VINTR] = interrupt_window_interception,
/* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
[SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
}
- kvm_reput_irq(svm);
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF)
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
"exit_code 0x%x\n",
__func__, svm->vmcb->control.exit_int_info,
@@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm)
new_asid(svm, svm_data);
}
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+ ++vcpu->stat.nmi_injections;
+}
static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
}
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
-
- svm_inject_irq(svm, irq);
+ svm->vmcb->control.event_inj = nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+static void svm_set_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int max_irr, tpr;
- if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
- return;
+ nested_svm_intr(svm);
- vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
+}
- max_irr = kvm_lapic_find_highest_irr(vcpu);
- if (max_irr == -1)
- return;
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
- tpr = kvm_lapic_get_cr8(vcpu) << 4;
+ if (irr == -1)
+ return;
- if (tpr >= (max_irr & 0xf0))
- vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+ if (tpr >= irr)
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
}
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- int intr_vector = -1;
-
- if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
- ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
- intr_vector = vmcb->control.exit_int_info &
- SVM_EVTINJ_VEC_MASK;
- vmcb->control.exit_int_info = 0;
- svm_inject_irq(svm, intr_vector);
- goto out;
- }
-
- if (vmcb->control.int_ctl & V_IRQ_MASK)
- goto out;
-
- if (!kvm_cpu_has_interrupt(vcpu))
- goto out;
-
- if (nested_svm_intr(svm))
- goto out;
-
- if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
- goto out;
-
- if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
- (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
- /* unable to deliver irq, set pending irq */
- svm_set_vintr(svm);
- svm_inject_irq(svm, 0x0);
- goto out;
- }
- /* Okay, we can deliver the interrupt: grab it and update PIC state. */
- intr_vector = kvm_cpu_get_interrupt(vcpu);
- svm_inject_irq(svm, intr_vector);
-out:
- update_cr8_intercept(vcpu);
+ return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
}
-static void kvm_reput_irq(struct vcpu_svm *svm)
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- if ((control->int_ctl & V_IRQ_MASK)
- && !irqchip_in_kernel(svm->vcpu.kvm)) {
- control->int_ctl &= ~V_IRQ_MASK;
- push_irq(&svm->vcpu, control->int_vector);
- }
-
- svm->vcpu.arch.interrupt_window_open =
- !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ return (vmcb->save.rflags & X86_EFLAGS_IF) &&
+ !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ (svm->vcpu.arch.hflags & HF_GIF_MASK);
}
-static void svm_do_inject_vector(struct vcpu_svm *svm)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int word_index = __ffs(vcpu->arch.irq_summary);
- int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
- if (!vcpu->arch.irq_pending[word_index])
- clear_bit(word_index, &vcpu->arch.irq_summary);
- svm_inject_irq(svm, irq);
+ svm_set_vintr(to_svm(vcpu));
+ svm_inject_irq(to_svm(vcpu), 0x0);
}
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- if (nested_svm_intr(svm))
- return;
- svm->vcpu.arch.interrupt_window_open =
- (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK));
+ if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+ == HF_NMI_MASK)
+ return; /* IRET will cause a vm exit */
- if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
- /*
- * If interrupts enabled, and not blocked by sti or mov ss. Good.
- */
- svm_do_inject_vector(svm);
-
- /*
- * Interrupts blocked. Wait for unblock.
- */
- if (!svm->vcpu.arch.interrupt_window_open &&
- (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
- svm_set_vintr(svm);
- else
- svm_clear_vintr(svm);
+ /* Something prevents NMI from been injected. Single step over
+ possible problem (IRET or exception injection or interrupt
+ shadow) */
+ vcpu->arch.singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(vcpu);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
- kvm_lapic_set_tpr(vcpu, cr8);
+ kvm_set_cr8(vcpu, cr8);
}
}
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
cr8 = kvm_get_cr8(vcpu);
svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+
+ if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+ svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = true;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /* In case of software exception do not reinject an exception
+ vector, but re-execute and instruction instead */
+ if (kvm_exception_is_soft(vector))
+ break;
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_queue_exception_e(&svm->vcpu, vector, err);
+
+ } else
+ kvm_queue_exception(&svm->vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(&svm->vcpu, vector, false);
+ break;
+ default:
+ break;
+ }
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#else
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
+
+ svm_complete_interrupts(svm);
}
#undef R
@@ -2617,7 +2668,7 @@ static int get_npt_level(void)
#endif
}
-static int svm_get_mt_mask_shift(void)
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
return 0;
}
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = {
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
- .get_irq = svm_get_irq,
.set_irq = svm_set_irq,
+ .set_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
- .exception_injected = svm_exception_injected,
- .inject_pending_irq = svm_intr_assist,
- .inject_pending_vectors = do_interrupt_requests,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
- .get_mt_mask_shift = svm_get_mt_mask_shift,
+ .get_mt_mask = svm_get_mt_mask,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 00000000000..86dbac072d0
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/hrtimer.h>
+#include <asm/atomic.h>
+#include "kvm_timer.h"
+
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+ int restart_timer = 0;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /* FIXME: this code should not know anything about vcpus */
+ if (!atomic_inc_and_test(&ktimer->pending))
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+ if (!ktimer->reinject)
+ atomic_set(&ktimer->pending, 1);
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ restart_timer = 1;
+ }
+
+ return restart_timer;
+}
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+ int restart_timer;
+ struct kvm_vcpu *vcpu;
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+
+ vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+ if (!vcpu)
+ return HRTIMER_NORESTART;
+
+ restart_timer = __kvm_timer_fn(vcpu, ktimer);
+ if (restart_timer)
+ return HRTIMER_RESTART;
+ else
+ return HRTIMER_NORESTART;
+}
+
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716..32d6ae8fb60 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,26 +32,27 @@
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
+#include <asm/mce.h>
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
-static int bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, 0);
+static int __read_mostly bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, S_IRUGO);
-static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0);
+static int __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
-static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, 0);
+static int __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-static int enable_ept = 1;
-module_param(enable_ept, bool, 0);
+static int __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, S_IRUGO);
-static int emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, 0);
+static int __read_mostly emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
struct vmcs {
u32 revision_id;
@@ -97,6 +98,7 @@ struct vcpu_vmx {
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
+ u32 exit_reason;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-static struct page *vmx_msr_bitmap;
+static unsigned long *vmx_io_bitmap_a;
+static unsigned long *vmx_io_bitmap_b;
+static unsigned long *vmx_msr_bitmap_legacy;
+static unsigned long *vmx_msr_bitmap_longmode;
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info)
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
+static inline int is_machine_check(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
static inline int cpu_has_vmx_msr_bitmap(void)
{
- return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
static inline int cpu_has_vmx_tpr_shadow(void)
{
- return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
static inline int vm_need_tpr_shadow(struct kvm *kvm)
{
- return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
+ return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}
static inline int cpu_has_secondary_exec_ctrls(void)
{
- return (vmcs_config.cpu_based_exec_ctrl &
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
- return flexpriority_enabled
- && (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
}
static inline int cpu_has_vmx_invept_individual_addr(void)
{
- return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
}
static inline int cpu_has_vmx_invept_context(void)
{
- return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
}
static inline int cpu_has_vmx_invept_global(void)
{
- return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
}
static inline int cpu_has_vmx_ept(void)
{
- return (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_EPT);
-}
-
-static inline int vm_need_ept(void)
-{
- return (cpu_has_vmx_ept() && enable_ept);
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
}
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
- return ((cpu_has_vmx_virtualize_apic_accesses()) &&
- (irqchip_in_kernel(kvm)));
+ return flexpriority_enabled &&
+ (cpu_has_vmx_virtualize_apic_accesses()) &&
+ (irqchip_in_kernel(kvm));
}
static inline int cpu_has_vmx_vpid(void)
{
- return (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_VPID);
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
}
static inline int cpu_has_virtual_nmis(void)
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool report_flexpriority(void)
+{
+ return flexpriority_enabled;
+}
+
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void)
static inline void ept_sync_context(u64 eptp)
{
- if (vm_need_ept()) {
+ if (enable_ept) {
if (cpu_has_vmx_invept_context())
__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
else
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp)
static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
{
- if (vm_need_ept()) {
+ if (enable_ept) {
if (cpu_has_vmx_invept_individual_addr())
__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
eptp, gpa);
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
eb |= 1u << BP_VECTOR;
}
- if (vcpu->arch.rmode.active)
+ if (vcpu->arch.rmode.vm86_active)
eb = ~0;
- if (vm_need_ept())
+ if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->arch.rmode.active)
+ if (vcpu->arch.rmode.vm86_active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= X86_SHADOW_INT_MOV_SS;
+
+ return ret & mask;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ if (mask & X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+ if ((interruptibility != interruptibility_old))
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip;
- u32 interruptibility;
rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_rip_write(vcpu, rip);
- /*
- * We emulated an instruction, so temporary interrupt blocking
- * should be removed, if set.
- */
- interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- if (interruptibility & 3)
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- interruptibility & ~3);
- vcpu->arch.interrupt_window_open = 1;
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (vcpu->arch.rmode.active) {
+ if (vcpu->arch.rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
return;
}
- if (nr == BP_VECTOR || nr == OF_VECTOR) {
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ if (kvm_exception_is_soft(nr)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}
-static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs;
+ unsigned long *msr_bitmap;
vmx_load_host_state(vmx);
save_nmsrs = 0;
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
#endif
vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ if (is_long_mode(&vmx->vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy;
+
+ vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+ }
}
/*
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
return 0;
}
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->arch.interrupt.pending)
- return -1;
- return vcpu->arch.interrupt.nr;
-}
-
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
+ if (!cpu_has_vmx_vpid())
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept())
+ enable_ept = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
return alloc_kvm_area();
}
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.active = 0;
+ vcpu->arch.rmode.vm86_active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.active = 1;
+ vcpu->arch.rmode.vm86_active = 1;
vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
vpid_sync_vcpu_all(to_vmx(vcpu));
- if (vm_need_ept())
+ if (enable_ept)
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx_fpu_deactivate(vcpu);
- if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+ if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
- if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
+ if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
- if (vm_need_ept())
+ if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
u64 eptp;
guest_cr3 = cr3;
- if (vm_need_ept()) {
+ if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
ept_sync_context(eptp);
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+ unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
- if (vm_need_ept())
+ if (enable_ept)
ept_update_paging_mode_cr4(&hw_cr4, vcpu);
vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
- if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+ if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
vcpu->arch.rmode.tr.selector = var->selector;
vcpu->arch.rmode.tr.base = var->base;
vcpu->arch.rmode.tr.limit = var->limit;
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->arch.rmode.active && var->s) {
+ if (vcpu->arch.rmode.vm86_active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
pfn_t identity_map_pfn;
u32 tmp;
- if (!vm_need_ept())
+ if (!enable_ept)
return 1;
if (unlikely(!kvm->arch.ept_identity_pagetable)) {
printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
int vpid;
vmx->vpid = 0;
- if (!enable_vpid || !cpu_has_vmx_vpid())
+ if (!enable_vpid)
return;
spin_lock(&vmx_vpid_lock);
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
spin_unlock(&vmx_vpid_lock);
}
-static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
{
- void *va;
+ int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
* have the write-low and read-high bitmap offsets the wrong way round.
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
- va = kmap(msr_bitmap);
if (msr <= 0x1fff) {
- __clear_bit(msr, va + 0x000); /* read-low */
- __clear_bit(msr, va + 0x800); /* write-low */
+ __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
- __clear_bit(msr, va + 0x400); /* read-high */
- __clear_bit(msr, va + 0xc00); /* write-high */
+ __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
}
- kunmap(msr_bitmap);
+}
+
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+{
+ if (!longmode_only)
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
}
/*
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
u32 exec_control;
/* I/O */
- vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+ vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
+ vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
+ vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
CPU_BASED_CR8_LOAD_EXITING;
#endif
}
- if (!vm_need_ept())
+ if (!enable_ept)
exec_control |= CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_INVLPG_EXITING;
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!vm_need_ept())
+ if (!enable_ept)
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
goto out;
}
- vmx->vcpu.arch.rmode.active = 0;
+ vmx->vcpu.arch.rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
++vcpu->stat.irq_injections;
- if (vcpu->arch.rmode.active) {
+ if (vcpu->arch.rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
- if (vcpu->arch.rmode.active) {
+ if (vcpu->arch.rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = NMI_VECTOR;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
-static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
{
- u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-
- vcpu->arch.nmi_window_open =
- !(guest_intr & (GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_NMI));
if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
- vcpu->arch.nmi_window_open = 0;
-
- vcpu->arch.interrupt_window_open =
- ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- !(guest_intr & (GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS)));
-}
-
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
- int word_index = __ffs(vcpu->arch.irq_summary);
- int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
+ return 0;
- clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
- if (!vcpu->arch.irq_pending[word_index])
- clear_bit(word_index, &vcpu->arch.irq_summary);
- kvm_queue_interrupt(vcpu, irq);
+ return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
+ GUEST_INTR_STATE_NMI));
}
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- vmx_update_window_states(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS);
-
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.interrupt.pending) {
- enable_nmi_window(vcpu);
- } else if (vcpu->arch.nmi_window_open) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_nmi_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (vcpu->arch.irq_summary
- || kvm_run->request_interrupt_window)
- enable_irq_window(vcpu);
- return;
- }
-
- if (vcpu->arch.interrupt_window_open) {
- if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
- kvm_do_inject_irq(vcpu);
-
- if (vcpu->arch.interrupt.pending)
- vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
- }
- if (!vcpu->arch.interrupt_window_open &&
- (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
- enable_irq_window(vcpu);
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
return 0;
}
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ /* already handled by vcpu_run */
+ return 1;
+}
+
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vect_info = vmx->idt_vectoring_info;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ if (is_machine_check(intr_info))
+ return handle_machine_check(vcpu, kvm_run);
+
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
!is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
"intr info 0x%x\n", __func__, vect_info, intr_info);
- if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
- int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
- set_bit(irq, vcpu->arch.irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
- }
-
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
- if (vm_need_ept())
+ if (enable_ept)
BUG();
cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2 >> 32), handler);
- if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
- if (vcpu->arch.rmode.active &&
+ if (vcpu->arch.rmode.vm86_active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
if (vcpu->arch.halt_request) {
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
- case 8:
- kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
- skip_emulated_instruction(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- return 1;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = kvm_register_read(vcpu, reg);
+ kvm_set_cr8(vcpu, cr8);
+ skip_emulated_instruction(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
+ return 1;
+ if (cr8_prev <= cr8)
+ return 1;
+ kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
};
break;
case 2: /* clts */
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
* If the user space waits to inject interrupts, exit as soon as
* possible
*/
- if (kvm_run->request_interrupt_window &&
- !vcpu->arch.irq_summary) {
+ if (!irqchip_in_kernel(vcpu->kvm) &&
+ kvm_run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(vcpu)) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
}
@@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
kvm_mmu_invlpg(vcpu, exit_qualification);
skip_emulated_instruction(vcpu);
@@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u64 exit_qualification;
+ unsigned long exit_qualification;
enum emulation_result er;
unsigned long offset;
- exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
offset = exit_qualification & 0xffful;
er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
u16 tss_selector;
- int reason;
+ int reason, type, idt_v;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
reason = (u32)exit_qualification >> 30;
- if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
- (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
- == INTR_TYPE_NMI_INTR) {
- vcpu->arch.nmi_injected = false;
- if (cpu_has_virtual_nmis())
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ if (cpu_has_virtual_nmis())
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
}
tss_selector = exit_qualification;
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ skip_emulated_instruction(vcpu);
+
if (!kvm_task_switch(vcpu, tss_selector, reason))
return 0;
@@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u64 exit_qualification;
+ unsigned long exit_qualification;
gpa_t gpa;
int gla_validity;
- exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
if (exit_qualification & (1 << 6)) {
printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
- (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+ vmcs_readl(GUEST_LINEAR_ADDRESS));
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers =
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
- u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
- if (vm_need_ept() && is_paging(vcpu)) {
+ if (enable_ept && is_paging(vcpu)) {
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
ept_load_pdptrs(vcpu);
}
@@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
__func__, vectoring_info, exit_reason);
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
- if (vcpu->arch.interrupt_window_open) {
+ if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
- vcpu->arch.nmi_window_open = 1;
} else if (vmx->vnmi_blocked_time > 1000000000LL &&
vcpu->arch.nmi_pending) {
/*
@@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
"state on VCPU %d after 1 s timeout\n",
__func__, vcpu->vcpu_id);
vmx->soft_vnmi_blocked = 0;
- vmx->vcpu.arch.nmi_window_open = 1;
}
}
@@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return 0;
}
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
- int max_irr, tpr;
-
- if (!vm_need_tpr_shadow(vcpu->kvm))
- return;
-
- if (!kvm_lapic_enabled(vcpu) ||
- ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
+ if (irr == -1 || tpr < irr) {
vmcs_write32(TPR_THRESHOLD, 0);
return;
}
- tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
- vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
+ vmcs_write32(TPR_THRESHOLD, irr);
}
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
- u32 idt_vectoring_info;
+ u32 idt_vectoring_info = vmx->idt_vectoring_info;
bool unblock_nmi;
u8 vector;
int type;
bool idtv_info_valid;
- u32 error;
exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+ /* Handle machine checks before interrupts are enabled */
+ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
+ && is_machine_check(exit_intr_info)))
+ kvm_machine_check();
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+ (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+ asm("int $2");
+ }
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
if (cpu_has_virtual_nmis()) {
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
- * SDM 3: 25.7.1.2
+ * SDM 3: 27.7.1.2 (September 2008)
* Re-set bit "block by NMI" before VM entry if vmexit caused by
* a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
*/
- if (unblock_nmi && vector != DF_VECTOR)
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
- idt_vectoring_info = vmx->idt_vectoring_info;
- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+ vmx->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&vmx->vcpu);
+ kvm_clear_interrupt_queue(&vmx->vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
- if (vmx->vcpu.arch.nmi_injected) {
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vmx->vcpu.arch.nmi_injected = true;
/*
- * SDM 3: 25.7.1.2
- * Clear bit "block by NMI" before VM entry if a NMI delivery
- * faulted.
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Clear bit "block by NMI" before VM entry if a NMI
+ * delivery faulted.
*/
- if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->vcpu.arch.nmi_injected = false;
- }
- kvm_clear_exception_queue(&vmx->vcpu);
- if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
- type == INTR_TYPE_SOFT_EXCEPTION)) {
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ /* fall through */
+ case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
- kvm_queue_exception_e(&vmx->vcpu, vector, error);
+ u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ kvm_queue_exception_e(&vmx->vcpu, vector, err);
} else
kvm_queue_exception(&vmx->vcpu, vector);
- vmx->idt_vectoring_info = 0;
- }
- kvm_clear_interrupt_queue(&vmx->vcpu);
- if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
- kvm_queue_interrupt(&vmx->vcpu, vector);
- vmx->idt_vectoring_info = 0;
- }
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-{
- update_tpr_threshold(vcpu);
-
- vmx_update_window_states(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS);
-
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.interrupt.pending) {
- enable_nmi_window(vcpu);
- } else if (vcpu->arch.nmi_window_open) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_nmi_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
- return;
- }
- if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
- if (vcpu->arch.interrupt_window_open)
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
- else
- enable_irq_window(vcpu);
- }
- if (vcpu->arch.interrupt.pending) {
- vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
- if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ /* fall through */
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(&vmx->vcpu, vector,
+ type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
}
}
@@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
- vmx_update_window_states(vcpu);
-
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- /* We need to handle NMIs before interrupts are enabled */
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (intr_info & INTR_INFO_VALID_MASK)) {
- KVMTRACE_0D(NMI, vcpu, handler);
- asm("int $2");
- }
-
vmx_complete_interrupts(vmx);
}
@@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (alloc_apic_access_page(kvm) != 0)
goto free_vmcs;
- if (vm_need_ept())
+ if (enable_ept)
if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs;
@@ -3631,9 +3656,32 @@ static int get_ept_level(void)
return VMX_EPT_DEFAULT_GAW + 1;
}
-static int vmx_get_mt_mask_shift(void)
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- return VMX_EPT_MT_EPTE_SHIFT;
+ u64 ret;
+
+ /* For VT-d and EPT combination
+ * 1. MMIO: always map as UC
+ * 2. EPT with VT-d:
+ * a. VT-d without snooping control feature: can't guarantee the
+ * result, try to trust guest.
+ * b. VT-d with snooping control feature: snooping control feature of
+ * VT-d engine can guarantee the cache correctness. Just set it
+ * to WB to keep consistent with host. So the same as item 3.
+ * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+ * consistent with host MTRR
+ */
+ if (is_mmio)
+ ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+ else if (vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+ ret = kvm_get_guest_memory_type(vcpu, gfn) <<
+ VMX_EPT_MT_EPTE_SHIFT;
+ else
+ ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
+ | VMX_EPT_IGMT_BIT;
+
+ return ret;
}
static struct kvm_x86_ops vmx_x86_ops = {
@@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+ .cpu_has_accelerated_tpr = report_flexpriority,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
@@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = {
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
- .handle_exit = kvm_handle_exit,
+ .handle_exit = vmx_handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .get_irq = vmx_get_irq,
.set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
- .exception_injected = vmx_exception_injected,
- .inject_pending_irq = vmx_intr_assist,
- .inject_pending_vectors = do_interrupt_requests,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
- .get_mt_mask_shift = vmx_get_mt_mask_shift,
+ .get_mt_mask = vmx_get_mt_mask,
};
static int __init vmx_init(void)
{
- void *va;
int r;
- vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_io_bitmap_a)
return -ENOMEM;
- vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_io_bitmap_b) {
r = -ENOMEM;
goto out;
}
- vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!vmx_msr_bitmap) {
+ vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy) {
r = -ENOMEM;
goto out1;
}
+ vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode) {
+ r = -ENOMEM;
+ goto out2;
+ }
+
/*
* Allow direct access to the PC debug port (it is often used for I/O
* delays, but the vmexits simply slow things down).
*/
- va = kmap(vmx_io_bitmap_a);
- memset(va, 0xff, PAGE_SIZE);
- clear_bit(0x80, va);
- kunmap(vmx_io_bitmap_a);
+ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+ clear_bit(0x80, vmx_io_bitmap_a);
- va = kmap(vmx_io_bitmap_b);
- memset(va, 0xff, PAGE_SIZE);
- kunmap(vmx_io_bitmap_b);
+ memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
- va = kmap(vmx_msr_bitmap);
- memset(va, 0xff, PAGE_SIZE);
- kunmap(vmx_msr_bitmap);
+ memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+ memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
- goto out2;
+ goto out3;
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
+ vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- if (vm_need_ept()) {
+ if (enable_ept) {
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK,
- VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+ VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
} else
kvm_disable_tdp();
@@ -3761,20 +3813,23 @@ static int __init vmx_init(void)
return 0;
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
out2:
- __free_page(vmx_msr_bitmap);
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
out1:
- __free_page(vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_b);
out:
- __free_page(vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_io_bitmap_a);
return r;
}
static void __exit vmx_exit(void)
{
- __free_page(vmx_msr_bitmap);
- __free_page(vmx_io_bitmap_b);
- __free_page(vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+ free_page((unsigned long)vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_a);
kvm_exit();
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e917e79..249540f9851 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
- { "request_nmi", VCPU_STAT(request_nmi_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
- { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+ if (is_present_pte(pdpte[i]) &&
+ (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
return;
}
@@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
- kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer |= vcpu->arch.shadow_efer & EFER_LMA;
vcpu->arch.shadow_efer = efer;
+
+ vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
}
void kvm_enable_efer_bits(u64 mask)
@@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
+ unsigned long this_tsc_khz;
if ((!vcpu->time_page))
return;
- if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
- kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+ if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+ put_cpu_var(cpu_tsc_khz);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -893,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
case MSR_VM_HSAVE_PA:
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
data = 0;
break;
case MSR_MTRRcap:
@@ -1024,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_SYNC_MMU:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_ASSIGN_DEV_IRQ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1241,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}
+#define F(x) bit(X86_FEATURE_##x)
+
static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
- const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
- bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
- bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
- bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
- bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
- bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
- bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
- bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
- bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
- bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
- const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
- bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
- bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
- bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
- bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
- bit(X86_FEATURE_PGE) |
- bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
- bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
- bit(X86_FEATURE_SYSCALL) |
- (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
- bit(X86_FEATURE_LM) |
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_lm = 0;
#endif
- bit(X86_FEATURE_FXSR_OPT) |
- bit(X86_FEATURE_MMXEXT) |
- bit(X86_FEATURE_3DNOWEXT) |
- bit(X86_FEATURE_3DNOW);
- const u32 kvm_supported_word3_x86_features =
- bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+
+ /* cpuid 1.edx */
+ const u32 kvm_supported_word0_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_supported_word1_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_supported_word4_x86_features =
+ F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved, XSAVE, OSXSAVE */;
+ /* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
- bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
- bit(X86_FEATURE_SVM);
+ F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+ 0 /* SKINIT */ | 0 /* WDT */;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -1288,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
- entry->ecx &= kvm_supported_word3_x86_features;
+ entry->ecx &= kvm_supported_word4_x86_features;
break;
/* function 2 entries are STATEFUL. That is, repeated cpuid commands
* may return different values. This forces us to get_cpu() before
@@ -1350,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
put_cpu();
}
+#undef F
+
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries)
{
@@ -1421,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -ENXIO;
vcpu_load(vcpu);
- set_bit(irq->irq, vcpu->arch.irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+ kvm_queue_interrupt(vcpu, irq->irq, false);
vcpu_put(vcpu);
@@ -1584,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
out:
- if (lapic)
- kfree(lapic);
+ kfree(lapic);
return r;
}
@@ -1606,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return -EINVAL;
down_write(&kvm->slots_lock);
+ spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+ spin_unlock(&kvm->mmu_lock);
up_write(&kvm->slots_lock);
return 0;
}
@@ -1785,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
+ spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
memslot = &kvm->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -2360,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
u16 error_code,
int emulation_type)
{
- int r;
+ int r, shadow_mask;
struct decode_cache *c;
kvm_clear_exception_queue(vcpu);
@@ -2408,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
}
}
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+ return EMULATE_DONE;
+ }
+
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+ if (r == 0)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
if (vcpu->arch.pio.string)
return EMULATE_DO_MMIO;
@@ -2761,7 +2792,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+ PT_DIRTY_MASK, PT64_NX_MASK, 0);
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
@@ -3012,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
return best;
}
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+ return 36;
+}
+
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 function, index;
@@ -3048,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- return (!vcpu->arch.irq_summary &&
+ return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
kvm_run->request_interrupt_window &&
- vcpu->arch.interrupt_window_open &&
- (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+ kvm_arch_interrupt_allowed(vcpu));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3064,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
kvm_run->ready_for_interrupt_injection = 1;
else
kvm_run->ready_for_interrupt_injection =
- (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary == 0);
+ kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu);
}
static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3094,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
up_read(&vcpu->kvm->slots_lock);
}
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
+
+ /* try to reinject previous events if any */
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops->set_nmi(vcpu);
+ return;
+ }
+
+ if (vcpu->arch.interrupt.pending) {
+ kvm_x86_ops->set_irq(vcpu);
+ return;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.nmi_pending) {
+ if (kvm_x86_ops->nmi_allowed(vcpu)) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
+ }
+ } else if (kvm_cpu_has_interrupt(vcpu)) {
+ if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+ false);
+ kvm_x86_ops->set_irq(vcpu);
+ }
+ }
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
+ bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ kvm_run->request_interrupt_window;
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3128,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
- kvm_inject_pending_timer_irqs(vcpu);
-
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3138,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_disable();
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ smp_mb__after_clear_bit();
+
if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -3145,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
- vcpu->guest_mode = 1;
- /*
- * Make sure that guest_mode assignment won't happen after
- * testing the pending IRQ vector bitmap.
- */
- smp_wmb();
-
if (vcpu->arch.exception.pending)
__queue_exception(vcpu);
- else if (irqchip_in_kernel(vcpu->kvm))
- kvm_x86_ops->inject_pending_irq(vcpu);
else
- kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+ inject_pending_irq(vcpu, kvm_run);
- kvm_lapic_sync_to_vapic(vcpu);
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
up_read(&vcpu->kvm->slots_lock);
@@ -3193,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_debugreg(vcpu->arch.host_dr6, 6);
set_debugreg(vcpu->arch.host_dr7, 7);
- vcpu->guest_mode = 0;
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
++vcpu->stat.exits;
@@ -3220,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
profile_hit(KVM_PROFILING, (void *)rip);
}
- if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
- vcpu->arch.exception.pending = false;
kvm_lapic_sync_from_vapic(vcpu);
@@ -3230,6 +3323,7 @@ out:
return r;
}
+
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -3256,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_vcpu_block(vcpu);
down_read(&vcpu->kvm->slots_lock);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
- if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ {
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
- if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
- r = -EINTR;
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ default:
+ r = -EINTR;
+ break;
+ }
+ }
}
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- }
- if (signal_pending(current)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- }
- if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- }
+ if (r <= 0)
+ break;
+
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
}
}
@@ -3442,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
struct descriptor_table dt;
- int pending_vec;
vcpu_load(vcpu);
@@ -3472,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->efer = vcpu->arch.shadow_efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm)) {
- memset(sregs->interrupt_bitmap, 0,
- sizeof sregs->interrupt_bitmap);
- pending_vec = kvm_x86_ops->get_irq(vcpu);
- if (pending_vec >= 0)
- set_bit(pending_vec,
- (unsigned long *)sregs->interrupt_bitmap);
- } else
- memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
- sizeof sregs->interrupt_bitmap);
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
vcpu_put(vcpu);
@@ -3688,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
- tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
}
static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3785,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
}
static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- u32 old_tss_base,
- struct desc_struct *nseg_desc)
+ u16 old_tss_sel, u32 old_tss_base,
+ struct desc_struct *nseg_desc)
{
struct tss_segment_16 tss_segment_16;
int ret = 0;
@@ -3805,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
&tss_segment_16, sizeof tss_segment_16))
goto out;
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+ }
+
if (load_state_from_tss16(vcpu, &tss_segment_16))
goto out;
@@ -3814,7 +3924,7 @@ out:
}
static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- u32 old_tss_base,
+ u16 old_tss_sel, u32 old_tss_base,
struct desc_struct *nseg_desc)
{
struct tss_segment_32 tss_segment_32;
@@ -3834,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
&tss_segment_32, sizeof tss_segment_32))
goto out;
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+ }
+
if (load_state_from_tss32(vcpu, &tss_segment_32))
goto out;
@@ -3887,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
}
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
- &nseg_desc);
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
else
- ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
- &nseg_desc);
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3920,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
int mmu_reset_needed = 0;
- int i, pending_vec, max_bits;
+ int pending_vec, max_bits;
struct descriptor_table dt;
vcpu_load(vcpu);
@@ -3934,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
- vcpu->arch.cr3 = sregs->cr3;
+
+ down_read(&vcpu->kvm->slots_lock);
+ if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
+ vcpu->arch.cr3 = sregs->cr3;
+ else
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ up_read(&vcpu->kvm->slots_lock);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -3956,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- if (!irqchip_in_kernel(vcpu->kvm)) {
- memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->arch.irq_pending);
- vcpu->arch.irq_summary = 0;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
- if (vcpu->arch.irq_pending[i])
- __set_bit(i, &vcpu->arch.irq_summary);
- } else {
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap,
- max_bits);
- /* Only pending external irq is handled here */
- if (pending_vec < max_bits) {
- kvm_x86_ops->set_irq(vcpu, pending_vec);
- pr_debug("Set back pending irq %d\n",
- pending_vec);
- }
- kvm_pic_clear_isr_ack(vcpu->kvm);
+ max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -4308,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4411,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
}
}
+ spin_lock(&kvm->mmu_lock);
if (!kvm->arch.n_requested_mmu_pages) {
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
}
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
return 0;
@@ -4425,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
void kvm_arch_flush_shadow(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
+ kvm_reload_remote_mmus(kvm);
}
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4434,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
|| vcpu->arch.nmi_pending;
}
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
- printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
- int ipi_pcpu = vcpu->cpu;
- int cpu = get_cpu();
+ int me;
+ int cpu = vcpu->cpu;
if (waitqueue_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq);
++vcpu->stat.halt_wakeup;
}
- /*
- * We may be called synchronously with irqs disabled in guest mode,
- * So need not to call smp_call_function_single() in that case.
- */
- if (vcpu->guest_mode && vcpu->cpu != cpu)
- smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+
+ me = get_cpu();
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ smp_send_reschedule(cpu);
put_cpu();
}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a738..4c8e10af78e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
vcpu->arch.exception.pending = false;
}
-static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
{
vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.soft = soft;
vcpu->arch.interrupt.nr = vector;
}
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
vcpu->arch.interrupt.pending = false;
}
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+ vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+ return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d208..c1b6c232e02 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
#define SrcImm (5<<4) /* Immediate operand. */
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
#define SrcOne (7<<4) /* Implied '1' */
-#define SrcMask (7<<4)
+#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
-#define ModRM (1<<7)
+#define ModRM (1<<8)
/* Destination is only written; never read. */
-#define Mov (1<<8)
-#define BitOp (1<<9)
-#define MemAbs (1<<10) /* Memory operand is absolute displacement */
+#define Mov (1<<9)
+#define BitOp (1<<10)
+#define MemAbs (1<<11) /* Memory operand is absolute displacement */
#define String (1<<12) /* String instruction (rep capable) */
#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
@@ -76,6 +77,7 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
+#define Src2Imm16 (4<<29)
#define Src2Mask (7<<29)
enum {
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = {
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
/* 0x70 - 0x77 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
/* 0x78 - 0x7F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
/* 0x80 - 0x87 */
Group | Group1_80, Group | Group1_81,
Group | Group1_82, Group | Group1_83,
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = {
/* 0x90 - 0x97 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x98 - 0x9F */
- 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+ 0, 0, SrcImm | Src2Imm16, 0,
+ ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
/* 0xA0 - 0xA7 */
ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = {
0, ImplicitOps | Stack, 0, 0,
ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */
- 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
+ 0, 0, 0, ImplicitOps | Stack,
+ ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
/* 0xD0 - 0xD7 */
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */
0, 0, 0, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ ByteOp | SrcImmUByte, SrcImmUByte,
+ ByteOp | SrcImmUByte, SrcImmUByte,
/* 0xE8 - 0xEF */
- ImplicitOps | Stack, SrcImm | ImplicitOps,
- ImplicitOps, SrcImmByte | ImplicitOps,
+ SrcImm | Stack, SrcImm | ImplicitOps,
+ SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = {
/* 0x70 - 0x7F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x80 - 0x8F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xA0 - 0xA7 */
@@ -1044,10 +1046,14 @@ done_prefixes:
}
break;
case SrcImmByte:
+ case SrcImmUByte:
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = 1;
- c->src.val = insn_fetch(s8, 1, c->eip);
+ if ((c->d & SrcMask) == SrcImmByte)
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ else
+ c->src.val = insn_fetch(u8, 1, c->eip);
break;
case SrcOne:
c->src.bytes = 1;
@@ -1072,6 +1078,12 @@ done_prefixes:
c->src2.bytes = 1;
c->src2.val = insn_fetch(u8, 1, c->eip);
break;
+ case Src2Imm16:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 2;
+ c->src2.val = insn_fetch(u16, 2, c->eip);
+ break;
case Src2One:
c->src2.bytes = 1;
c->src2.val = 1;
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
return 0;
}
+void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (!(int_shadow & mask))
+ ctxt->interruptibility = mask;
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
int io_dir_in;
int rc = 0;
+ ctxt->interruptibility = 0;
+
/* Shadow copy of register state. Committed on successful emulation.
* NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
* modify them.
@@ -1531,13 +1559,10 @@ special_insn:
return -1;
}
return 0;
- case 0x70 ... 0x7f: /* jcc (short) */ {
- int rel = insn_fetch(s8, 1, c->eip);
-
+ case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, rel);
+ jmp_rel(c, c->src.val);
break;
- }
case 0x80 ... 0x83: /* Grp1 */
switch (c->modrm_reg) {
case 0:
@@ -1609,6 +1634,9 @@ special_insn:
int err;
sel = c->src.val;
+ if (c->modrm_reg == VCPU_SREG_SS)
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
if (c->modrm_reg <= 5) {
type_bits = (c->modrm_reg == 1) ? 9 : 1;
err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1769,59 +1797,32 @@ special_insn:
break;
case 0xe4: /* inb */
case 0xe5: /* in */
- port = insn_fetch(u8, 1, c->eip);
+ port = c->src.val;
io_dir_in = 1;
goto do_io;
case 0xe6: /* outb */
case 0xe7: /* out */
- port = insn_fetch(u8, 1, c->eip);
+ port = c->src.val;
io_dir_in = 0;
goto do_io;
case 0xe8: /* call (near) */ {
- long int rel;
- switch (c->op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, c->eip);
- break;
- default:
- DPRINTF("Call: Invalid op_bytes\n");
- goto cannot_emulate;
- }
+ long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- c->op_bytes = c->ad_bytes;
emulate_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: /* jmp far */ {
- uint32_t eip;
- uint16_t sel;
-
- switch (c->op_bytes) {
- case 2:
- eip = insn_fetch(u16, 2, c->eip);
- break;
- case 4:
- eip = insn_fetch(u32, 4, c->eip);
- break;
- default:
- DPRINTF("jmp far: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- sel = insn_fetch(u16, 2, c->eip);
- if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
+ case 0xea: /* jmp far */
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
+ VCPU_SREG_CS) < 0) {
DPRINTF("jmp far: Failed to load CS descriptor\n");
goto cannot_emulate;
}
- c->eip = eip;
+ c->eip = c->src.val;
break;
- }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -1865,6 +1866,7 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfb: /* sti */
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -2039,28 +2041,11 @@ twobyte_insn:
if (!test_cc(c->b, ctxt->eflags))
c->dst.type = OP_NONE; /* no writeback */
break;
- case 0x80 ... 0x8f: /* jnz rel, etc*/ {
- long int rel;
-
- switch (c->op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, c->eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, c->eip);
- break;
- default:
- DPRINTF("jnz: Invalid op_bytes\n");
- goto cannot_emulate;
- }
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, rel);
+ jmp_rel(c, c->src.val);
c->dst.type = OP_NONE;
break;
- }
case 0xa3:
bt: /* bt */
c->dst.type = OP_NONE;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66..f9d35632666 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
# Makefile for x86 specific library files.
#
-obj-$(CONFIG_SMP) := msr-on-cpu.o
+obj-$(CONFIG_SMP) := msr.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf720dbb..00000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
-#include <asm/msr.h>
-
-struct msr_info {
- u32 msr_no;
- u32 l, h;
- int err;
-};
-
-static void __rdmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rdmsr(rv->msr_no, rv->l, rv->h);
-}
-
-static void __wrmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- wrmsr(rv->msr_no, rv->l, rv->h);
-}
-
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
- *l = rv.l;
- *h = rv.h;
-
- return err;
-}
-
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
-
- return err;
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
-static void __rdmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
-}
-
-static void __wrmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
-}
-
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
- *l = rv.l;
- *h = rv.h;
-
- return err ? err : rv.err;
-}
-
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
-
- return err ? err : rv.err;
-}
-
-EXPORT_SYMBOL(rdmsr_on_cpu);
-EXPORT_SYMBOL(wrmsr_on_cpu);
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
-EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 00000000000..1440b9c0547
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,183 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+struct msr_info {
+ u32 msr_no;
+ struct msr reg;
+ struct msr *msrs;
+ int off;
+ int err;
+};
+
+static void __rdmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = &rv->msrs[this_cpu - rv->off];
+ else
+ reg = &rv->reg;
+
+ rdmsr(rv->msr_no, reg->l, reg->h);
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = &rv->msrs[this_cpu - rv->off];
+ else
+ reg = &rv->reg;
+
+ wrmsr(rv->msr_no, reg->l, reg->h);
+}
+
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsr_on_cpu);
+
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.off = cpumask_first(mask);
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ preempt_disable();
+ /*
+ * FIXME: handle the CPU we're executing on separately for now until
+ * smp_call_function_many has been fixed to not skip it.
+ */
+ this_cpu = raw_smp_processor_id();
+ smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
+
+ smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
+ preempt_enable();
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.off = cpumask_first(mask);
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ preempt_disable();
+ /*
+ * FIXME: handle the CPU we're executing on separately for now until
+ * smp_call_function_many has been fixed to not skip it.
+ */
+ this_cpu = raw_smp_processor_id();
+ smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
+
+ smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
+ preempt_enable();
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5ec7ae36661..c6acc632637 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
+#include <linux/perf_counter.h> /* perf_swcounter_event */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -1013,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
@@ -1106,10 +1109,15 @@ good_area:
return;
}
- if (fault & VM_FAULT_MAJOR)
+ if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- else
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ regs, address);
+ } else {
tsk->min_flt++;
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ regs, address);
+ }
check_v8086_mode(regs, address, tsk);
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c0bedcd10f9..18d244f7020 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,21 +40,20 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
{
- u64 *p;
- void *start, *end;
+ u64 *p, *start, *end;
u64 start_bad, last_bad;
u64 start_phys_aligned;
- size_t incr;
+ const size_t incr = sizeof(pattern);
- incr = sizeof(pattern);
start_phys_aligned = ALIGN(start_phys, incr);
start = __va(start_phys_aligned);
- end = start + size - (start_phys_aligned - start_phys);
+ end = start + (size - (start_phys_aligned - start_phys)) / incr;
start_bad = 0;
last_bad = 0;
for (p = start; p < end; p++)
*p = pattern;
+
for (p = start; p < end; p++, start_phys_aligned += incr) {
if (*p == pattern)
continue;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 3b285e656e2..b07dd8d0b32 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
- ret = NOTIFY_STOP;
+ case DIE_NMI_IPI:
+ model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+ ret = NOTIFY_STOP;
break;
default:
break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = 2
};
static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaad..4da7230b3d1 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/nmi.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
#include "op_x86_model.h"
#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
u64 val;
int i;
+ /*
+ * This can happen if perf counters are in use when
+ * we steal the die notifier NMI.
+ */
+ if (unlikely(!reset_value))
+ goto out;
+
for (i = 0 ; i < num_counters; ++i) {
if (!reset_value[i])
continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
}
}
+out:
/* Only P6 based Pentium M need to re-unmask the apic vector but it
* doesn't hurt other P6 variant */
apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab5..58bc00f68b1 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
}
}
+ current->mm->context.vdso = (void *)addr;
+
if (compat_uses_vma || !compat) {
/*
* MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto up_fail;
}
- current->mm->context.vdso = (void *)addr;
current_thread_info()->sysenter_return =
VDSO32_SYMBOL(addr, SYSENTER_RETURN);
up_fail:
+ if (ret)
+ current->mm->context.vdso = NULL;
+
up_write(&mm->mmap_sem);
return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index cac083386e0..21e1aeb9f3e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -116,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto up_fail;
}
+ current->mm->context.vdso = (void *)addr;
+
ret = install_special_mapping(mm, addr, vdso_size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
VM_ALWAYSDUMP,
vdso_pages);
- if (ret)
+ if (ret) {
+ current->mm->context.vdso = NULL;
goto up_fail;
+ }
- current->mm->context.vdso = (void *)addr;
up_fail:
up_write(&mm->mmap_sem);
return ret;