aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/include/asm/agp.h4
-rw-r--r--arch/x86/include/asm/apicdef.h2
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cacheflush.h54
-rw-r--r--arch/x86/include/asm/device.h3
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/fixmap.h3
-rw-r--r--arch/x86/include/asm/iomap.h9
-rw-r--r--arch/x86/include/asm/kvm.h10
-rw-r--r--arch/x86/include/asm/kvm_emulate.h (renamed from arch/x86/include/asm/kvm_x86_emulate.h)0
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/mtrr.h6
-rw-r--r--arch/x86/include/asm/nops.h2
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pci.h1
-rw-r--r--arch/x86/include/asm/percpu.h9
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/vmx.h8
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c46
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c14
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/pci-dma.c4
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c7
-rw-r--r--arch/x86/kernel/setup.c23
-rw-r--r--arch/x86/kernel/setup_percpu.c364
-rw-r--r--arch/x86/kernel/smpboot.c16
-rw-r--r--arch/x86/kernel/tboot.c447
-rw-r--r--arch/x86/kernel/vmlinux.lds.S11
-rw-r--r--arch/x86/kvm/Kconfig21
-rw-r--r--arch/x86/kvm/Makefile35
-rw-r--r--arch/x86/kvm/emulate.c (renamed from arch/x86/kvm/x86_emulate.c)265
-rw-r--r--arch/x86/kvm/i8254.c160
-rw-r--r--arch/x86/kvm/i8254.h5
-rw-r--r--arch/x86/kvm/i8259.c116
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/kvm_svm.h51
-rw-r--r--arch/x86/kvm/kvm_timer.h2
-rw-r--r--arch/x86/kvm/lapic.c334
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c587
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/mmutrace.h220
-rw-r--r--arch/x86/kvm/paging_tmpl.h141
-rw-r--r--arch/x86/kvm/svm.c889
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/trace.h355
-rw-r--r--arch/x86/kvm/vmx.c497
-rw-r--r--arch/x86/kvm/x86.c815
-rw-r--r--arch/x86/kvm/x86.h4
-rw-r--r--arch/x86/mm/Makefile6
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/iomap_32.c27
-rw-r--r--arch/x86/mm/ioremap.c90
-rw-r--r--arch/x86/mm/mmap.c17
-rw-r--r--arch/x86/mm/pageattr.c29
-rw-r--r--arch/x86/mm/pat.c353
-rw-r--r--arch/x86/mm/physaddr.c70
-rw-r--r--arch/x86/mm/physaddr.h10
-rw-r--r--arch/x86/pci/amd_bus.c64
-rw-r--r--arch/x86/pci/common.c69
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c131
-rw-r--r--arch/x86/xen/smp.c1
-rw-r--r--arch/x86/xen/spinlock.c28
77 files changed, 4582 insertions, 2036 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc20fdc0f7f..e98e81a0497 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
config HAVE_SETUP_PER_CPU_AREA
def_bool y
-config HAVE_DYNAMIC_PER_CPU_AREA
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+ def_bool y
+
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
@@ -179,6 +182,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config HAVE_INTEL_TXT
+ def_bool y
+ depends on EXPERIMENTAL && DMAR && ACPI
+
# Use the generic interrupt handling code in kernel/irq/:
config GENERIC_HARDIRQS
bool
@@ -1413,6 +1420,10 @@ config X86_PAT
If unsure, say Y.
+config ARCH_USES_PG_UNCACHED
+ def_bool y
+ depends on X86_PAT
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 9825cd64c9b..eec2a70d437 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -22,10 +22,6 @@
*/
#define flush_agp_cache() wbinvd()
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
/* GATT allocation. Returns/accepts GATT kernel virtual address. */
#define alloc_gatt_pages(order) \
((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7386bfa4f4b..3b62da926de 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -15,6 +15,7 @@
#define APIC_LVR 0x30
#define APIC_LVR_MASK 0xFF00FF
+#define APIC_LVR_DIRECTED_EOI (1 << 24)
#define GET_APIC_VERSION(x) ((x) & 0xFFu)
#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
#ifdef CONFIG_X86_32
@@ -41,6 +42,7 @@
#define APIC_DFR_CLUSTER 0x0FFFFFFFul
#define APIC_DFR_FLAT 0xFFFFFFFFul
#define APIC_SPIV 0xF0
+#define APIC_SPIV_DIRECTED_EOI (1 << 12)
#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
#define APIC_SPIV_APIC_ENABLED (1 << 8)
#define APIC_ISR 0x100
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 1724e8de317..6ca20218dd7 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -85,7 +85,8 @@ struct efi_info {
struct boot_params {
struct screen_info screen_info; /* 0x000 */
struct apm_bios_info apm_bios_info; /* 0x040 */
- __u8 _pad2[12]; /* 0x054 */
+ __u8 _pad2[4]; /* 0x054 */
+ __u64 tboot_addr; /* 0x058 */
struct ist_info ist_info; /* 0x060 */
__u8 _pad3[16]; /* 0x070 */
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e55dfc1ad45..b54f6afe7ec 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
memcpy(dst, src, len);
}
-#define PG_non_WB PG_arch_1
-PAGEFLAG(NonWB, non_WB)
+#define PG_WC PG_arch_1
+PAGEFLAG(WC, WC)
+
+#ifdef CONFIG_X86_PAT
+/*
+ * X86 PAT uses page flags WC and Uncached together to keep track of
+ * memory type of pages that have backing page struct. X86 PAT supports 3
+ * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and
+ * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
+ * been changed from its default (value of -1 used to denote this).
+ * Note we do not support _PAGE_CACHE_UC here.
+ *
+ * Caller must hold memtype_lock for atomicity.
+ */
+static inline unsigned long get_page_memtype(struct page *pg)
+{
+ if (!PageUncached(pg) && !PageWC(pg))
+ return -1;
+ else if (!PageUncached(pg) && PageWC(pg))
+ return _PAGE_CACHE_WC;
+ else if (PageUncached(pg) && !PageWC(pg))
+ return _PAGE_CACHE_UC_MINUS;
+ else
+ return _PAGE_CACHE_WB;
+}
+
+static inline void set_page_memtype(struct page *pg, unsigned long memtype)
+{
+ switch (memtype) {
+ case _PAGE_CACHE_WC:
+ ClearPageUncached(pg);
+ SetPageWC(pg);
+ break;
+ case _PAGE_CACHE_UC_MINUS:
+ SetPageUncached(pg);
+ ClearPageWC(pg);
+ break;
+ case _PAGE_CACHE_WB:
+ SetPageUncached(pg);
+ SetPageWC(pg);
+ break;
+ default:
+ case -1:
+ ClearPageUncached(pg);
+ ClearPageWC(pg);
+ break;
+ }
+}
+#else
+static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
+static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
+#endif
/*
* The set_memory_* API can be used to change various attributes of a virtual
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 4994a20acbc..cee34e9ca45 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops;
#endif
};
+struct pdev_archdata {
+};
+
#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83c1bc8d2e8..456a304b817 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -299,6 +299,8 @@ do { \
#ifdef CONFIG_X86_32
+#define STACK_RND_MASK (0x7ff)
+
#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7b2d71df39a..14f9890eb49 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -132,6 +132,9 @@ enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
+#ifdef CONFIG_INTEL_TXT
+ FIX_TBOOT_BASE,
+#endif
__end_of_fixed_addresses
};
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 0e9fe1d9d97..f35eb45d657 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -26,13 +26,16 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-int
-is_io_mapping_possible(resource_size_t base, unsigned long size);
-
void *
iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
void
iounmap_atomic(void *kvaddr, enum km_type type);
+int
+iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
+
+void
+iomap_free(resource_size_t base, unsigned long size);
+
#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 125be8b1956..4a5fe914dc5 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -17,6 +17,8 @@
#define __KVM_HAVE_USER_NMI
#define __KVM_HAVE_GUEST_DEBUG
#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -236,6 +238,14 @@ struct kvm_pit_state {
struct kvm_pit_channel_state channels[3];
};
+#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
+
+struct kvm_pit_state2 {
+ struct kvm_pit_channel_state channels[3];
+ __u32 flags;
+ __u32 reserved[9];
+};
+
struct kvm_reinject_control {
__u8 pit_reinject;
__u8 reserved[31];
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c42311..b7ed2c42311 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eabdc1cfab5..3be000435fa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -37,12 +38,14 @@
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
-#define KVM_GUEST_CR0_MASK \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
- | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
#define KVM_VM_CR0_ALWAYS_ON \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
- | X86_CR0_MP)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_GUEST_CR4_MASK \
(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -51,12 +54,12 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
-/* shadow tables are PAE even on non-PAE hosts */
-#define KVM_HPAGE_SHIFT 21
-#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
-#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
-
-#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES 3
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
#define DE_VECTOR 0
#define DB_VECTOR 1
@@ -120,6 +123,10 @@ enum kvm_reg {
NR_VCPU_REGS
};
+enum kvm_reg_ex {
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+};
+
enum {
VCPU_SREG_ES,
VCPU_SREG_CS,
@@ -131,7 +138,7 @@ enum {
VCPU_SREG_LDTR,
};
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
#define KVM_NR_MEM_OBJS 40
@@ -308,7 +315,6 @@ struct kvm_vcpu_arch {
struct {
gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
- int largepage;
unsigned long mmu_seq;
} update_pte;
@@ -334,16 +340,6 @@ struct kvm_vcpu_arch {
u8 nr;
} interrupt;
- struct {
- int vm86_active;
- u8 save_iopl;
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
int halt_request; /* real mode on Intel only */
int cpuid_nent;
@@ -366,13 +362,15 @@ struct kvm_vcpu_arch {
u32 pat;
int switch_db_regs;
- unsigned long host_db[KVM_NR_DB_REGS];
- unsigned long host_dr6;
- unsigned long host_dr7;
unsigned long db[KVM_NR_DB_REGS];
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 *mce_banks;
};
struct kvm_mem_alias {
@@ -409,6 +407,7 @@ struct kvm_arch{
struct page *ept_identity_pagetable;
bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
unsigned long irq_sources_bitmap;
unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
@@ -526,6 +525,9 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ bool (*gb_page_enable)(void);
+
+ const struct trace_print_flags *exit_reasons_str;
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
u32 error_code);
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-#define MSR_IA32_TIME_STAMP_COUNTER 0x010
-
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
@@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305ae09..c584076a47f 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_KVM_PARA_H
#define _ASM_X86_KVM_PARA_H
+#include <linux/types.h>
+
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
*/
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6be7fc254b5..bd5549034a9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -374,6 +374,7 @@
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
+#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index a51ada8467d..4365ffdb461 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -121,6 +121,9 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
extern void mtrr_bp_init(void);
+extern void set_mtrr_aps_delayed_init(void);
+extern void mtrr_aps_init(void);
+extern void mtrr_bp_restore(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
# else
@@ -161,6 +164,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
#define mtrr_ap_init() do {} while (0)
#define mtrr_bp_init() do {} while (0)
+#define set_mtrr_aps_delayed_init() do {} while (0)
+#define mtrr_aps_init() do {} while (0)
+#define mtrr_bp_restore() do {} while (0)
# endif
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index ad2668ee1aa..6d8723a766c 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -65,6 +65,8 @@
6: osp nopl 0x00(%eax,%eax,1)
7: nopl 0x00000000(%eax)
8: nopl 0x00000000(%eax,%eax,1)
+ Note: All the above are assumed to be a single instruction.
+ There is kernel code that depends on this.
*/
#define P6_NOP1 GENERIC_NOP1
#define P6_NOP2 ".byte 0x66,0x90\n"
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 7af14e512f9..e2c1668dde7 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end);
extern int kernel_map_sync_memtype(u64 base, unsigned long size,
unsigned long flag);
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ unsigned long *type);
+
+void io_free_memtype(resource_size_t start, resource_size_t end);
+
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 1ff685ca221..f76a162c082 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void);
#else
#define pcibios_assign_all_busses() 0
#endif
-#define pcibios_scan_all_fns(a, b) 0
extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_IO 0x1000
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 04eacefcfd2..b65a36defeb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -168,15 +168,6 @@ do { \
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
- return NULL;
-}
-#endif
-
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4ae2ccfed63..c3429e8b242 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -1021,6 +1021,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
+extern int amd_get_nb_id(int cpu);
+
struct aperfmperf {
u64 aperf, mperf;
};
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11be5ad2e0e..272514c2d45 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -55,6 +55,7 @@
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -351,9 +352,16 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
+
+#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPTP_UC_BIT (1ull << 8)
+#define VMX_EPTP_WB_BIT (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7..832cb838cb4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 22a47c82f3c..f32fa71ccf9 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}
+int amd_get_nb_id(int cpu)
+{
+ int id = 0;
+#ifdef CONFIG_SMP
+ id = per_cpu(cpu_llc_id, cpu);
+#endif
+ return id;
+}
+EXPORT_SYMBOL_GPL(amd_get_nb_id);
+
static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
{
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd040..dca325c0399 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
#include <asm/apic.h>
#include <asm/desc.h>
-static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
-static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
+static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
static DEFINE_PER_CPU(int, cpu_priv_count);
static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62..fdd51b55435 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
+void __weak decode_mce(struct mce *m)
+{
+ return;
+}
+
static void print_mce(struct mce *m)
{
printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid,
m->apicid);
+
+ decode_mce(m);
}
static void print_mce_head(void)
@@ -215,7 +222,10 @@ static void print_mce_head(void)
static void print_mce_tail(void)
{
printk(KERN_EMERG "This is not a software problem!\n"
- "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
+ "Run through mcelog --ascii to decode and contact your hardware vendor\n"
+#endif
+ );
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -1091,7 +1101,7 @@ void mce_log_therm_throt_event(__u64 status)
*/
static int check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
@@ -1110,7 +1120,7 @@ static void mcheck_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(next_interval);
+ n = &__get_cpu_var(mce_next_interval);
if (mce_notify_irq())
*n = max(*n/2, HZ/100);
else
@@ -1325,7 +1335,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
+ int *n = &__get_cpu_var(mce_next_interval);
if (mce_ignore_ce)
return;
@@ -1925,7 +1935,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
+ __get_cpu_var(mce_next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1fecba404fd..8cd5224943b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
struct threshold_block *blocks;
cpumask_var_t cpus;
};
-static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 7af0f88a416..84e83de5457 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
+static bool mtrr_aps_delayed_init;
static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
@@ -163,7 +164,10 @@ static void ipi_handler(void *info)
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
- } else {
+ } else if (mtrr_aps_delayed_init) {
+ /*
+ * Initialize the MTRRs inaddition to the synchronisation.
+ */
mtrr_if->set_all();
}
@@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
*/
if (reg != ~0U)
mtrr_if->set(reg, base, size, type);
+ else if (!mtrr_aps_delayed_init)
+ mtrr_if->set_all();
/* Wait for the others */
while (atomic_read(&data.count))
@@ -721,9 +727,7 @@ void __init mtrr_bp_init(void)
void mtrr_ap_init(void)
{
- unsigned long flags;
-
- if (!mtrr_if || !use_intel())
+ if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
@@ -738,11 +742,7 @@ void mtrr_ap_init(void)
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
- local_irq_save(flags);
-
- mtrr_if->set_all();
-
- local_irq_restore(flags);
+ set_mtrr(~0U, 0, 0, 0);
}
/**
@@ -753,6 +753,34 @@ void mtrr_save_state(void)
smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
}
+void set_mtrr_aps_delayed_init(void)
+{
+ if (!use_intel())
+ return;
+
+ mtrr_aps_delayed_init = true;
+}
+
+/*
+ * MTRR initialization for all AP's
+ */
+void mtrr_aps_init(void)
+{
+ if (!use_intel())
+ return;
+
+ set_mtrr(~0U, 0, 0, 0);
+ mtrr_aps_delayed_init = false;
+}
+
+void mtrr_bp_restore(void)
+{
+ if (!use_intel())
+ return;
+
+ mtrr_if->set_all();
+}
+
static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f9cd0849bd4..2732e2c1e4d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1211,7 +1211,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
x86_pmu_disable_counter(hwc, idx);
}
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
@@ -1253,7 +1253,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
* The hw counter starts counting from this counter offset,
@@ -1470,7 +1470,7 @@ void perf_counter_print_debug(void)
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
@@ -2110,8 +2110,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
entry->ip[entry->nr++] = ip;
}
-static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
static DEFINE_PER_CPU(int, in_nmi_frame);
@@ -2264,9 +2264,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
struct perf_callchain_entry *entry;
if (in_nmi())
- entry = &__get_cpu_var(nmi_entry);
+ entry = &__get_cpu_var(pmc_nmi_entry);
else
- entry = &__get_cpu_var(irq_entry);
+ entry = &__get_cpu_var(pmc_irq_entry);
entry->nr = 0;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be74510..d59fe323807 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
END(ftrace_graph_caller)
GLOBAL(return_to_handler)
- subq $80, %rsp
+ subq $24, %rsp
/* Save the return values */
movq %rax, (%rsp)
@@ -155,10 +155,10 @@ GLOBAL(return_to_handler)
call ftrace_return_to_handler
- movq %rax, 72(%rsp)
+ movq %rax, 16(%rsp)
movq 8(%rsp), %rdx
movq (%rsp), %rax
- addq $72, %rsp
+ addq $16, %rsp
retq
#endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f61..63b0ec8d3d4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,7 +34,6 @@
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
- enum paravirt_lazy_mode mode;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
{
struct kvm_para_state *state = kvm_para_state();
- if (state->mode != PARAVIRT_LAZY_MMU) {
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
kvm_mmu_op(buffer, len);
return;
}
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
static void kvm_enter_lazy_mmu(void)
{
- struct kvm_para_state *state = kvm_para_state();
-
paravirt_enter_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
}
static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
mmu_queue_flush(state);
paravirt_leave_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
}
static void __init paravirt_ops_setup(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f152..e5efcdcca31 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void)
struct timespec ts;
int low, high;
- low = (int)__pa(&wall_clock);
- high = ((u64)__pa(&wall_clock) >> 32);
+ low = (int)__pa_symbol(&wall_clock);
+ high = ((u64)__pa_symbol(&wall_clock) >> 32);
native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
vcpu_time = &get_cpu_var(hv_clock);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d71c8655905..64b838eac18 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -225,10 +225,8 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "soft", 4))
swiotlb = 1;
#endif
- if (!strncmp(p, "pt", 2)) {
+ if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
- return 1;
- }
gart_parse_options(p);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624b..6c3b2c6fd77 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
pci_read_config_dword(nb_ht, 0x60, &val);
set_dev_node(&dev->dev, val & 7);
- pci_dev_put(dev);
+ pci_dev_put(nb_ht);
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d10184..27349f92a6d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/efi.h>
#include <linux/dmi.h>
+#include <linux/tboot.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void)
if (reboot_emergency)
emergency_vmx_disable_all();
+ tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -634,6 +637,8 @@ static void native_machine_halt(void)
/* stop other cpus and apics */
machine_shutdown();
+ tboot_shutdown(TB_SHUTDOWN_HALT);
+
/* stop this cpu */
stop_this_cpu(NULL);
}
@@ -645,6 +650,8 @@ static void native_machine_power_off(void)
machine_shutdown();
pm_power_off();
}
+ /* a fallback in case there is no PM info available */
+ tboot_shutdown(TB_SHUTDOWN_HALT);
}
struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef..19f15c4076f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -66,6 +66,7 @@
#include <linux/percpu.h>
#include <linux/crash_dump.h>
+#include <linux/tboot.h>
#include <video/edid.h>
@@ -711,6 +712,21 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Must call this twice: Once just to detect whether hardware doesn't
+ * support NX (so that the early EHCI debug console setup can safely
+ * call set_fixmap(), and then again after parsing early parameters to
+ * honor the respective command line option.
+ */
+ check_efer();
+#endif
+
+ parse_early_param();
+
/* VMI may relocate the fixmap; do this before touching ioremap area */
vmi_init();
@@ -793,11 +809,6 @@ void __init setup_arch(char **cmdline_p)
#endif
#endif
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- parse_early_param();
-
#ifdef CONFIG_X86_64
check_efer();
#endif
@@ -977,6 +988,8 @@ void __init setup_arch(char **cmdline_p)
paravirt_pagetable_setup_done(swapper_pg_dir);
paravirt_post_allocator_init();
+ tboot_probe();
+
#ifdef CONFIG_X86_64
map_vsyscall();
#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f21..d559af913e1 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
+#ifdef CONFIG_X86_32
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
#endif
return false;
}
+#endif
/**
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
}
/*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit. A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk. Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
+ * Helpers for first chunk memory allocation
*/
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
- unsigned int cpu;
- void *ptr;
-};
-
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
- size_t off = (size_t)pageno << PAGE_SHIFT;
-
- if (off >= pcpul_size)
- return NULL;
-
- return virt_to_page(pcpul_map[cpu].ptr + off);
+ return pcpu_alloc_bootmem(cpu, size, align);
}
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
+static void __init pcpu_fc_free(void *ptr, size_t size)
{
- size_t map_size, dyn_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- if (!chosen) {
- size_t vm_size = VMALLOC_END - VMALLOC_START;
- size_t tot_size = nr_cpu_ids * PMD_SIZE;
-
- /* on non-NUMA, embedding is better */
- if (!pcpu_need_numa())
- return -EINVAL;
-
- /* don't consume more than 20% of vmalloc area */
- if (tot_size > vm_size / 5) {
- pr_info("PERCPU: too large chunk size %zuMB for "
- "large page remap\n", tot_size >> 20);
- return -EINVAL;
- }
- }
-
- /* need PSE */
- if (!cpu_has_pse) {
- pr_warning("PERCPU: lpage allocator requires PSE\n");
- return -EINVAL;
- }
-
- /*
- * Currently supports only single page. Supporting multiple
- * pages won't be too difficult if it ever becomes necessary.
- */
- pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
- PERCPU_DYNAMIC_RESERVE);
- if (pcpul_size > PMD_SIZE) {
- pr_warning("PERCPU: static data is larger than large page, "
- "can't use large page\n");
- return -EINVAL;
- }
- dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
- /* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
- pcpul_map = alloc_bootmem(map_size);
-
- for_each_possible_cpu(cpu) {
- pcpul_map[cpu].cpu = cpu;
- pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
- PMD_SIZE);
- if (!pcpul_map[cpu].ptr) {
- pr_warning("PERCPU: failed to allocate large page "
- "for cpu%u\n", cpu);
- goto enomem;
- }
-
- /*
- * Only use pcpul_size bytes and give back the rest.
- *
- * Ingo: The 2MB up-rounding bootmem is needed to make
- * sure the partial 2MB page is still fully RAM - it's
- * not well-specified to have a PAT-incompatible area
- * (unmapped RAM, device memory, etc.) in that hole.
- */
- free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
- PMD_SIZE - pcpul_size);
-
- memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
- }
-
- /* allocate address and map */
- pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
- vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
- for_each_possible_cpu(cpu) {
- pmd_t *pmd, pmd_v;
-
- pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
- cpu * PMD_SIZE);
- pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, pmd_v);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", pcpul_vm.addr, static_size);
-
- ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, pcpul_vm.addr, NULL);
-
- /* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < nr_cpu_ids - 1; i++)
- for (j = i + 1; j < nr_cpu_ids; j++)
- if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
- struct pcpul_ent tmp = pcpul_map[i];
- pcpul_map[i] = pcpul_map[j];
- pcpul_map[j] = tmp;
- }
-
- return ret;
-
-enomem:
- for_each_possible_cpu(cpu)
- if (pcpul_map[cpu].ptr)
- free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
- free_bootmem(__pa(pcpul_map), map_size);
- return -ENOMEM;
+ free_bootmem(__pa(ptr), size);
}
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
- void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
- unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
- int left = 0, right = nr_cpu_ids - 1;
- int pos;
-
- /* pcpul in use at all? */
- if (!pcpul_map)
- return NULL;
-
- /* okay, perform binary search */
- while (left <= right) {
- pos = (left + right) / 2;
-
- if (pcpul_map[pos].ptr < pmd_addr)
- left = pos + 1;
- else if (pcpul_map[pos].ptr > pmd_addr)
- right = pos - 1;
- else {
- /* it shouldn't be in the area for the first chunk */
- WARN_ON(offset < pcpul_size);
-
- return pcpul_vm.addr +
- pcpul_map[pos].cpu * PMD_SIZE + offset;
- }
- }
-
- return NULL;
-}
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ if (early_cpu_to_node(from) == early_cpu_to_node(to))
+ return LOCAL_DISTANCE;
+ else
+ return REMOTE_DISTANCE;
#else
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
-{
- return -EINVAL;
-}
+ return LOCAL_DISTANCE;
#endif
-
-/*
- * Embedding allocator
- *
- * The first chunk is sized to just contain the static area plus
- * module and dynamic reserves and embedded into linear physical
- * mapping so that it can use PMD mapping without additional TLB
- * pressure.
- */
-static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
-{
- size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-
- /*
- * If large page isn't supported, there's no benefit in doing
- * this. Also, embedding allocation doesn't play well with
- * NUMA.
- */
- if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
- return -EINVAL;
-
- return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
- reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
}
-/*
- * 4k page allocator
- *
- * This is the basic allocator. Static percpu area is allocated
- * page-by-page and most of initialization is done by the generic
- * setup function.
- */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_nr_static_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
- if (pageno < pcpu4k_nr_static_pages)
- return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
- return NULL;
-}
-
-static void __init pcpu4k_populate_pte(unsigned long addr)
+static void __init pcpup_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
-static ssize_t __init setup_pcpu_4k(size_t static_size)
-{
- size_t pages_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- pcpu4k_nr_static_pages = PFN_UP(static_size);
-
- /* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
- * sizeof(pcpu4k_pages[0]));
- pcpu4k_pages = alloc_bootmem(pages_size);
-
- /* allocate and copy */
- j = 0;
- for_each_possible_cpu(cpu)
- for (i = 0; i < pcpu4k_nr_static_pages; i++) {
- void *ptr;
-
- ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
- if (!ptr) {
- pr_warning("PERCPU: failed to allocate "
- "4k page for cpu%u\n", cpu);
- goto enomem;
- }
-
- memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
- pcpu4k_pages[j++] = virt_to_page(ptr);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
- pcpu4k_nr_static_pages, static_size);
-
- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, -1,
- -1, NULL, pcpu4k_populate_pte);
- goto out_free_ar;
-
-enomem:
- while (--j >= 0)
- free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
- ret = -ENOMEM;
-out_free_ar:
- free_bootmem(__pa(pcpu4k_pages), pages_size);
- return ret;
-}
-
-/* for explicit first chunk allocator selection */
-static char pcpu_chosen_alloc[16] __initdata;
-
-static int __init percpu_alloc_setup(char *str)
-{
- strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
- return 0;
-}
-early_param("percpu_alloc", percpu_alloc_setup);
-
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
void __init setup_per_cpu_areas(void)
{
- size_t static_size = __per_cpu_end - __per_cpu_start;
unsigned int cpu;
unsigned long delta;
- size_t pcpu_unit_size;
- ssize_t ret;
+ int rc;
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
- * Allocate percpu area. If PSE is supported, try to make use
- * of large page mappings. Please read comments on top of
- * each allocator for details.
+ * Allocate percpu area. Embedding allocator is our favorite;
+ * however, on NUMA configurations, it can result in very
+ * sparse unit mapping and vmalloc area isn't spacious enough
+ * on 32bit. Use page in that case.
*/
- ret = -EINVAL;
- if (strlen(pcpu_chosen_alloc)) {
- if (strcmp(pcpu_chosen_alloc, "4k")) {
- if (!strcmp(pcpu_chosen_alloc, "lpage"))
- ret = setup_pcpu_lpage(static_size, true);
- else if (!strcmp(pcpu_chosen_alloc, "embed"))
- ret = setup_pcpu_embed(static_size, true);
- else
- pr_warning("PERCPU: unknown allocator %s "
- "specified\n", pcpu_chosen_alloc);
- if (ret < 0)
- pr_warning("PERCPU: %s allocator failed (%zd), "
- "falling back to 4k\n",
- pcpu_chosen_alloc, ret);
- }
- } else {
- ret = setup_pcpu_lpage(static_size, false);
- if (ret < 0)
- ret = setup_pcpu_embed(static_size, false);
+#ifdef CONFIG_X86_32
+ if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ rc = -EINVAL;
+ if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+ const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
+ const size_t dyn_size = PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+
+ rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, atom_size,
+ pcpu_cpu_distance,
+ pcpu_fc_alloc, pcpu_fc_free);
+ if (rc < 0)
+ pr_warning("PERCPU: %s allocator failed (%d), "
+ "falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
}
- if (ret < 0)
- ret = setup_pcpu_4k(static_size);
- if (ret < 0)
- panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
- static_size, ret);
-
- pcpu_unit_size = ret;
+ if (rc < 0)
+ rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_fc_alloc, pcpu_fc_free,
+ pcpup_populate_pte);
+ if (rc < 0)
+ panic("cannot initialize percpu area (err=%d)", rc);
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
- per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+ per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c36cc1452cd..a25eeec0008 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/nmi.h>
+#include <linux/tboot.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -1117,9 +1118,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
if (is_uv_system())
uv_system_init();
+
+ set_mtrr_aps_delayed_init();
out:
preempt_enable();
}
+
+void arch_enable_nonboot_cpus_begin(void)
+{
+ set_mtrr_aps_delayed_init();
+}
+
+void arch_enable_nonboot_cpus_end(void)
+{
+ mtrr_aps_init();
+}
+
/*
* Early setup to make printk work.
*/
@@ -1141,6 +1155,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
setup_ioapic_dest();
#endif
check_nmi_watchdog();
+ mtrr_aps_init();
}
static int __initdata setup_possible_cpus = -1;
@@ -1318,6 +1333,7 @@ void play_dead_common(void)
void native_play_dead(void)
{
play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
wbinvd_halt();
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 00000000000..86c9f91b48a
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
+/*
+ * tboot.c: main implementation of helper functions used by kernel for
+ * runtime support of Intel(R) Trusted Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dma_remapping.h>
+#include <linux/init_task.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/tboot.h>
+
+#include <asm/trampoline.h>
+#include <asm/processor.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+
+#include "acpi/realmode/wakeup.h"
+
+/* Global pointer to shared data; NULL means no measured launch. */
+struct tboot *tboot __read_mostly;
+
+/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
+#define AP_WAIT_TIMEOUT 1
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tboot: " fmt
+
+static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
+
+void __init tboot_probe(void)
+{
+ /* Look for valid page-aligned address for shared page. */
+ if (!boot_params.tboot_addr)
+ return;
+ /*
+ * also verify that it is mapped as we expect it before calling
+ * set_fixmap(), to reduce chance of garbage value causing crash
+ */
+ if (!e820_any_mapped(boot_params.tboot_addr,
+ boot_params.tboot_addr, E820_RESERVED)) {
+ pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
+ return;
+ }
+
+ /* only a natively booted kernel should be using TXT */
+ if (paravirt_enabled()) {
+ pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
+ return;
+ }
+
+ /* Map and check for tboot UUID. */
+ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
+ tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warning("tboot at 0x%llx is invalid\n",
+ boot_params.tboot_addr);
+ tboot = NULL;
+ return;
+ }
+ if (tboot->version < 5) {
+ pr_warning("tboot version is invalid: %u\n", tboot->version);
+ tboot = NULL;
+ return;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+}
+
+static pgd_t *tboot_pg_dir;
+static struct mm_struct tboot_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .cpu_vm_mask = CPU_MASK_ALL,
+};
+
+static inline void switch_to_tboot_pt(void)
+{
+ write_cr3(virt_to_phys(tboot_pg_dir));
+}
+
+static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(&tboot_mm, vaddr);
+ pud = pud_alloc(&tboot_mm, pgd, vaddr);
+ if (!pud)
+ return -1;
+ pmd = pmd_alloc(&tboot_mm, pud, vaddr);
+ if (!pmd)
+ return -1;
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ if (!pte)
+ return -1;
+ set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
+ pte_unmap(pte);
+ return 0;
+}
+
+static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
+ unsigned long nr)
+{
+ /* Reuse the original kernel mapping */
+ tboot_pg_dir = pgd_alloc(&tboot_mm);
+ if (!tboot_pg_dir)
+ return -1;
+
+ for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
+ if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
+ return -1;
+ }
+
+ return 0;
+}
+
+static void tboot_create_trampoline(void)
+{
+ u32 map_base, map_size;
+
+ /* Create identity map for tboot shutdown code. */
+ map_base = PFN_DOWN(tboot->tboot_base);
+ map_size = PFN_UP(tboot->tboot_size);
+ if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
+ panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
+ map_base, map_size);
+}
+
+#ifdef CONFIG_ACPI_SLEEP
+
+static void add_mac_region(phys_addr_t start, unsigned long size)
+{
+ struct tboot_mac_region *mr;
+ phys_addr_t end = start + size;
+
+ if (start && size) {
+ mr = &tboot->mac_regions[tboot->num_mac_regions++];
+ mr->start = round_down(start, PAGE_SIZE);
+ mr->size = round_up(end, PAGE_SIZE) - mr->start;
+ }
+}
+
+static int tboot_setup_sleep(void)
+{
+ tboot->num_mac_regions = 0;
+
+ /* S3 resume code */
+ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+
+#ifdef CONFIG_X86_TRAMPOLINE
+ /* AP trampoline code */
+ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
+#endif
+
+ /* kernel code + data + bss */
+ add_mac_region(virt_to_phys(_text), _end - _text);
+
+ tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+
+ return 0;
+}
+
+#else /* no CONFIG_ACPI_SLEEP */
+
+static int tboot_setup_sleep(void)
+{
+ /* S3 shutdown requested, but S3 not supported by the kernel... */
+ BUG();
+ return -1;
+}
+
+#endif
+
+void tboot_shutdown(u32 shutdown_type)
+{
+ void (*shutdown)(void);
+
+ if (!tboot_enabled())
+ return;
+
+ /*
+ * if we're being called before the 1:1 mapping is set up then just
+ * return and let the normal shutdown happen; this should only be
+ * due to very early panic()
+ */
+ if (!tboot_pg_dir)
+ return;
+
+ /* if this is S3 then set regions to MAC */
+ if (shutdown_type == TB_SHUTDOWN_S3)
+ if (tboot_setup_sleep())
+ return;
+
+ tboot->shutdown_type = shutdown_type;
+
+ switch_to_tboot_pt();
+
+ shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
+ shutdown();
+
+ /* should not reach here */
+ while (1)
+ halt();
+}
+
+static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
+{
+#define TB_COPY_GAS(tbg, g) \
+ tbg.space_id = g.space_id; \
+ tbg.bit_width = g.bit_width; \
+ tbg.bit_offset = g.bit_offset; \
+ tbg.access_width = g.access_width; \
+ tbg.address = g.address;
+
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
+
+ /*
+ * We need phys addr of waking vector, but can't use virt_to_phys() on
+ * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
+ * addr.
+ */
+ tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+ offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+ static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+ /* S0,1,2: */ -1, -1, -1,
+ /* S3: */ TB_SHUTDOWN_S3,
+ /* S4: */ TB_SHUTDOWN_S4,
+ /* S5: */ TB_SHUTDOWN_S5 };
+
+ if (!tboot_enabled())
+ return;
+
+ tboot_copy_fadt(&acpi_gbl_FADT);
+ tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+ tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+ /* we always use the 32b wakeup vector */
+ tboot->acpi_sinfo.vector_width = 32;
+
+ if (sleep_state >= ACPI_S_STATE_COUNT ||
+ acpi_shutdown_map[sleep_state] == -1) {
+ pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+ return;
+ }
+
+ tboot_shutdown(acpi_shutdown_map[sleep_state]);
+}
+
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
+{
+ unsigned long timeout;
+
+ timeout = AP_WAIT_TIMEOUT*HZ;
+ while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+ timeout) {
+ mdelay(1);
+ timeout--;
+ }
+
+ if (timeout)
+ pr_warning("tboot wait for APs timeout\n");
+
+ return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_DYING:
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1)
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return NOTIFY_BAD;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block tboot_cpu_notifier __cpuinitdata =
+{
+ .notifier_call = tboot_cpu_callback,
+};
+
+static __init int tboot_late_init(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_create_trampoline();
+
+ atomic_set(&ap_wfs_count, 0);
+ register_hotcpu_notifier(&tboot_cpu_notifier);
+ return 0;
+}
+
+late_initcall(tboot_late_init);
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
+ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
+
+/* offsets from pub/priv config space */
+#define TXTCR_HEAP_BASE 0x0300
+#define TXTCR_HEAP_SIZE 0x0308
+
+#define SHA1_SIZE 20
+
+struct sha1_hash {
+ u8 hash[SHA1_SIZE];
+};
+
+struct sinit_mle_data {
+ u32 version; /* currently 6 */
+ struct sha1_hash bios_acm_id;
+ u32 edx_senter_flags;
+ u64 mseg_valid;
+ struct sha1_hash sinit_hash;
+ struct sha1_hash mle_hash;
+ struct sha1_hash stm_hash;
+ struct sha1_hash lcp_policy_hash;
+ u32 lcp_policy_control;
+ u32 rlp_wakeup_addr;
+ u32 reserved;
+ u32 num_mdrs;
+ u32 mdrs_off;
+ u32 num_vtd_dmars;
+ u32 vtd_dmars_off;
+} __packed;
+
+struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
+{
+ void *heap_base, *heap_ptr, *config;
+
+ if (!tboot_enabled())
+ return dmar_tbl;
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+ * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+ */
+
+ /* map config space in order to get heap addr */
+ config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
+ PAGE_SIZE);
+ if (!config)
+ return NULL;
+
+ /* now map TXT heap */
+ heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
+ *(u64 *)(config + TXTCR_HEAP_SIZE));
+ iounmap(config);
+ if (!heap_base)
+ return NULL;
+
+ /* walk heap to SinitMleData */
+ /* skip BiosData */
+ heap_ptr = heap_base + *(u64 *)heap_base;
+ /* skip OsMleData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* skip OsSinitData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* now points to SinitMleDataSize; set to SinitMleData */
+ heap_ptr += sizeof(u64);
+ /* get addr of DMAR table */
+ dmar_tbl = (struct acpi_table_header *)(heap_ptr +
+ ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
+ sizeof(u64));
+
+ /* don't unmap heap because dmar.c needs access to this */
+
+ return dmar_tbl;
+}
+
+int tboot_force_iommu(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ if (no_iommu || swiotlb || dmar_disabled)
+ pr_warning("Forcing Intel-IOMMU to enabled\n");
+
+ dmar_disabled = 0;
+#ifdef CONFIG_SWIOTLB
+ swiotlb = 0;
+#endif
+ no_iommu = 0;
+
+ return 1;
+}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c0..0ccb57d5ee3 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -348,15 +348,12 @@ SECTIONS
_end = .;
}
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
STABS_DEBUG
DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
+ /DISCARD/ : { *(.eh_frame) }
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8600a09e0c6..b84e571f417 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,12 +1,8 @@
#
# KVM configuration
#
-config HAVE_KVM
- bool
-config HAVE_KVM_IRQCHIP
- bool
- default y
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
@@ -29,6 +25,9 @@ config KVM
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select ANON_INODES
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_EVENTFD
+ select KVM_APIC_ARCHITECTURE
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -63,18 +62,6 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
-config KVM_TRACE
- bool "KVM trace support"
- depends on KVM && SYSFS
- select MARKERS
- select RELAY
- select DEBUG_FS
- default n
- ---help---
- This option allows reading a trace of kvm-related events through
- relayfs. Note the ABI is not considered stable and will be
- modified in future updates.
-
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4efafe8..0e7fe78d0f7 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,22 +1,19 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o)
-ifeq ($(CONFIG_KVM_TRACE),y)
-common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-endif
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
-endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
+ coalesced_mmio.o irq_comm.o eventfd.o)
+kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+
+kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+ i8254.o timer.o
+kvm-intel-y += vmx.o
+kvm-amd-y += svm.o
+
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c
index 616de4628d6..1be5cd640e9 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1,5 +1,5 @@
/******************************************************************************
- * x86_emulate.c
+ * emulate.c
*
* Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
*
@@ -30,7 +30,9 @@
#define DPRINTF(x...) do {} while (0)
#endif
#include <linux/module.h>
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
+
+#include "mmu.h" /* for is_long_mode() */
/*
* Opcode effective-address decode tables.
@@ -60,6 +62,7 @@
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
#define SrcOne (7<<4) /* Implied '1' */
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<4) /* Immediate operand, unsigned */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -97,11 +100,11 @@ static u32 opcode_table[256] = {
/* 0x10 - 0x17 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x18 - 0x1F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = {
ByteOp | SrcImmUByte, SrcImmUByte,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
+ SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = {
static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
+ 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = {
ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ImplicitOps, 0, ImplicitOps, 0,
+ ImplicitOps, ImplicitOps, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -319,8 +324,11 @@ static u32 group2_table[] = {
};
/* EFLAGS bit definitions. */
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
#define EFLG_SF (1<<7)
#define EFLG_ZF (1<<6)
#define EFLG_AF (1<<4)
@@ -1027,6 +1035,7 @@ done_prefixes:
c->src.type = OP_MEM;
break;
case SrcImm:
+ case SrcImmU:
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
@@ -1044,6 +1053,19 @@ done_prefixes:
c->src.val = insn_fetch(s32, 4, c->eip);
break;
}
+ if ((c->d & SrcMask) == SrcImmU) {
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val &= 0xff;
+ break;
+ case 2:
+ c->src.val &= 0xffff;
+ break;
+ case 4:
+ c->src.val &= 0xffffffff;
+ break;
+ }
+ }
break;
case SrcImmByte:
case SrcImmUByte:
@@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
ctxt->interruptibility = mask;
}
+static inline void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct kvm_segment *cs, struct kvm_segment *ss)
+{
+ memset(cs, 0, sizeof(struct kvm_segment));
+ kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
+ memset(ss, 0, sizeof(struct kvm_segment));
+
+ cs->l = 0; /* will be adjusted later */
+ cs->base = 0; /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ cs->limit = 0xffffffff; /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->present = 1;
+ cs->db = 1;
+
+ ss->unusable = 0;
+ ss->base = 0; /* flat segment */
+ ss->limit = 0xffffffff; /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->db = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->present = 1;
+}
+
+static int
+emulate_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+
+ /* syscall is not available in real mode */
+ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
+ || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
+ return -1;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs.selector = (u16)(msr_data & 0xfffc);
+ ss.selector = (u16)(msr_data + 8);
+
+ if (is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->regs[VCPU_REGS_RCX] = c->eip;
+ if (is_long_mode(ctxt->vcpu)) {
+#ifdef CONFIG_X86_64
+ c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~(msr_data | EFLG_RF);
+#endif
+ } else {
+ /* legacy mode */
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ c->eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ }
+
+ return 0;
+}
+
+static int
+emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+
+ /* inject #UD if LOCK prefix is used */
+ if (c->lock_prefix)
+ return -1;
+
+ /* inject #GP if in real mode or paging is disabled */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ /* XXX sysenter/sysexit have not been tested in 64bit mode.
+ * Therefore, we inject an #UD.
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return -1;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_PROT32:
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ break;
+ }
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ cs.selector = (u16)msr_data;
+ cs.selector &= ~SELECTOR_RPL_MASK;
+ ss.selector = cs.selector + 8;
+ ss.selector &= ~SELECTOR_RPL_MASK;
+ if (ctxt->mode == X86EMUL_MODE_PROT64
+ || is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ c->regs[VCPU_REGS_RSP] = msr_data;
+
+ return 0;
+}
+
+static int
+emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+ int usermode;
+
+ /* inject #UD if LOCK prefix is used */
+ if (c->lock_prefix)
+ return -1;
+
+ /* inject #GP if in real mode or paging is disabled */
+ if (ctxt->mode == X86EMUL_MODE_REAL
+ || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ /* sysexit must be called from CPL 0 */
+ if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if ((c->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs.selector = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ ss.selector = (u16)(msr_data + 24);
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs.selector = (u16)(msr_data + 32);
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ ss.selector = cs.selector + 8;
+ cs.db = 0;
+ cs.l = 1;
+ break;
+ }
+ cs.selector |= SELECTOR_RPL_MASK;
+ ss.selector |= SELECTOR_RPL_MASK;
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+
+ return 0;
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
@@ -1970,6 +2203,12 @@ twobyte_insn:
goto cannot_emulate;
}
break;
+ case 0x05: /* syscall */
+ if (emulate_syscall(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
case 0x06:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
@@ -2036,6 +2275,18 @@ twobyte_insn:
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
+ case 0x34: /* sysenter */
+ if (emulate_sysenter(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
+ case 0x35: /* sysexit */
+ if (emulate_sysexit(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
case 0x40 ... 0x4f: /* cmov */
c->dst.val = c->dst.orig_val = c->src.val;
if (!test_cc(c->b, ctxt->eflags))
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21f68e00524..82ad523b490 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
+ if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
return 0;
}
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (vcpu->vcpu_id != 0 || !pit)
+ if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
timer = &pit->pit_state.pit_timer.timer;
@@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
pt->timer.function = kvm_timer_fn;
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
- pt->vcpu_id = 0;
+ pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(ps, val, 0);
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+ create_pit_timer(ps, val, 0);
+ }
break;
case 2:
case 3:
- create_pit_timer(ps, val, 1);
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
+ create_pit_timer(ps, val, 1);
+ }
break;
default:
destroy_pit_timer(&ps->pit_timer);
}
}
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+{
+ u8 saved_mode;
+ if (hpet_legacy_start) {
+ /* save existing mode for later reenablement */
+ saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
+ kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(kvm, channel, val);
+ kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(kvm, channel, val);
+ }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- pit_load_count(kvm, channel, val);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return container_of(dev, struct kvm_pit, speaker_dev);
}
-static void pit_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
+static inline int pit_in_range(gpa_t addr)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
int channel, access;
struct kvm_kpit_channel_state *s;
u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
val &= 0xff;
addr &= KVM_PIT_CHANNEL_MASK;
@@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
}
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static void pit_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
int ret, count;
struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
addr &= KVM_PIT_CHANNEL_MASK;
s = &pit_state->channels[addr];
@@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this,
memcpy(data, (char *)&ret, len);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- return ((addr >= KVM_PIT_BASE_ADDRESS) &&
- (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
-}
-
-static void speaker_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
+static int speaker_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
pit_state->speaker_data_on = (val >> 1) & 1;
pit_set_gate(kvm, 2, val & 1);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static void speaker_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
/* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
@@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this,
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
mutex_unlock(&pit_state->lock);
-}
-
-static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- return (addr == KVM_SPEAKER_BASE_ADDRESS);
+ return 0;
}
void kvm_pit_reset(struct kvm_pit *pit)
@@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
struct kvm_kpit_channel_state *c;
mutex_lock(&pit->pit_state.lock);
+ pit->pit_state.flags = 0;
for (i = 0; i < 3; i++) {
c = &pit->pit_state.channels[i];
c->mode = 0xff;
@@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
}
}
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+/* Caller must have writers lock on slots_lock */
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
+ int ret;
pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
if (!pit)
@@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_lock(&pit->pit_state.lock);
spin_lock_init(&pit->pit_state.inject_lock);
- /* Initialize PIO device */
- pit->dev.read = pit_ioport_read;
- pit->dev.write = pit_ioport_write;
- pit->dev.in_range = pit_in_range;
- pit->dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
-
- pit->speaker_dev.read = speaker_ioport_read;
- pit->speaker_dev.write = speaker_ioport_write;
- pit->speaker_dev.in_range = speaker_in_range;
- pit->speaker_dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
-
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit->mask_notifier.func = pit_mask_notifer;
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+ if (ret < 0)
+ goto fail;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_unregister;
+ }
+
return pit;
+
+fail_unregister:
+ __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+
+fail:
+ if (pit->irq_source_id >= 0)
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+
+ kfree(pit);
+ return NULL;
}
void kvm_free_pit(struct kvm *kvm)
@@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm)
if (kvm->arch.vpit) {
kvm_unregister_irq_mask_notifier(kvm, 0,
&kvm->arch.vpit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm,
+ &kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
@@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
struct kvm_vcpu *vcpu;
int i;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->irq_lock);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->irq_lock);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
* VCPU0, and only if its LVT0 is in EXTINT mode.
*/
if (kvm->arch.vapics_in_nmi_mode > 0)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
@@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
struct kvm_kpit_state *ps;
- if (vcpu && pit) {
+ if (pit) {
int inject = 0;
ps = &pit->pit_state;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b..d4c1c7ffdc0 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state {
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
+ u32 flags;
struct kvm_timer pit_timer;
bool is_periodic;
u32 speaker_data_on;
@@ -49,8 +50,8 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
void kvm_pit_reset(struct kvm_pit *pit);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c74f1..01f15168280 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,50 +30,24 @@
#include "irq.h"
#include <linux/kvm_host.h>
-
-static void pic_lock(struct kvm_pic *s)
- __acquires(&s->lock)
-{
- spin_lock(&s->lock);
-}
-
-static void pic_unlock(struct kvm_pic *s)
- __releases(&s->lock)
-{
- struct kvm *kvm = s->kvm;
- unsigned acks = s->pending_acks;
- bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu;
-
- s->pending_acks = 0;
- s->wakeup_needed = false;
-
- spin_unlock(&s->lock);
-
- while (acks) {
- kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
- __ffs(acks));
- acks &= acks - 1;
- }
-
- if (wakeup) {
- vcpu = s->kvm->vcpus[0];
- if (vcpu)
- kvm_vcpu_kick(vcpu);
- }
-}
+#include "trace.h"
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
s->isr_ack |= (1 << irq);
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
+ spin_lock(&s->lock);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
+ spin_unlock(&s->lock);
}
/*
@@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- pic_lock(s);
+ spin_lock(&s->lock);
pic_update_irq(s);
- pic_unlock(s);
+ spin_unlock(&s->lock);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- pic_lock(s);
+ spin_lock(&s->lock);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
}
- pic_unlock(s);
+ spin_unlock(&s->lock);
return ret;
}
@@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- pic_lock(s);
+ spin_lock(&s->lock);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- pic_unlock(s);
- kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
+ spin_unlock(&s->lock);
return intno;
}
@@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq, irqbase, n;
struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+ struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
if (s == &s->pics_state->pics[0])
irqbase = 0;
@@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
n = irq + irqbase;
- s->pics_state->pending_acks |= 1 << n;
+ kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
}
}
s->last_irr = 0;
@@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
return s->elcr;
}
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
+static int picdev_in_range(gpa_t addr)
{
switch (addr) {
case 0x20:
@@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
}
}
-static void picdev_write(struct kvm_io_device *this,
+static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pic, dev);
+}
+
+static int picdev_write(struct kvm_io_device *this,
gpa_t addr, int len, const void *val)
{
- struct kvm_pic *s = this->private;
+ struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
+ if (!picdev_in_range(addr))
+ return -EOPNOTSUPP;
if (len != 1) {
if (printk_ratelimit())
printk(KERN_ERR "PIC: non byte write\n");
- return;
+ return 0;
}
- pic_lock(s);
+ spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- pic_unlock(s);
+ spin_unlock(&s->lock);
+ return 0;
}
-static void picdev_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *val)
+static int picdev_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
{
- struct kvm_pic *s = this->private;
+ struct kvm_pic *s = to_pic(this);
unsigned char data = 0;
+ if (!picdev_in_range(addr))
+ return -EOPNOTSUPP;
if (len != 1) {
if (printk_ratelimit())
printk(KERN_ERR "PIC: non byte read\n");
- return;
+ return 0;
}
- pic_lock(s);
+ spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- pic_unlock(s);
+ spin_unlock(&s->lock);
+ return 0;
}
/*
@@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this,
static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
- struct kvm_vcpu *vcpu = kvm->vcpus[0];
+ struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- s->wakeup_needed = true;
+ kvm_vcpu_kick(vcpu);
}
}
+static const struct kvm_io_device_ops picdev_ops = {
+ .read = picdev_read,
+ .write = picdev_write,
+};
+
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
{
struct kvm_pic *s;
+ int ret;
+
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
@@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
/*
* Initialize PIO device
*/
- s->dev.read = picdev_read;
- s->dev.write = picdev_write;
- s->dev.in_range = picdev_in_range;
- s->dev.private = s;
- kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+ kvm_iodevice_init(&s->dev, &picdev_ops);
+ ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+ if (ret < 0) {
+ kfree(s);
+ return NULL;
+ }
+
return s;
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9f593188129..7d6058a2fd3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,7 +63,6 @@ struct kvm_kpic_state {
struct kvm_pic {
spinlock_t lock;
- bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819dce7d..7bcc5b6a440 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.pdptrs[index];
+}
+
#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c078d..00000000000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <asm/msr.h>
-
-#include <asm/svm.h>
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
- unsigned long host_cr2;
-
- u32 *msrpm;
- struct vmcb *hsave;
- u64 hsave_msr;
-
- u64 nested_vmcb;
-
- /* These are the merged vectors */
- u32 *nested_msrpm;
-
- /* gpa pointers to the real vectors */
- u64 nested_vmcb_msrpm;
-};
-
-#endif
-
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 26bd6ba74e1..55c7524dda5 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -6,7 +6,7 @@ struct kvm_timer {
bool reinject;
struct kvm_timer_ops *t_ops;
struct kvm *kvm;
- int vcpu_id;
+ struct kvm_vcpu *vcpu;
};
struct kvm_timer_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83f81a..1ae5ceba7eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,8 +32,11 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
+#include <asm/apicdef.h>
#include "kvm_cache_regs.h"
#include "irq.h"
+#include "trace.h"
+#include "x86.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
}
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *feat;
+ u32 v = APIC_VERSION;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+ v |= APIC_LVR_DIRECTED_EOI;
+ apic_set_reg(apic, APIC_LVR, v);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap)
static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
{
+ apic->irr_pending = true;
return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
}
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+static inline int apic_search_irr(struct kvm_lapic *apic)
{
- apic_clear_vector(vec, apic->regs + APIC_IRR);
+ return find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
- result = find_highest_vector(apic->regs + APIC_IRR);
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
return result;
}
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+}
+
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
+ /* This may race with setting of irr in __apic_accept_irq() and
+ * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
+ * will cause vmexit immediately and the value will be recalculated
+ * on the next vmentry.
+ */
if (!apic)
return 0;
highest_irr = apic_find_highest_irr(apic);
return highest_irr;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode);
@@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
{
int result = 0;
- u8 logical_id;
+ u32 logical_id;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_id = apic_get_reg(apic, APIC_LDR);
+ return logical_id & mda;
+ }
logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
@@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
result = !apic_test_and_set_irr(vector, apic);
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector, !result);
if (!result) {
if (trig_mode)
apic_debug("level trig mode repeatedly for "
@@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
+ mutex_lock(&apic->vcpu->kvm->irq_lock);
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ mutex_unlock(&apic->vcpu->kvm->irq_lock);
+ }
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.level = icr_low & APIC_INT_ASSERT;
irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
irq.shorthand = icr_low & APIC_SHORT_MASK;
- irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+ if (apic_x2apic_mode(apic))
+ irq.dest_id = icr_high;
+ else
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
@@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
irq.vector);
+ mutex_lock(&apic->vcpu->kvm->irq_lock);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+ mutex_unlock(&apic->vcpu->kvm->irq_lock);
}
static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
u32 val = 0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
-
if (offset >= LAPIC_MMIO_LENGTH)
return 0;
switch (offset) {
+ case APIC_ID:
+ if (apic_x2apic_mode(apic))
+ val = kvm_apic_id(apic);
+ else
+ val = kvm_apic_id(apic) << 24;
+ break;
case APIC_ARBPRI:
printk(KERN_WARNING "Access APIC ARBPRI register "
"which is for P6\n");
@@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
return val;
}
-static void apic_mmio_read(struct kvm_io_device *this,
- gpa_t address, int len, void *data)
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
unsigned char alignment = offset & 0xf;
u32 result;
+ /* this bitmask has a bit cleared for each reserver register */
+ static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
- printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
- (unsigned long)address, len);
- return;
+ apic_debug("KVM_APIC_READ: alignment error %x %d\n",
+ offset, len);
+ return 1;
}
+
+ if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+ apic_debug("KVM_APIC_READ: read reserved register %x\n",
+ offset);
+ return 1;
+ }
+
result = __apic_read(apic, offset & ~0xf);
+ trace_kvm_apic_read(offset, result);
+
switch (len) {
case 1:
case 2:
@@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this,
"should be 1,2, or 4 instead\n", len);
break;
}
+ return 0;
+}
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return apic_hw_enabled(apic) &&
+ addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ apic_reg_read(apic, offset, len, data);
+
+ return 0;
}
static void update_divide_count(struct kvm_lapic *apic)
@@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic)
if (!apic->lapic_timer.period)
return;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+ apic->lapic_timer.period = NSEC_PER_MSEC/2;
+ }
hrtimer_start(&apic->lapic_timer.timer,
ktime_add_ns(now, apic->lapic_timer.period),
@@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
}
-static void apic_mmio_write(struct kvm_io_device *this,
- gpa_t address, int len, const void *data)
+static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
- unsigned char alignment = offset & 0xf;
- u32 val;
-
- /*
- * APIC register must be aligned on 128-bits boundary.
- * 32/64/128 bits registers must be accessed thru 32 bits.
- * Refer SDM 8.4.1
- */
- if (len != 4 || alignment) {
- /* Don't shout loud, $infamous_os would cause only noise. */
- apic_debug("apic write: bad size=%d %lx\n",
- len, (long)address);
- return;
- }
-
- val = *(u32 *) data;
-
- /* too common printing */
- if (offset != APIC_EOI)
- apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __func__, offset, len, val);
-
- offset &= 0xff0;
+ int ret = 0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
+ trace_kvm_apic_write(reg, val);
- switch (offset) {
+ switch (reg) {
case APIC_ID: /* Local APIC ID */
- apic_set_reg(apic, APIC_ID, val);
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_ID, val);
+ else
+ ret = 1;
break;
case APIC_TASKPRI:
@@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_LDR:
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ else
+ ret = 1;
break;
case APIC_DFR:
- apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ else
+ ret = 1;
break;
- case APIC_SPIV:
- apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_reg(apic, APIC_SPIV, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
u32 lvt_val;
@@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
}
break;
-
+ }
case APIC_ICR:
/* No delay here, so we always clear the pending bit */
apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
@@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_ICR2:
- apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+ if (!apic_x2apic_mode(apic))
+ val &= 0xff000000;
+ apic_set_reg(apic, APIC_ICR2, val);
break;
case APIC_LVT0:
@@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
if (!apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
- val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
- apic_set_reg(apic, offset, val);
+ val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ apic_set_reg(apic, reg, val);
break;
@@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
- return;
+ break;
case APIC_TDCR:
if (val & 4)
@@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this,
update_divide_count(apic);
break;
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0) {
+ printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+ ret = 1;
+ }
+ break;
+
+ case APIC_SELF_IPI:
+ if (apic_x2apic_mode(apic)) {
+ apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ } else
+ ret = 1;
+ break;
default:
- apic_debug("Local APIC Write to read-only register %x\n",
- offset);
+ ret = 1;
break;
}
-
+ if (ret)
+ apic_debug("Local APIC Write to read-only register %x\n", reg);
+ return ret;
}
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
- int len, int size)
+static int apic_mmio_write(struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- int ret = 0;
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
- if (apic_hw_enabled(apic) &&
- (addr >= apic->base_address) &&
- (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
- ret = 1;
+ /*
+ * APIC register must be aligned on 128-bits boundary.
+ * 32/64/128 bits registers must be accessed thru 32 bits.
+ * Refer SDM 8.4.1
+ */
+ if (len != 4 || (offset & 0xf)) {
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+ return 0;
+ }
- return ret;
+ val = *(u32*)data;
+
+ /* too common printing */
+ if (offset != APIC_EOI)
+ apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+ "0x%x\n", __func__, offset, len, val);
+
+ apic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
}
void kvm_free_lapic(struct kvm_vcpu *vcpu)
@@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (apic_get_reg(apic, APIC_TASKPRI) & 4));
}
-EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
@@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
return (tpr & 0xf0) >> 4;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
@@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
vcpu->arch.apic_base = value;
return;
}
- if (apic->vcpu->vcpu_id)
+
+ if (!kvm_vcpu_is_bsp(apic->vcpu))
value &= ~MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base = value;
+ if (apic_x2apic_mode(apic)) {
+ u32 id = kvm_apic_id(apic);
+ u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
+ apic_set_reg(apic, APIC_LDR, ldr);
+ }
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
void kvm_lapic_reset(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+ kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
@@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
+ apic->irr_pending = false;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (vcpu->vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(vcpu))
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
@@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
@@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
{
return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
/*
*----------------------------------------------------------------------
@@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = {
.is_periodic = lapic_is_periodic,
};
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+};
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer.function = kvm_timer_fn;
apic->lapic_timer.t_ops = &lapic_timer_ops;
apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+ apic->lapic_timer.vcpu = vcpu;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
kvm_lapic_reset(vcpu);
- apic->dev.read = apic_mmio_read;
- apic->dev.write = apic_mmio_write;
- apic->dev.in_range = apic_mmio_range;
- apic->dev.private = apic;
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
nomem_free_apic:
@@ -962,7 +1088,6 @@ nomem_free_apic:
nomem:
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
@@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (vcpu->vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(vcpu)) {
if (!apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
@@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
apic->base_address = vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+ kvm_apic_set_version(vcpu);
+
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
@@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
vcpu->arch.apic->vapic_addr = vapic_addr;
}
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (msr == 0x830)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (msr == 0x830)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a587f8349c4..40010b09c4a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,7 @@ struct kvm_lapic {
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool irr_pending;
struct page *regs_page;
void *regs;
gpa_t vapic_addr;
@@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
@@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0ef5bb2b404..eca41ae9f45 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
*/
#include "mmu.h"
+#include "kvm_cache_regs.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644);
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+#define PT32_LVL_OFFSET_MASK(level) \
+ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
#define PT32_INDEX(address, level)\
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644);
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#define PT64_LVL_ADDR_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT32_LVL_ADDR_MASK(level) \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644);
#define PFERR_RSVD_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
+#define PT_PDPE_LEVEL 3
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
@@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
- u64 *shadow_ptes[RMAP_EXT];
+ u64 *sptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
@@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte)
return pte & PT_WRITABLE_MASK;
}
-static int is_dirty_pte(unsigned long pte)
+static int is_dirty_gpte(unsigned long pte)
{
- return pte & shadow_dirty_mask;
+ return pte & PT_DIRTY_MASK;
}
-static int is_rmap_pte(u64 pte)
+static int is_rmap_spte(u64 pte)
{
return is_shadow_present_pte(pte);
}
+static int is_last_spte(u64 pte, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (is_large_pte(pte))
+ return 1;
+ return 0;
+}
+
static pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
-static void set_shadow_pte(u64 *sptep, u64 spte)
+static void __set_spte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
set_64bit((unsigned long *)sptep, spte);
@@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
* Return the pointer to the largepage write count for a given
* gfn, handling slots that are not large page aligned.
*/
-static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+static int *slot_largepage_idx(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
- return &slot->lpage_info[idx].write_count;
+ idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ return &slot->lpage_info[level - 2][idx].write_count;
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
int *write_count;
+ int i;
gfn = unalias_gfn(kvm, gfn);
- write_count = slot_largepage_idx(gfn,
- gfn_to_memslot_unaliased(kvm, gfn));
- *write_count += 1;
+
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ write_count = slot_largepage_idx(gfn, slot, i);
+ *write_count += 1;
+ }
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
int *write_count;
+ int i;
gfn = unalias_gfn(kvm, gfn);
- write_count = slot_largepage_idx(gfn,
- gfn_to_memslot_unaliased(kvm, gfn));
- *write_count -= 1;
- WARN_ON(*write_count < 0);
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ write_count = slot_largepage_idx(gfn, slot, i);
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
+ }
}
-static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+static int has_wrprotected_page(struct kvm *kvm,
+ gfn_t gfn,
+ int level)
{
struct kvm_memory_slot *slot;
int *largepage_idx;
@@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
gfn = unalias_gfn(kvm, gfn);
slot = gfn_to_memslot_unaliased(kvm, gfn);
if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot);
+ largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
}
return 1;
}
-static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
+ unsigned long page_size = PAGE_SIZE;
struct vm_area_struct *vma;
unsigned long addr;
- int ret = 0;
+ int i, ret = 0;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
- return ret;
+ return page_size;
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
- if (vma && is_vm_hugetlb_page(vma))
- ret = 1;
+ if (!vma)
+ goto out;
+
+ page_size = vma_kernel_pagesize(vma);
+
+out:
up_read(&current->mm->mmap_sem);
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+ if (page_size >= KVM_HPAGE_SIZE(i))
+ ret = i;
+ else
+ break;
+ }
+
return ret;
}
-static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
-
- if (has_wrprotected_page(vcpu->kvm, large_gfn))
- return 0;
-
- if (!host_largepage_backed(vcpu->kvm, large_gfn))
- return 0;
+ int host_level;
+ int level = PT_PAGE_TABLE_LEVEL;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
- return 0;
+ return PT_PAGE_TABLE_LEVEL;
- return 1;
+ host_level = host_mapping_level(vcpu->kvm, large_gfn);
+
+ if (host_level == PT_PAGE_TABLE_LEVEL)
+ return host_level;
+
+ for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
+
+ if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+ break;
+ }
+
+ return level - 1;
}
/*
@@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
* Note: gfn must be unaliased before this function get called
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
- if (!lpage)
+ if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+ idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
- return &slot->lpage_info[idx].rmap_pde;
+ return &slot->lpage_info[level - 2][idx].rmap_pde;
}
/*
@@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
* the spte was not added.
*
*/
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
unsigned long *rmapp;
int i, count = 0;
- if (!is_rmap_pte(*spte))
+ if (!is_rmap_spte(*spte))
return count;
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
*rmapp = (unsigned long)spte;
} else if (!(*rmapp & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)*rmapp;
- desc->shadow_ptes[1] = spte;
+ desc->sptes[0] = (u64 *)*rmapp;
+ desc->sptes[1] = spte;
*rmapp = (unsigned long)desc | 1;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
+ while (desc->sptes[RMAP_EXT-1] && desc->more) {
desc = desc->more;
count += RMAP_EXT;
}
- if (desc->shadow_ptes[RMAP_EXT-1]) {
+ if (desc->sptes[RMAP_EXT-1]) {
desc->more = mmu_alloc_rmap_desc(vcpu);
desc = desc->more;
}
- for (i = 0; desc->shadow_ptes[i]; ++i)
+ for (i = 0; desc->sptes[i]; ++i)
;
- desc->shadow_ptes[i] = spte;
+ desc->sptes[i] = spte;
}
return count;
}
@@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
{
int j;
- for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+ for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
;
- desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = NULL;
+ desc->sptes[i] = desc->sptes[j];
+ desc->sptes[j] = NULL;
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *rmapp = (unsigned long)desc->shadow_ptes[0];
+ *rmapp = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
@@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
unsigned long *rmapp;
int i;
- if (!is_rmap_pte(*spte))
+ if (!is_rmap_spte(*spte))
return;
sp = page_header(__pa(spte));
pfn = spte_to_pfn(*spte);
@@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
kvm_release_pfn_dirty(pfn);
else
kvm_release_pfn_clean(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
- if (desc->shadow_ptes[i] == spte) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+ if (desc->sptes[i] == spte) {
rmap_desc_remove_entry(rmapp,
desc, i,
prev_desc);
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
prev_desc = NULL;
prev_spte = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
if (prev_spte == spte)
- return desc->shadow_ptes[i];
- prev_spte = desc->shadow_ptes[i];
+ return desc->sptes[i];
+ prev_spte = desc->sptes[i];
}
desc = desc->more;
}
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
unsigned long *rmapp;
u64 *spte;
- int write_protected = 0;
+ int i, write_protected = 0;
gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn, 0);
+ rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writeble_pte(*spte)) {
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+ __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
}
/* check for huge page mappings */
- rmapp = gfn_to_rmap(kvm, gfn, 1);
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!spte);
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
- pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writeble_pte(*spte)) {
- rmap_remove(kvm, spte);
- --kvm->stat.lpages;
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
- spte = NULL;
- write_protected = 1;
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ rmapp = gfn_to_rmap(kvm, gfn, i);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writeble_pte(*spte)) {
+ rmap_remove(kvm, spte);
+ --kvm->stat.lpages;
+ __set_spte(spte, shadow_trap_nonpresent_pte);
+ spte = NULL;
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
}
- spte = rmap_next(kvm, rmapp, spte);
}
return write_protected;
@@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
rmap_remove(kvm, spte);
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int (*handler)(struct kvm *kvm, unsigned long *rmapp))
{
- int i;
+ int i, j;
int retval = 0;
/*
@@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
retval |= handler(kvm, &memslot->rmap[gfn_offset]);
- retval |= handler(kvm,
- &memslot->lpage_info[
- gfn_offset /
- KVM_PAGES_PER_HPAGE].rmap_pde);
+
+ for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
+ int idx = gfn_offset;
+ idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ retval |= handler(kvm,
+ &memslot->lpage_info[j][idx].rmap_pde);
+ }
}
}
@@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
#define RMAP_RECYCLE_THRESHOLD 1000
-static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
+static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
unsigned long *rmapp;
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(spte));
gfn = unalias_gfn(vcpu->kvm, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp);
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 1;
}
+ trace_kvm_mmu_sync_page(sp);
if (rmap_write_protect(vcpu->kvm, sp->gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- pgprintk("%s: looking gfn %lx role %x\n", __func__,
- gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
@@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
kvm_mmu_mark_parents_unsync(vcpu, sp);
}
- pgprintk("%s: found\n", __func__);
+ trace_kvm_mmu_get_page(sp, false);
return sp;
}
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
- pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
@@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
vcpu->arch.mmu.prefetch_page(vcpu, sp);
else
nonpaging_prefetch_page(vcpu, sp);
+ trace_kvm_mmu_get_page(sp, true);
return sp;
}
@@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
if (iterator->level < PT_PAGE_TABLE_LEVEL)
return false;
+
+ if (iterator->level == PT_PAGE_TABLE_LEVEL)
+ if (is_large_pte(*iterator->sptep))
+ return false;
+
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
@@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
pt = sp->spt;
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (is_shadow_present_pte(pt[i]))
- rmap_remove(kvm, &pt[i]);
- pt[i] = shadow_trap_nonpresent_pte;
- }
- return;
- }
-
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
if (is_shadow_present_pte(ent)) {
- if (!is_large_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(page_header(ent),
&pt[i]);
} else {
- --kvm->stat.lpages;
+ if (is_large_pte(ent))
+ --kvm->stat.lpages;
rmap_remove(kvm, &pt[i]);
}
}
@@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
int i;
+ struct kvm_vcpu *vcpu;
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm->vcpus[i]->arch.last_pte_updated = NULL;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vcpu->arch.last_pte_updated = NULL;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
BUG_ON(!parent_pte);
kvm_mmu_put_page(sp, parent_pte);
- set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+ __set_spte(parent_pte, shadow_trap_nonpresent_pte);
}
}
@@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
int ret;
+
+ trace_kvm_mmu_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
@@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (pt[i] == shadow_notrap_nonpresent_pte)
- set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+ __set_spte(&pt[i], shadow_trap_nonpresent_pte);
}
}
@@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
struct kvm_mmu_page *s;
struct hlist_node *node, *n;
+ trace_kvm_mmu_unsync_page(sp);
index = kvm_page_table_hashfn(sp->gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
/* don't unsync if pagetable is shadowed with multiple roles */
@@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
-static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
- int write_fault, int dirty, int largepage,
+ int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
@@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= shadow_nx_mask;
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
- if (largepage)
+ if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
@@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
spte = shadow_trap_nonpresent_pte;
goto set_pte;
@@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
* is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writeble_pte(*shadow_pte))
+ if (!can_unsync && is_writeble_pte(*sptep))
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- set_shadow_pte(shadow_pte, spte);
+ __set_spte(sptep, spte);
return ret;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
+ int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
+ int was_writeble = is_writeble_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
+ __func__, *sptep, pt_access,
write_fault, user_fault, gfn);
- if (is_rmap_pte(*shadow_pte)) {
+ if (is_rmap_spte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
*/
- if (largepage && !is_large_pte(*shadow_pte)) {
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ !is_large_pte(*sptep)) {
struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
+ u64 pte = *sptep;
child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ mmu_page_remove_parent_pte(child, sptep);
+ } else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
+ spte_to_pfn(*sptep), pfn);
+ rmap_remove(vcpu->kvm, sptep);
} else
was_rmapped = 1;
}
- if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative, true)) {
+
+ if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
+ dirty, level, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
}
- pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- is_large_pte(*shadow_pte)? "2MB" : "4kB",
- is_present_pte(*shadow_pte)?"RW":"R", gfn,
- *shadow_pte, shadow_pte);
- if (!was_rmapped && is_large_pte(*shadow_pte))
+ is_large_pte(*sptep)? "2MB" : "4kB",
+ *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+ *sptep, sptep);
+ if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
- page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
- if (!is_rmap_pte(*shadow_pte))
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (!is_rmap_spte(*sptep))
kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, gfn, largepage);
+ rmap_recycle(vcpu, sptep, gfn);
} else {
if (was_writeble)
kvm_release_pfn_dirty(pfn);
@@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
kvm_release_pfn_clean(pfn);
}
if (speculative) {
- vcpu->arch.last_pte_updated = shadow_pte;
+ vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
}
}
@@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn)
+ int level, gfn_t gfn, pfn_t pfn)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
gfn_t pseudo_gfn;
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == PT_PAGE_TABLE_LEVEL
- || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
+ if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- largepage, gfn, pfn, false);
+ level, gfn, pfn, false);
++vcpu->stat.pf_fixed;
break;
}
@@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return -ENOMEM;
}
- set_shadow_pte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
+ __set_spte(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
}
}
return pt_write;
@@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
- int largepage = 0;
+ int level;
pfn_t pfn;
unsigned long mmu_seq;
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- largepage = 1;
- }
+ level = mapping_level(vcpu, gfn);
+
+ /*
+ * This path builds a PAE pagetable - so we can map 2mb pages at
+ * maximum. Therefore check if the level is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
+ r = __direct_map(vcpu, v, write, level, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int direct = 0;
+ u64 pdptr;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+ pdptr = kvm_pdptr_read(vcpu, i);
+ if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
- root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+ root_gfn = pdptr >> PAGE_SHIFT;
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
if (mmu_check_root(vcpu, root_gfn))
@@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
{
pfn_t pfn;
int r;
- int largepage = 0;
+ int level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
@@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- largepage = 1;
- }
+ level = mapping_level(vcpu, gfn);
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- largepage, gfn, pfn);
+ level, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+ context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 29);
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
@@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
if (r)
goto out;
+ /* set_cr3() should ensure TLB has been flushed */
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
out:
return r;
}
@@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
- is_large_pte(pte))
+ if (is_last_spte(pte, sp->role.level))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
}
}
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
if (is_large_pte(pte))
--vcpu->kvm->stat.lpages;
}
@@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
const void *new)
{
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- if (!vcpu->arch.update_pte.largepage ||
- sp->role.glevels == PT32_ROOT_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
}
++vcpu->kvm->stat.mmu_pte_updated;
@@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 gpte = 0;
pfn_t pfn;
- vcpu->arch.update_pte.largepage = 0;
-
if (bytes != 4 && bytes != 8)
return;
@@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0))
memcpy((void *)&gpte, new, 4);
}
- if (!is_present_pte(gpte))
+ if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
- }
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
+ if (tdp_enabled)
+ return 0;
+
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
spin_lock(&vcpu->kvm->mmu_lock);
@@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+ while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+ !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
@@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
- return 1;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ return 0;
default:
BUG();
}
@@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
ASSERT(vcpu);
- if (vcpu->kvm->arch.n_requested_mmu_pages)
- vcpu->kvm->arch.n_free_mmu_pages =
- vcpu->kvm->arch.n_requested_mmu_pages;
- else
- vcpu->kvm->arch.n_free_mmu_pages =
- vcpu->kvm->arch.n_alloc_mmu_pages;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -3029,6 +3100,24 @@ out:
return r;
}
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int nr_sptes = 0;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ sptes[iterator.level-1] = *iterator.sptep;
+ nr_sptes++;
+ if (!is_shadow_present_pte(*iterator.sptep))
+ break;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
+
#ifdef AUDIT
static const char *audit_msg;
@@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva)
return gva;
}
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(kvm, child, fn);
+ } else
+ fn(kvm, sp, &sp->spt[i]);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ }
+ }
+ return;
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
continue;
va = canonicalize(va);
- if (level > 1) {
- if (ent == shadow_notrap_nonpresent_pte)
- printk(KERN_ERR "audit: (%s) nontrapping pte"
- " in nonleaf level: levels %d gva %lx"
- " level %d pte %llx\n", audit_msg,
- vcpu->arch.mmu.root_level, va, level, ent);
- else
- audit_mappings_page(vcpu, ent, va, level - 1);
- } else {
+ if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
gfn_t gfn = gpa >> PAGE_SHIFT;
pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ continue;
+ }
+
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (d) {
for (k = 0; k < RMAP_EXT; ++k)
- if (d->shadow_ptes[k])
+ if (d->sptes[k])
++nmaps;
else
break;
@@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
return nmaps;
}
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+ if (*sptep & PT_WRITABLE_MASK) {
+ rev_sp = page_header(__pa(sptep));
+ gfn = rev_sp->gfns[sptep - rev_sp->spt];
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+ audit_msg, gfn);
+ printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+ audit_msg, sptep - rev_sp->spt,
+ rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
+ is_large_pte(*sptep));
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+ audit_msg, *sptep);
+ dump_stack();
+ }
+ }
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
{
- int nmaps = 0;
struct kvm_mmu_page *sp;
int i;
@@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
continue;
if (!(ent & PT_WRITABLE_MASK))
continue;
- ++nmaps;
+ inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
}
}
- return nmaps;
+ return;
}
static void audit_rmap(struct kvm_vcpu *vcpu)
{
- int n_rmap = count_rmaps(vcpu);
- int n_actual = count_writable_mappings(vcpu);
-
- if (n_rmap != n_actual)
- printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __func__, audit_msg, n_rmap, n_actual);
+ check_writable_mappings_rmap(vcpu);
+ count_rmaps(vcpu);
}
static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
+ u64 *spte;
gfn_t gfn;
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
if (sp->role.direct)
continue;
+ if (sp->unsync)
+ continue;
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
- if (*rmapp)
- printk(KERN_ERR "%s: (%s) shadow page has writable"
- " mappings: gfn %lx role %x\n",
+
+ spte = rmap_next(vcpu->kvm, rmapp, NULL);
+ while (spte) {
+ if (*spte & PT_WRITABLE_MASK)
+ printk(KERN_ERR "%s: (%s) shadow page has "
+ "writable mappings: gfn %lx role %x\n",
__func__, audit_msg, sp->gfn,
sp->role.word);
+ spte = rmap_next(vcpu->kvm, rmapp, spte);
+ }
}
}
@@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
audit_msg = msg;
audit_rmap(vcpu);
audit_write_protection(vcpu);
- audit_mappings(vcpu);
+ if (strcmp("pre pte write", audit_msg) != 0)
+ audit_mappings(vcpu);
+ audit_writable_sptes_have_rmaps(vcpu);
dbg = olddbg;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2fb136..61a1b3884b4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,8 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu->arch.cr0 & X86_CR0_PG;
}
-static inline int is_present_pte(unsigned long pte)
+static inline int is_present_gpte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 00000000000..3e4a5c6ca2a
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,220 @@
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/ftrace_event.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE mmutrace
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(__u32, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *ret = p->buffer + p->len; \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
+ " %snxe root %u %s%c", \
+ __entry->gfn, role.level, role.glevels, \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.cr4_pge ? "" : "!", \
+ role.nxe ? "" : "!", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ ret; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
+ TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
+ | (!!fetch_fault << 4);
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+/* We set a pte accessed bit */
+TRACE_EVENT(
+ kvm_mmu_set_accessed_bit,
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte dirty bit */
+TRACE_EVENT(
+ kvm_mmu_set_dirty_bit,
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+TRACE_EVENT(
+ kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+TRACE_EVENT(
+ kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+TRACE_EVENT(
+ kvm_mmu_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67785f63539..d2fec9c12d2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -27,7 +27,8 @@
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
@@ -43,7 +44,8 @@
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
@@ -53,8 +55,8 @@
#error Invalid PTTYPE value
#endif
-#define gpte_to_gfn FNAME(gpte_to_gfn)
-#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
/*
* The guest_walker structure emulates the behavior of the hardware page
@@ -71,14 +73,9 @@ struct guest_walker {
u32 error_code;
};
-static gfn_t gpte_to_gfn(pt_element_t gpte)
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
- return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
-{
- return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
@@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
gpa_t pte_gpa;
int rsvd_fault = 0;
- pgprintk("%s: addr %lx\n", __func__, addr);
+ trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
+ fetch_fault);
walk:
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
- pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
- if (!is_present_pte(pte))
+ pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!is_present_gpte(pte))
goto not_present;
--walker->level;
}
@@ -150,12 +149,11 @@ walk:
pte_gpa += index * sizeof(pt_element_t);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- pgprintk("%s: table_gfn[%d] %lx\n", __func__,
- walker->level - 1, table_gfn);
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+ trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_pte(pte))
+ if (!is_present_gpte(pte))
goto not_present;
rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
@@ -175,6 +173,8 @@ walk:
#endif
if (!(pte & PT_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index,
+ sizeof(pte));
mark_page_dirty(vcpu->kvm, table_gfn);
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
@@ -186,18 +186,24 @@ walk:
walker->ptes[walker->level - 1] = pte;
- if (walker->level == PT_PAGE_TABLE_LEVEL) {
- walker->gfn = gpte_to_gfn(pte);
- break;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL
- && (pte & PT_PAGE_SIZE_MASK)
- && (PTTYPE == 64 || is_pse(vcpu))) {
- walker->gfn = gpte_to_gfn_pde(pte);
- walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
- if (PTTYPE == 32 && is_cpuid_PSE36())
+ if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
+ ((walker->level == PT_DIRECTORY_LEVEL) &&
+ (pte & PT_PAGE_SIZE_MASK) &&
+ (PTTYPE == 64 || is_pse(vcpu))) ||
+ ((walker->level == PT_PDPE_LEVEL) &&
+ (pte & PT_PAGE_SIZE_MASK) &&
+ is_long_mode(vcpu))) {
+ int lvl = walker->level;
+
+ walker->gfn = gpte_to_gfn_lvl(pte, lvl);
+ walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
+ >> PAGE_SHIFT;
+
+ if (PTTYPE == 32 &&
+ walker->level == PT_DIRECTORY_LEVEL &&
+ is_cpuid_PSE36())
walker->gfn += pse36_gfn_delta(pte);
+
break;
}
@@ -205,9 +211,10 @@ walk:
--walker->level;
}
- if (write_fault && !is_dirty_pte(pte)) {
+ if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
@@ -239,6 +246,7 @@ err:
walker->error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
walker->error_code |= PFERR_RSVD_MASK;
+ trace_kvm_mmu_walker_error(walker->error_code);
return 0;
}
@@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- int largepage = vcpu->arch.update_pte.largepage;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_pte(gpte))
- set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+ if (!is_present_gpte(gpte))
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, largepage,
+ gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true);
}
@@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
- int user_fault, int write_fault, int largepage,
+ int user_fault, int write_fault, int hlevel,
int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
@@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
pt_element_t curr_pte;
struct kvm_shadow_walk_iterator iterator;
- if (!is_present_pte(gw->ptes[gw->level - 1]))
+ if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
for_each_shadow_entry(vcpu, addr, iterator) {
level = iterator.level;
sptep = iterator.sptep;
- if (level == PT_PAGE_TABLE_LEVEL
- || (largepage && level == PT_DIRECTORY_LEVEL)) {
+ if (iterator.level == hlevel) {
mmu_set_spte(vcpu, sptep, access,
gw->pte_access & access,
user_fault, write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, largepage,
+ ptwrite, level,
gw->gfn, pfn, false);
break;
}
@@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (is_large_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
}
- if (level == PT_DIRECTORY_LEVEL
- && gw->level == PT_DIRECTORY_LEVEL) {
+ if (level <= gw->level) {
+ int delta = level - gw->level + 1;
direct = 1;
- if (!is_dirty_pte(gw->ptes[level - 1]))
+ if (!is_dirty_gpte(gw->ptes[level - delta]))
access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+ table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
+ /* advance table_gfn when emulating 1gb pages with 4k */
+ if (delta == 0)
+ table_gfn += PT_INDEX(addr, level);
} else {
direct = 0;
table_gfn = gw->table_gfn[level - 2];
@@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int user_fault = error_code & PFERR_USER_MASK;
int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
- u64 *shadow_pte;
+ u64 *sptep;
int write_pt = 0;
int r;
pfn_t pfn;
- int largepage = 0;
+ int level = PT_PAGE_TABLE_LEVEL;
unsigned long mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
- if (walker.level == PT_DIRECTORY_LEVEL) {
- gfn_t large_gfn;
- large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
- if (is_largepage_backed(vcpu, large_gfn)) {
- walker.gfn = large_gfn;
- largepage = 1;
- }
+ if (walker.level >= PT_DIRECTORY_LEVEL) {
+ level = min(walker.level, mapping_level(vcpu, walker.gfn));
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
@@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- largepage, &write_pt, pfn);
-
+ sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ level, &write_pt, pfn);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- shadow_pte, *shadow_pte, write_pt);
+ sptep, *sptep, write_pt);
if (!write_pt)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
sptep = iterator.sptep;
/* FIXME: properly handle invlpg on large guest pages */
- if (level == PT_PAGE_TABLE_LEVEL ||
- ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+ if (level == PT_PAGE_TABLE_LEVEL ||
+ ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
+ ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
struct kvm_mmu_page *sp = page_header(__pa(sptep));
pte_gpa = (sp->gfn << PAGE_SHIFT);
@@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
--vcpu->kvm->stat.lpages;
need_flush = 1;
}
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
@@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return;
- if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+ if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
if (mmu_topup_memory_caches(vcpu))
return;
kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
@@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_pte(pt[j]))
+ if (r || is_present_gpte(pt[j]))
sp->spt[i+j] = shadow_trap_nonpresent_pte;
else
sp->spt[i+j] = shadow_notrap_nonpresent_pte;
@@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+ if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
!(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_pte(gpte))
+ if (is_present_gpte(gpte))
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- set_shadow_pte(&sp->spt[i], nonpresent);
+ __set_spte(&sp->spt[i], nonpresent);
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gfn,
+ is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
@@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
-#undef gpte_to_gfn_pde
+#undef gpte_to_gfn_lvl
#undef CMPXCHG
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b1f658ad2f0..944cc9c04b3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
*/
#include <linux/kvm_host.h>
-#include "kvm_svm.h"
#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
@@ -26,10 +25,12 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/ftrace_event.h>
#include <asm/desc.h>
#include <asm/virtext.h>
+#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -46,6 +47,10 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
/* Turn on to get debugging output*/
@@ -57,6 +62,58 @@ MODULE_LICENSE("GPL");
#define nsvm_printk(fmt, args...) do {} while(0)
#endif
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_vcpu;
+
+struct nested_state {
+ struct vmcb *hsave;
+ u64 hsave_msr;
+ u64 vmcb;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 vmcb_msrpm;
+
+ /* cache for intercepts of the guest */
+ u16 intercept_cr_read;
+ u16 intercept_cr_write;
+ u16 intercept_dr_read;
+ u16 intercept_dr_write;
+ u32 intercept_exceptions;
+ u64 intercept;
+
+};
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ u64 host_gs_base;
+
+ u32 *msrpm;
+
+ struct nested_state nested;
+};
+
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
@@ -67,15 +124,14 @@ static int npt = 1;
module_param(npt, int, S_IRUGO);
-static int nested = 0;
+static int nested = 1;
module_param(nested, int, S_IRUGO);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
@@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
static inline bool is_nested(struct vcpu_svm *svm)
{
- return svm->nested_vmcb;
+ return svm->nested.vmcb;
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
static unsigned long iopm_base;
@@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
}
-static inline unsigned long kvm_read_cr2(void)
-{
- unsigned long cr2;
-
- asm volatile ("mov %%cr2, %0" : "=r" (cr2));
- return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
- asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
static inline void force_new_asid(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->asid_generation--;
@@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage)
struct svm_cpu_data *svm_data;
uint64_t efer;
- struct desc_ptr gdt_descr;
+ struct descriptor_table gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage)
svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
svm_data->next_asid = svm_data->max_asid + 1;
- asm volatile ("sgdt %0" : "=m"(gdt_descr));
- gdt = (struct desc_struct *)gdt_descr.address;
+ kvm_get_gdt(&gdt_descr);
+ gdt = (struct desc_struct *)gdt_descr.base;
svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
rdmsrl(MSR_EFER, efer);
@@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
#endif
set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
}
static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm)
}
force_new_asid(&svm->vcpu);
- svm->nested_vmcb = 0;
- svm->vcpu.arch.hflags = HF_GIF_MASK;
+ svm->nested.vmcb = 0;
+ svm->vcpu.arch.hflags = 0;
+
+ enable_gif(svm);
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcb(svm);
- if (vcpu->vcpu_id != 0) {
+ if (!kvm_vcpu_is_bsp(vcpu)) {
kvm_rip_write(vcpu, 0);
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
hsave_page = alloc_page(GFP_KERNEL);
if (!hsave_page)
goto uninit;
- svm->hsave = page_address(hsave_page);
+ svm->nested.hsave = page_address(hsave_page);
- svm->nested_msrpm = page_address(nested_msrpm_pages);
+ svm->nested.msrpm = page_address(nested_msrpm_pages);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
@@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (svm->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
@@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->hsave));
- __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(virt_to_page(svm->nested.hsave));
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
@@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ BUG_ON(!npt_enabled);
+ load_pdptrs(vcpu, vcpu->arch.cr3);
+ break;
+ default:
+ BUG();
+ }
+}
+
static void svm_set_vintr(struct vcpu_svm *svm)
{
svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
@@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
val = 0;
}
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
return val;
}
@@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
{
struct vcpu_svm *svm = to_svm(vcpu);
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
-
*exception = 0;
switch (dr) {
@@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
- if (!npt_enabled)
- KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- else
- KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- /*
- * FIXME: Tis shouldn't be necessary here, but there is a flush
- * missing in the MMU code. Until we find this bug, flush the
- * complete TLB here on an NPF
- */
- if (npt_enabled)
- svm_flush_tlb(&svm->vcpu);
- else {
- if (kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- }
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
@@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- KVMTRACE_0D(NMI, &svm->vcpu, handler);
return 1;
}
static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
++svm->vcpu.stat.irq_exits;
- KVMTRACE_0D(INTR, &svm->vcpu, handler);
return 1;
}
@@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code)
{
- if (is_nested(svm)) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = error_code;
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- if (nested_svm_exit_handled(svm, false)) {
- nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
-
- nested_svm_vmexit(svm);
- return 1;
- }
- }
+ if (!is_nested(svm))
+ return 0;
- return 0;
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+ return nested_svm_exit_handled(svm);
}
static inline int nested_svm_intr(struct vcpu_svm *svm)
{
- if (is_nested(svm)) {
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return 0;
+ if (!is_nested(svm))
+ return 0;
- if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return 0;
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ return 0;
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+ return 0;
- if (nested_svm_exit_handled(svm, false)) {
- nsvm_printk("VMexit -> INTR\n");
- nested_svm_vmexit(svm);
- return 1;
- }
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+
+ if (nested_svm_exit_handled(svm)) {
+ nsvm_printk("VMexit -> INTR\n");
+ return 1;
}
return 0;
}
-static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
{
struct page *page;
@@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
- if (is_error_page(page)) {
- printk(KERN_INFO "%s: could not find page at 0x%llx\n",
- __func__, gpa);
- kvm_release_page_clean(page);
- kvm_inject_gp(&svm->vcpu, 0);
- return NULL;
- }
- return page;
+ if (is_error_page(page))
+ goto error;
+
+ return kmap_atomic(page, idx);
+
+error:
+ kvm_release_page_clean(page);
+ kvm_inject_gp(&svm->vcpu, 0);
+
+ return NULL;
}
-static int nested_svm_do(struct vcpu_svm *svm,
- u64 arg1_gpa, u64 arg2_gpa, void *opaque,
- int (*handler)(struct vcpu_svm *svm,
- void *arg1,
- void *arg2,
- void *opaque))
+static void nested_svm_unmap(void *addr, enum km_type idx)
{
- struct page *arg1_page;
- struct page *arg2_page = NULL;
- void *arg1;
- void *arg2 = NULL;
- int retval;
+ struct page *page;
- arg1_page = nested_svm_get_page(svm, arg1_gpa);
- if(arg1_page == NULL)
- return 1;
+ if (!addr)
+ return;
- if (arg2_gpa) {
- arg2_page = nested_svm_get_page(svm, arg2_gpa);
- if(arg2_page == NULL) {
- kvm_release_page_clean(arg1_page);
- return 1;
- }
- }
+ page = kmap_atomic_to_page(addr);
+
+ kunmap_atomic(addr, idx);
+ kvm_release_page_dirty(page);
+}
+
+static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 param = svm->vmcb->control.exit_info_1 & 1;
+ u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ bool ret = false;
+ u32 t0, t1;
+ u8 *msrpm;
- arg1 = kmap_atomic(arg1_page, KM_USER0);
- if (arg2_gpa)
- arg2 = kmap_atomic(arg2_page, KM_USER1);
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return false;
- retval = handler(svm, arg1, arg2, opaque);
+ msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+
+ if (!msrpm)
+ goto out;
+
+ switch (msr) {
+ case 0 ... 0x1fff:
+ t0 = (msr * 2) % 8;
+ t1 = msr / 8;
+ break;
+ case 0xc0000000 ... 0xc0001fff:
+ t0 = (8192 + msr - 0xc0000000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ case 0xc0010000 ... 0xc0011fff:
+ t0 = (16384 + msr - 0xc0010000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ default:
+ ret = true;
+ goto out;
+ }
- kunmap_atomic(arg1, KM_USER0);
- if (arg2_gpa)
- kunmap_atomic(arg2, KM_USER1);
+ ret = msrpm[t1] & ((1 << param) << t0);
- kvm_release_page_dirty(arg1_page);
- if (arg2_gpa)
- kvm_release_page_dirty(arg2_page);
+out:
+ nested_svm_unmap(msrpm, KM_USER0);
- return retval;
+ return ret;
}
-static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
- void *arg1,
- void *arg2,
- void *opaque)
+static int nested_svm_exit_special(struct vcpu_svm *svm)
{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- bool kvm_overrides = *(bool *)opaque;
u32 exit_code = svm->vmcb->control.exit_code;
- if (kvm_overrides) {
- switch (exit_code) {
- case SVM_EXIT_INTR:
- case SVM_EXIT_NMI:
- return 0;
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ return NESTED_EXIT_HOST;
/* For now we are always handling NPFs when using them */
- case SVM_EXIT_NPF:
- if (npt_enabled)
- return 0;
- break;
- /* When we're shadowing, trap PFs */
- case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- if (!npt_enabled)
- return 0;
- break;
- default:
- break;
- }
+ case SVM_EXIT_NPF:
+ if (npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ /* When we're shadowing, trap PFs */
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ if (!npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
}
+ return NESTED_EXIT_CONTINUE;
+}
+
+/*
+ * If this function returns true, this #vmexit was already handled
+ */
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (nested_vmcb->control.intercept_cr_read & cr_bits)
- return 1;
+ if (svm->nested.intercept_cr_read & cr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (nested_vmcb->control.intercept_cr_write & cr_bits)
- return 1;
+ if (svm->nested.intercept_cr_write & cr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (nested_vmcb->control.intercept_dr_read & dr_bits)
- return 1;
+ if (svm->nested.intercept_dr_read & dr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (nested_vmcb->control.intercept_dr_write & dr_bits)
- return 1;
+ if (svm->nested.intercept_dr_write & dr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (nested_vmcb->control.intercept_exceptions & excp_bits)
- return 1;
+ if (svm->nested.intercept_exceptions & excp_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
default: {
u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
nsvm_printk("exit code: 0x%x\n", exit_code);
- if (nested_vmcb->control.intercept & exit_bits)
- return 1;
+ if (svm->nested.intercept & exit_bits)
+ vmexit = NESTED_EXIT_DONE;
}
}
- return 0;
-}
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
- void *arg1, void *arg2,
- void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- u8 *msrpm = (u8 *)arg2;
- u32 t0, t1;
- u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u32 param = svm->vmcb->control.exit_info_1 & 1;
-
- if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return 0;
-
- switch(msr) {
- case 0 ... 0x1fff:
- t0 = (msr * 2) % 8;
- t1 = msr / 8;
- break;
- case 0xc0000000 ... 0xc0001fff:
- t0 = (8192 + msr - 0xc0000000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- case 0xc0010000 ... 0xc0011fff:
- t0 = (16384 + msr - 0xc0010000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- default:
- return 1;
- break;
+ if (vmexit == NESTED_EXIT_DONE) {
+ nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
+ nested_svm_vmexit(svm);
}
- if (msrpm[t1] & ((1 << param) << t0))
- return 1;
- return 0;
+ return vmexit;
+}
+
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+ struct vmcb_control_area *dst = &dst_vmcb->control;
+ struct vmcb_control_area *from = &from_vmcb->control;
+
+ dst->intercept_cr_read = from->intercept_cr_read;
+ dst->intercept_cr_write = from->intercept_cr_write;
+ dst->intercept_dr_read = from->intercept_dr_read;
+ dst->intercept_dr_write = from->intercept_dr_write;
+ dst->intercept_exceptions = from->intercept_exceptions;
+ dst->intercept = from->intercept;
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->lbr_ctl = from->lbr_ctl;
}
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
+static int nested_svm_vmexit(struct vcpu_svm *svm)
{
- bool k = kvm_override;
-
- switch (svm->vmcb->control.exit_code) {
- case SVM_EXIT_MSR:
- return nested_svm_do(svm, svm->nested_vmcb,
- svm->nested_vmcb_msrpm, NULL,
- nested_svm_exit_handled_msr);
- default: break;
- }
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
- return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
- nested_svm_exit_handled_real);
-}
-
-static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- struct vmcb *hsave = svm->hsave;
- u64 nested_save[] = { nested_vmcb->save.cr0,
- nested_vmcb->save.cr3,
- nested_vmcb->save.cr4,
- nested_vmcb->save.efer,
- nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
- nested_vmcb->control.intercept_dr_read,
- nested_vmcb->control.intercept_dr_write,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept,
- nested_vmcb->control.msrpm_base_pa,
- nested_vmcb->control.iopm_base_pa,
- nested_vmcb->control.tsc_offset };
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+ if (!nested_vmcb)
+ return 1;
/* Give the current vmcb to the guest */
- memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
- nested_vmcb->save.cr0 = nested_save[0];
- if (!npt_enabled)
- nested_vmcb->save.cr3 = nested_save[1];
- nested_vmcb->save.cr4 = nested_save[2];
- nested_vmcb->save.efer = nested_save[3];
- nested_vmcb->control.intercept_cr_read = nested_save[4];
- nested_vmcb->control.intercept_cr_write = nested_save[5];
- nested_vmcb->control.intercept_dr_read = nested_save[6];
- nested_vmcb->control.intercept_dr_write = nested_save[7];
- nested_vmcb->control.intercept_exceptions = nested_save[8];
- nested_vmcb->control.intercept = nested_save[9];
- nested_vmcb->control.msrpm_base_pa = nested_save[10];
- nested_vmcb->control.iopm_base_pa = nested_save[11];
- nested_vmcb->control.tsc_offset = nested_save[12];
+ disable_gif(svm);
+
+ nested_vmcb->save.es = vmcb->save.es;
+ nested_vmcb->save.cs = vmcb->save.cs;
+ nested_vmcb->save.ss = vmcb->save.ss;
+ nested_vmcb->save.ds = vmcb->save.ds;
+ nested_vmcb->save.gdtr = vmcb->save.gdtr;
+ nested_vmcb->save.idtr = vmcb->save.idtr;
+ if (npt_enabled)
+ nested_vmcb->save.cr3 = vmcb->save.cr3;
+ nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.rflags = vmcb->save.rflags;
+ nested_vmcb->save.rip = vmcb->save.rip;
+ nested_vmcb->save.rsp = vmcb->save.rsp;
+ nested_vmcb->save.rax = vmcb->save.rax;
+ nested_vmcb->save.dr7 = vmcb->save.dr7;
+ nested_vmcb->save.dr6 = vmcb->save.dr6;
+ nested_vmcb->save.cpl = vmcb->save.cpl;
+
+ nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
+ nested_vmcb->control.int_vector = vmcb->control.int_vector;
+ nested_vmcb->control.int_state = vmcb->control.int_state;
+ nested_vmcb->control.exit_code = vmcb->control.exit_code;
+ nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
+ nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
+ nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
+ nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+ nested_vmcb->control.tlb_ctl = 0;
+ nested_vmcb->control.event_inj = 0;
+ nested_vmcb->control.event_inj_err = 0;
/* We always set V_INTR_MASKING and remember the old value in hflags */
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
- if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
- (nested_vmcb->control.int_vector)) {
- nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
- nested_vmcb->control.int_vector);
- }
-
/* Restore the original control entries */
- svm->vmcb->control = hsave->control;
+ copy_vmcb_control_area(vmcb, hsave);
/* Kill any pending exceptions */
if (svm->vcpu.arch.exception.pending == true)
nsvm_printk("WARNING: Pending Exception\n");
- svm->vcpu.arch.exception.pending = false;
+
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
/* Restore selected save entries */
svm->vmcb->save.es = hsave->save.es;
@@ -1603,19 +1657,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
/* Exit nested SVM mode */
- svm->nested_vmcb = 0;
+ svm->nested.vmcb = 0;
- return 0;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
- nsvm_printk("VMexit\n");
- if (nested_svm_do(svm, svm->nested_vmcb, 0,
- NULL, nested_svm_vmexit_real))
- return 1;
+ nested_svm_unmap(nested_vmcb, KM_USER0);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -1623,38 +1668,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
return 0;
}
-static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
+ u32 *nested_msrpm;
int i;
- u32 *nested_msrpm = (u32*)arg1;
+
+ nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+ if (!nested_msrpm)
+ return false;
+
for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
+ svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
- return 0;
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+
+ nested_svm_unmap(nested_msrpm, KM_USER0);
+
+ return true;
}
-static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- struct vmcb *hsave = svm->hsave;
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ if (!nested_vmcb)
+ return false;
/* nested_vmcb is our indicator if nested SVM is activated */
- svm->nested_vmcb = svm->vmcb->save.rax;
+ svm->nested.vmcb = svm->vmcb->save.rax;
/* Clear internal status */
- svm->vcpu.arch.exception.pending = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
/* Save the old vmcb, so we don't need to pick what we save, but
can restore everything when a VMEXIT occurs */
- memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
- /* We need to remember the original CR3 in the SPT case */
- if (!npt_enabled)
- hsave->save.cr3 = svm->vcpu.arch.cr3;
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rip = svm->next_rip;
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.shadow_efer;
+ hsave->save.cr0 = svm->vcpu.arch.cr0;
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = vmcb->save.rflags;
+ hsave->save.rip = svm->next_rip;
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = svm->vcpu.arch.cr3;
+
+ copy_vmcb_control_area(hsave, vmcb);
if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -1679,7 +1749,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
kvm_mmu_reset_context(&svm->vcpu);
}
- svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
@@ -1706,7 +1776,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
- svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+
+ /* cache intercepts */
+ svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
+ svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
+ svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
+ svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
+ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+ svm->nested.intercept = nested_vmcb->control.intercept;
force_new_asid(&svm->vcpu);
svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
@@ -1734,12 +1812,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ nested_svm_unmap(nested_vmcb, KM_USER0);
- return 0;
+ enable_gif(svm);
+
+ return true;
}
-static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
to_vmcb->save.fs = from_vmcb->save.fs;
to_vmcb->save.gs = from_vmcb->save.gs;
@@ -1753,44 +1833,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-
- return 1;
-}
-
-static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque)
-{
- return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
-}
-
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque)
-{
- return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
}
static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
+ struct vmcb *nested_vmcb;
+
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ if (!nested_vmcb)
+ return 1;
+
+ nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+ nested_svm_unmap(nested_vmcb, KM_USER0);
return 1;
}
static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
+ struct vmcb *nested_vmcb;
+
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ if (!nested_vmcb)
+ return 1;
+
+ nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+ nested_svm_unmap(nested_vmcb, KM_USER0);
return 1;
}
@@ -1798,19 +1878,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
nsvm_printk("VMrun\n");
+
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
- NULL, nested_svm_vmrun))
+ if (!nested_svm_vmrun(svm))
return 1;
- if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
- NULL, nested_svm_vmrun_msrpm))
- return 1;
+ if (!nested_svm_vmrun_msrpm(svm))
+ goto failed;
+
+ return 1;
+
+failed:
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
return 1;
}
@@ -1823,7 +1913,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ enable_gif(svm);
return 1;
}
@@ -1836,7 +1926,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ disable_gif(svm);
/* After a CLGI no interrupts should come */
svm_clear_vintr(svm);
@@ -1845,6 +1935,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ nsvm_printk("INVLPGA\n");
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1953,7 +2056,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
+ case MSR_IA32_TSC: {
u64 tsc;
rdtscll(tsc);
@@ -1981,10 +2084,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->vmcb->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- *data = svm->vmcb->save.sysenter_eip;
+ *data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
- *data = svm->vmcb->save.sysenter_esp;
+ *data = svm->sysenter_esp;
break;
/* Nobody will change the following 5 values in the VMCB so
we can safely return them on rdmsr. They will always be 0
@@ -2005,7 +2108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
- *data = svm->hsave_msr;
+ *data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
*data = 0;
@@ -2027,8 +2130,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (svm_get_msr(&svm->vcpu, ecx, &data))
kvm_inject_gp(&svm->vcpu, 0);
else {
- KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
- (u32)(data >> 32), handler);
+ trace_kvm_msr_read(ecx, data);
svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
@@ -2043,7 +2145,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
+ case MSR_IA32_TSC: {
u64 tsc;
rdtscll(tsc);
@@ -2071,9 +2173,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_cs = data;
break;
case MSR_IA32_SYSENTER_EIP:
+ svm->sysenter_eip = data;
svm->vmcb->save.sysenter_eip = data;
break;
case MSR_IA32_SYSENTER_ESP:
+ svm->sysenter_esp = data;
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
@@ -2091,24 +2195,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
else
svm_disable_lbrv(svm);
break;
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
-
- break;
case MSR_VM_HSAVE_PA:
- svm->hsave_msr = data;
+ svm->nested.hsave_msr = data;
+ break;
+ case MSR_VM_CR:
+ case MSR_VM_IGNNE:
+ pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
default:
return kvm_set_msr_common(vcpu, ecx, data);
@@ -2122,8 +2214,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_write(ecx, data);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
@@ -2144,8 +2235,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int interrupt_window_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
-
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
/*
@@ -2201,7 +2290,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
- [SVM_EXIT_INVLPGA] = invalid_op_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
@@ -2224,20 +2313,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u32 exit_code = svm->vmcb->control.exit_code;
- KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
- (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
+ trace_kvm_exit(exit_code, svm->vmcb->save.rip);
if (is_nested(svm)) {
+ int vmexit;
+
nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
exit_code, svm->vmcb->control.exit_info_1,
svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
- if (nested_svm_exit_handled(svm, true)) {
- nested_svm_vmexit(svm);
- nsvm_printk("-> #VMEXIT\n");
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
return 1;
- }
}
+ svm_complete_interrupts(svm);
+
if (npt_enabled) {
int mmu_reload = 0;
if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -2246,12 +2341,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
vcpu->arch.cr0 = svm->vmcb->save.cr0;
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- }
if (mmu_reload) {
kvm_mmu_reset_context(vcpu);
kvm_mmu_load(vcpu);
@@ -2319,7 +2408,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
- KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
+ trace_kvm_inj_virq(irq);
++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
@@ -2329,21 +2418,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
}
-static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
static void svm_set_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
+ BUG_ON(!(gif_set(svm)));
- svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2371,13 +2453,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
struct vmcb *vmcb = svm->vmcb;
return (vmcb->save.rflags & X86_EFLAGS_IF) &&
!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ gif_set(svm) &&
+ !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- svm_set_vintr(to_svm(vcpu));
- svm_inject_irq(to_svm(vcpu), 0x0);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_printk("Trying to open IRQ window\n");
+
+ nested_svm_intr(svm);
+
+ /* In case GIF=0 we can't rely on the CPU to tell us when
+ * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
+ * The next time we get that intercept, this function will be
+ * called again though and we'll get the vintr intercept. */
+ if (gif_set(svm)) {
+ svm_set_vintr(svm);
+ svm_inject_irq(svm, 0x0);
+ }
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -2456,6 +2550,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
case SVM_EXITINTINFO_TYPE_EXEPT:
/* In case of software exception do not reinject an exception
vector, but re-execute and instruction instead */
+ if (is_nested(svm))
+ break;
if (kvm_exception_is_soft(vector))
break;
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
@@ -2498,9 +2594,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
fs_selector = kvm_read_fs();
gs_selector = kvm_read_gs();
ldt_selector = kvm_read_ldt();
- svm->host_cr2 = kvm_read_cr2();
- if (!is_nested(svm))
- svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
/* required for live migration with NPT */
if (npt_enabled)
svm->vmcb->save.cr3 = vcpu->arch.cr3;
@@ -2585,8 +2679,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- kvm_write_cr2(svm->host_cr2);
-
kvm_load_fs(fs_selector);
kvm_load_gs(gs_selector);
kvm_load_ldt(ldt_selector);
@@ -2602,7 +2694,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->next_rip = 0;
- svm_complete_interrupts(svm);
+ if (npt_enabled) {
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
}
#undef R
@@ -2673,6 +2768,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
+static const struct trace_print_flags svm_exit_reasons_str[] = {
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
+ { SVM_EXIT_INTR, "interrupt" },
+ { SVM_EXIT_NMI, "nmi" },
+ { SVM_EXIT_SMI, "smi" },
+ { SVM_EXIT_INIT, "init" },
+ { SVM_EXIT_VINTR, "vintr" },
+ { SVM_EXIT_CPUID, "cpuid" },
+ { SVM_EXIT_INVD, "invd" },
+ { SVM_EXIT_HLT, "hlt" },
+ { SVM_EXIT_INVLPG, "invlpg" },
+ { SVM_EXIT_INVLPGA, "invlpga" },
+ { SVM_EXIT_IOIO, "io" },
+ { SVM_EXIT_MSR, "msr" },
+ { SVM_EXIT_TASK_SWITCH, "task_switch" },
+ { SVM_EXIT_SHUTDOWN, "shutdown" },
+ { SVM_EXIT_VMRUN, "vmrun" },
+ { SVM_EXIT_VMMCALL, "hypercall" },
+ { SVM_EXIT_VMLOAD, "vmload" },
+ { SVM_EXIT_VMSAVE, "vmsave" },
+ { SVM_EXIT_STGI, "stgi" },
+ { SVM_EXIT_CLGI, "clgi" },
+ { SVM_EXIT_SKINIT, "skinit" },
+ { SVM_EXIT_WBINVD, "wbinvd" },
+ { SVM_EXIT_MONITOR, "monitor" },
+ { SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_NPF, "npf" },
+ { -1, NULL }
+};
+
+static bool svm_gb_page_enable(void)
+{
+ return true;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -2710,6 +2863,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt,
.get_dr = svm_get_dr,
.set_dr = svm_set_dr,
+ .cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
@@ -2733,6 +2887,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
+
+ .exit_reasons_str = svm_exit_reasons_str,
+ .gb_page_enable = svm_gb_page_enable,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 86dbac072d0..eea40439066 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
int restart_timer = 0;
wait_queue_head_t *q = &vcpu->wq;
- /* FIXME: this code should not know anything about vcpus */
- if (!atomic_inc_and_test(&ktimer->pending))
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially loosing timer events in the !reinject
+ * case anyway.
+ */
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-
- if (!ktimer->reinject)
- atomic_set(&ktimer->pending, 1);
+ }
if (waitqueue_active(q))
wake_up_interruptible(q);
@@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
struct kvm_vcpu *vcpu;
struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+ vcpu = ktimer->vcpu;
if (!vcpu)
return HRTIMER_NORESTART;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 00000000000..0d480e77eac
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,355 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %u", __entry->vcpu_id)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for PIO.
+ */
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count),
+ TP_ARGS(rw, port, size, count),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
+ unsigned long rcx, unsigned long rdx),
+ TP_ARGS(function, rax, rbx, rcx, rdx),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ ),
+
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+ __entry->function, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx)
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%x",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
+ TP_ARGS(exit_reason, guest_rip),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_reason )
+ __field( unsigned long, guest_rip )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_reason = exit_reason;
+ __entry->guest_rip = guest_rip;
+ ),
+
+ TP_printk("reason %s rip 0x%lx",
+ ftrace_print_symbols_seq(p, __entry->exit_reason,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->guest_rip)
+);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int irq),
+ TP_ARGS(irq),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ ),
+
+ TP_printk("irq %u", __entry->irq)
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(unsigned long fault_address, unsigned int error_code),
+ TP_ARGS(fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, fault_address )
+ __field( unsigned int, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address %lx error_code %x",
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
+ TP_ARGS(rw, ecx, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, ecx )
+ __field( unsigned long, data )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ ),
+
+ TP_printk("msr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->ecx, __entry->data)
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
+ TP_ARGS(apicid, dm, tm, vec, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u8, tm )
+ __field( __u8, vec )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)%s",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29f912927a5..f3812014bd0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -25,6 +25,7 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include <linux/ftrace_event.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -34,6 +35,8 @@
#include <asm/virtext.h>
#include <asm/mce.h>
+#include "trace.h"
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
static int __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
+static int __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
@@ -84,6 +91,14 @@ struct vcpu_vmx {
int guest_efer_loaded;
} host_state;
struct {
+ int vm86_active;
+ u8 save_iopl;
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } tr, es, ds, fs, gs;
struct {
bool pending;
u8 vector;
@@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
/*
* Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
@@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void)
cpu_has_vmx_virtualize_apic_accesses();
}
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+}
+
+static inline bool cpu_has_vmx_eptp_uncacheable(void)
+{
+ return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+}
+
+static inline bool cpu_has_vmx_eptp_writeback(void)
+{
+ return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void)
SECONDARY_EXEC_ENABLE_EPT;
}
+static inline int cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled &&
@@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
+ /*
+ * Unconditionally intercept #DB so we can maintain dr6 without
+ * reading it every exit.
+ */
+ eb |= 1u << DB_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- eb |= 1u << DB_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
eb |= 1u << BP_VECTOR;
}
- if (vcpu->arch.rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -528,12 +573,15 @@ static void reload_tss(void)
static void load_transition_efer(struct vcpu_vmx *vmx)
{
int efer_offset = vmx->msr_offset_efer;
- u64 host_efer = vmx->host_msrs[efer_offset].data;
- u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+ u64 host_efer;
+ u64 guest_efer;
u64 ignore_bits;
if (efer_offset < 0)
return;
+ host_efer = vmx->host_msrs[efer_offset].data;
+ guest_efer = vmx->guest_msrs[efer_offset].data;
+
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
* outside long mode
@@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
- return vmcs_readl(GUEST_RFLAGS);
+ unsigned long rflags;
+
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+ return rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->arch.rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
@@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (nr == BP_VECTOR || nr == OF_VECTOR)
- vmx->rmode.irq.rip++;
+ if (kvm_exception_is_soft(nr))
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
intr_info |= INTR_TYPE_SOFT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
#endif
- case MSR_IA32_TIME_STAMP_COUNTER:
+ case MSR_IA32_TSC:
data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
@@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
default:
- vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
+ vmx_load_host_state(to_vmx(vcpu));
data = msr->data;
break;
}
@@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_TIME_STAMP_COUNTER:
+ case MSR_IA32_TSC:
rdtscll(host_tsc);
guest_write_tsc(data, host_tsc);
break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
-
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, data);
@@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
}
/* Otherwise falls through to kvm_set_msr_common */
default:
- vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
+ vmx_load_host_state(vmx);
msr->data = data;
break;
}
@@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
default:
break;
}
@@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT;
+ SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
- min &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
+ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
vmx_capability.ept, vmx_capability.vpid);
}
@@ -1333,8 +1377,13 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept())
+ if (!cpu_has_vmx_ept()) {
enable_ept = 0;
+ enable_unrestricted_guest = 0;
+ }
+
+ if (!cpu_has_vmx_unrestricted_guest())
+ enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
@@ -1342,6 +1391,9 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
return alloc_kvm_area();
}
@@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 0;
+ vmx->rmode.vm86_active = 0;
- vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
+ vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
+ vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
- flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
+ flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
if (emulate_invalid_guest_state)
return;
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+ fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_unrestricted_guest)
+ return;
+
vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 1;
+ vmx->rmode.vm86_active = 1;
- vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+ vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
- vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
- vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->arch.rmode.save_iopl
+ vmx->rmode.save_iopl
= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
continue_rmode:
kvm_mmu_reset_context(vcpu);
@@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty))
+ return;
+
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
- return;
- }
vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
@@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
}
}
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, vcpu->arch.cr4);
- *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
- *hw_cr0 &= ~X86_CR0_WP;
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, vcpu->arch.cr4);
- if (!(vcpu->arch.cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
}
+
+ if (!(cr0 & X86_CR0_WP))
+ *hw_cr0 &= ~X86_CR0_WP;
}
static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
@@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
- KVM_VM_CR0_ALWAYS_ON;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0;
+
+ if (enable_unrestricted_guest)
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
+ | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
vmx_fpu_deactivate(vcpu);
- if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
- if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
@@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- ept_sync_context(eptp);
- ept_load_pdptrs(vcpu);
guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ vcpu->kvm->arch.ept_identity_map_addr;
}
vmx_flush_tlb(vcpu);
@@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
+ unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
@@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
- struct kvm_segment kvm_seg;
-
if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
return 0;
if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
return 3;
- vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
- return kvm_seg.selector & 3;
+ return vmcs_read16(GUEST_CS_SELECTOR) & 3;
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
@@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
- if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
- vcpu->arch.rmode.tr.selector = var->selector;
- vcpu->arch.rmode.tr.base = var->base;
- vcpu->arch.rmode.tr.limit = var->limit;
- vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
+ if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+ vmx->rmode.tr.selector = var->selector;
+ vmx->rmode.tr.base = var->base;
+ vmx->rmode.tr.limit = var->limit;
+ vmx->rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->arch.rmode.vm86_active && var->s) {
+ if (vmx->rmode.vm86_active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
@@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
ar = 0xf3;
} else
ar = vmx_segment_access_rights(var);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the usedland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ ar |= 0x1; /* Accessed */
+
vmcs_write32(sf->ar_bytes, ar);
}
@@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
if (likely(kvm->arch.ept_identity_pagetable_done))
return 1;
ret = 0;
- identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
+ identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -2062,11 +2148,19 @@ out:
static void seg_setup(int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
+ if (enable_unrestricted_guest) {
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+ } else
+ ar = 0xf3;
+
+ vmcs_write32(sf->ar_bytes, ar);
}
static int alloc_apic_access_page(struct kvm *kvm)
@@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm)
goto out;
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ kvm_userspace_mem.guest_phys_addr =
+ kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
- VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
+ kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
out:
up_write(&kvm->slots_lock);
return r;
@@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept)
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
goto out;
}
- vmx->vcpu.arch.rmode.vm86_active = 0;
+ vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (vmx->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
@@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
- if (vmx->vcpu.vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
@@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_writel(GUEST_RFLAGS, 0x02);
- if (vmx->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
kvm_rip_write(vcpu, 0xfff0);
else
kvm_rip_write(vcpu, 0);
@@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+ trace_kvm_inj_virq(irq);
++vcpu->stat.irq_injections;
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (vcpu->arch.interrupt.soft)
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = NMI_VECTOR;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (enable_ept)
BUG();
cr2 = vmcs_readl(EXIT_QUALIFICATION);
- KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
- (u32)((u64)cr2 >> 32), handler);
+ trace_kvm_page_fault(cr2, error_code);
+
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
- if (vcpu->arch.rmode.vm86_active &&
+ if (vmx->rmode.vm86_active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
if (vcpu->arch.halt_request) {
@@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
++vcpu->stat.irq_exits;
- KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
return 1;
}
@@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification, val;
int cr;
int reg;
@@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr0(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 3:
- kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr3(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 4:
- kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr4(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 8: {
@@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
- KVMTRACE_0D(CLTS, vcpu, handler);
skip_emulated_instruction(vcpu);
return 1;
case 1: /*mov from cr*/
switch (cr) {
case 3:
kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
+ trace_kvm_cr_read(cr, vcpu->arch.cr3);
skip_emulated_instruction(vcpu);
return 1;
case 8:
- kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
- KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg), handler);
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
unsigned long val;
int dr, reg;
+ if (!kvm_require_cpl(vcpu, 0))
+ return 1;
dr = vmcs_readl(GUEST_DR7);
if (dr & DR7_GD) {
/*
@@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
val = 0;
}
kvm_register_write(vcpu, reg, val);
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
val = vcpu->arch.regs[reg];
switch (dr) {
@@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
break;
}
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
}
skip_emulated_instruction(vcpu);
return 1;
@@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
- KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_read(ecx, data);
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
@@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_write(ecx, data);
if (vmx_set_msr(vcpu, ecx, data) != 0) {
kvm_inject_gp(vcpu, 0);
@@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- KVMTRACE_0D(PEND_INTR, vcpu, handler);
++vcpu->stat.irq_window_exits;
/*
@@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR
"Fail to handle apic access vmexit! Offset is 0x%lx\n",
offset);
- return -ENOTSUPP;
+ return -ENOEXEC;
}
return 1;
}
@@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (exit_qualification & (1 << 6)) {
printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
- return -ENOTSUPP;
+ return -EINVAL;
}
gla_validity = (exit_qualification >> 7) & 0x3;
@@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = 0;
- return -ENOTSUPP;
+ kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ return 0;
}
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(gpa, exit_qualification);
return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
}
+static u64 ept_rsvd_mask(u64 spte, int level)
+{
+ int i;
+ u64 mask = 0;
+
+ for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
+ mask |= (1ULL << i);
+
+ if (level > 2)
+ /* bits 7:3 reserved */
+ mask |= 0xf8;
+ else if (level == 2) {
+ if (spte & (1ULL << 7))
+ /* 2MB ref, bits 20:12 reserved */
+ mask |= 0x1ff000;
+ else
+ /* bits 6:3 reserved */
+ mask |= 0x78;
+ }
+
+ return mask;
+}
+
+static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
+ int level)
+{
+ printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
+
+ /* 010b (write-only) */
+ WARN_ON((spte & 0x7) == 0x2);
+
+ /* 110b (write/execute) */
+ WARN_ON((spte & 0x7) == 0x6);
+
+ /* 100b (execute-only) and value not supported by logical processor */
+ if (!cpu_has_vmx_ept_execute_only())
+ WARN_ON((spte & 0x7) == 0x4);
+
+ /* not 000b */
+ if ((spte & 0x7)) {
+ u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
+
+ if (rsvd_bits != 0) {
+ printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
+ __func__, rsvd_bits);
+ WARN_ON(1);
+ }
+
+ if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+ u64 ept_mem_type = (spte & 0x38) >> 3;
+
+ if (ept_mem_type == 2 || ept_mem_type == 3 ||
+ ept_mem_type == 7) {
+ printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
+ __func__, ept_mem_type);
+ WARN_ON(1);
+ }
+ }
+ }
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 sptes[4];
+ int nr_sptes, i;
+ gpa_t gpa;
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+
+ printk(KERN_ERR "EPT: Misconfiguration.\n");
+ printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
+
+ nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+
+ for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
+ ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+
+ kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+ kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+
+ return 0;
+}
+
static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u32 cpu_based_vm_exec_control;
@@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
- [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
- (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
+ trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
/* If we need to emulate an MMIO from handle_invalid_guest_state
* we just return 0 */
@@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
- if (enable_ept && is_paging(vcpu)) {
+ if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- ept_load_pdptrs(vcpu);
- }
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
- KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+ (exit_intr_info & INTR_INFO_VALID_MASK))
asm("int $2");
- }
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_ept && is_paging(vcpu)) {
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+ ept_load_pdptrs(vcpu);
+ }
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();
@@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
/*
* Loading guest fpu may have cleared host cr0.ts
*/
vmcs_writel(HOST_CR0, read_cr0());
- set_debugreg(vcpu->arch.dr6, 6);
+ if (vcpu->arch.switch_db_regs)
+ set_debugreg(vcpu->arch.dr6, 6);
asm(
/* Store host registers */
@@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%"R"sp, %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%cr2, %%"R"dx \n\t"
+ "cmp %%"R"ax, %%"R"dx \n\t"
+ "je 2f \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
"mov %c[rax](%0), %%"R"ax \n\t"
"mov %c[rbx](%0), %%"R"bx \n\t"
"mov %c[rdx](%0), %%"R"dx \n\t"
@@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
);
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
- get_debugreg(vcpu->arch.dr6, 6);
+ if (vcpu->arch.switch_db_regs)
+ get_debugreg(vcpu->arch.dr6, 6);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
@@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (alloc_apic_access_page(kvm) != 0)
goto free_vmcs;
- if (enable_ept)
+ if (enable_ept) {
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr =
+ VMX_EPT_IDENTITY_PAGETABLE_ADDR;
if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs;
+ }
return &vmx->vcpu;
@@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return ret;
}
+static const struct trace_print_flags vmx_exit_reasons_str[] = {
+ { EXIT_REASON_EXCEPTION_NMI, "exception" },
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
+ { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
+ { EXIT_REASON_NMI_WINDOW, "nmi_window" },
+ { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
+ { EXIT_REASON_CR_ACCESS, "cr_access" },
+ { EXIT_REASON_DR_ACCESS, "dr_access" },
+ { EXIT_REASON_CPUID, "cpuid" },
+ { EXIT_REASON_MSR_READ, "rdmsr" },
+ { EXIT_REASON_MSR_WRITE, "wrmsr" },
+ { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
+ { EXIT_REASON_HLT, "halt" },
+ { EXIT_REASON_INVLPG, "invlpg" },
+ { EXIT_REASON_VMCALL, "hypercall" },
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
+ { EXIT_REASON_APIC_ACCESS, "apic_access" },
+ { EXIT_REASON_WBINVD, "wbinvd" },
+ { EXIT_REASON_TASK_SWITCH, "task_switch" },
+ { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
+ { -1, NULL }
+};
+
+static bool vmx_gb_page_enable(void)
+{
+ return false;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
+
+ .exit_reasons_str = vmx_exit_reasons_str,
+ .gb_page_enable = vmx_gb_page_enable,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 633ccc7400a..be451ee4424 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,16 @@
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
+#include <trace/events/kvm.h>
+#undef TRACE_INCLUDE_FILE
+#define CREATE_TRACE_POINTS
+#include "trace.h"
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
+#include <asm/mce.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -55,6 +60,10 @@
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
@@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+int ignore_msrs = 0;
+module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector)
if (selector == 0)
return 0;
- asm("sgdt %0" : "=m"(gdt));
+ kvm_get_gdt(&gdt);
table_base = gdt.base;
if (selector & 4) { /* from ldt */
- u16 ldt_selector;
+ u16 ldt_selector = kvm_read_ldt();
- asm("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
d = (struct desc_struct *)(table_base + (selector & ~7));
- v = d->base0 | ((unsigned long)d->base1 << 16) |
- ((unsigned long)d->base2 << 24);
+ v = get_desc_base(d);
#ifdef CONFIG_X86_64
if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
@@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
++vcpu->stat.pf_guest;
if (vcpu->arch.exception.pending) {
- if (vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+ switch(vcpu->arch.exception.nr) {
+ case DF_VECTOR:
/* triple fault -> shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ case PF_VECTOR:
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ return;
+ default:
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ vcpu->arch.exception.pending = false;
+ break;
}
- return;
}
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-static void __queue_exception(struct kvm_vcpu *vcpu)
+/*
+ * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
+ if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
}
+EXPORT_SYMBOL_GPL(kvm_require_cpl);
/*
* Load the pae pdptrs. Return true is they are all valid.
@@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_pte(pdpte[i]) &&
+ if (is_present_gpte(pdpte[i]) &&
(pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
@@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
ret = 1;
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
out:
return ret;
@@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
@@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
- KVMTRACE_1D(LMSW, vcpu,
- (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
- handler);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
@@ -466,7 +492,7 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
@@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
- &vcpu->hv_clock.tsc_timestamp);
+ kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
local_irq_restore(flags);
@@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ /* only 0 or all 1s can be written to IA32_MCi_CTL */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
set_efer(vcpu, data);
break;
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __func__, data);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
break;
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __func__, data);
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
+ return 1;
+ }
break;
- case MSR_IA32_MCG_CTL:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __func__, data);
+ case MSR_AMD64_NB_CFG:
break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
@@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
break;
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
kvm_set_apic_base(vcpu, data);
break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
@@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_request_guest_time_update(vcpu);
break;
}
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
+
+ /* Performance counters are not protected by a CPUID bit,
+ * so we should check all of them in the generic path for the sake of
+ * cross vendor migration.
+ * Writing a zero into the event select MSRs disables them,
+ * which we perfectly emulate ;-). Any other value should be at least
+ * reported, some guests depend on them.
+ */
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ if (data != 0)
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ /* at least RHEL 4 unconditionally writes to the perfctr registers,
+ * so we ignore writes to make it happy.
+ */
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
default:
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
- return 1;
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+ msr, data);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+ msr, data);
+ break;
+ }
}
return 0;
}
@@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
+ data = 0;
+ break;
case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
case MSR_IA32_MCG_CTL:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_MC0_MISC+20:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case MSR_IA32_PLATFORM_ID:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0:
case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
data = 0;
break;
case MSR_MTRRcap:
@@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_APICBASE:
data = kvm_get_apic_base(vcpu);
break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ break;
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
@@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME:
data = vcpu->arch.time;
break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
default:
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+ data = 0;
+ }
+ break;
}
*pdata = data;
return 0;
@@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
+ case KVM_CAP_IRQFD:
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IOMMU:
r = iommu_found();
break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
default:
r = 0;
break;
@@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ u64 mce_cap;
+
+ mce_cap = KVM_MCE_CAP_SUPPORTED;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_nent = cpuid->nent;
cpuid_fix_nx_cap(vcpu);
r = 0;
+ kvm_apic_set_version(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
+ kvm_apic_set_version(vcpu);
return 0;
out:
@@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+ unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
#ifdef CONFIG_X86_64
unsigned f_lm = F(LM);
#else
@@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
@@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved, XSAVE, OSXSAVE */;
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
@@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
case 1:
entry->edx &= kvm_supported_word0_x86_features;
entry->ecx &= kvm_supported_word4_x86_features;
+ /* we support x2apic emulation even if host does not support
+ * it since we emulate x2apic in software */
+ entry->ecx |= F(X2APIC);
break;
/* function 2 entries are STATEFUL. That is, repeated cpuid commands
* may return different values. This forces us to get_cpu() before
@@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
&nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
r = -EFAULT;
if (copy_to_user(entries, cpuid_entries,
nent * sizeof(struct kvm_cpuid_entry2)))
@@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
+ update_cr8_intercept(vcpu);
vcpu_put(vcpu);
return 0;
@@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num)
+ goto out;
+ if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ printk(KERN_DEBUG "kvm: set_mce: "
+ "injects mce exception while "
+ "previous one is in progress!\n");
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
return ret;
}
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ kvm->arch.ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
u32 kvm_nr_mmu_pages)
{
@@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
+ mutex_lock(&kvm->irq_lock);
memcpy(ioapic_irqchip(kvm),
&chip->chip.ioapic,
sizeof(struct kvm_ioapic_state));
+ mutex_unlock(&kvm->irq_lock);
break;
default:
r = -EINVAL;
@@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
@@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+ kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int r = 0;
+
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int r = 0, start = 0;
+ u32 prev_legacy, cur_legacy;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+ sizeof(kvm->arch.vpit->pit_state.channels));
+ kvm->arch.vpit->pit_state.flags = ps->flags;
+ kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
@@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
{
if (!kvm->arch.vpit)
return -ENXIO;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
memslot = &kvm->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
memset(memslot->dirty_bitmap, 0, n);
@@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
*/
union {
struct kvm_pit_state ps;
+ struct kvm_pit_state2 ps2;
struct kvm_memory_alias alias;
+ struct kvm_pit_config pit_config;
} u;
switch (ioctl) {
@@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (r < 0)
goto out;
break;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+
+ r = -EFAULT;
+ if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+ goto out;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+ if (r < 0)
+ goto out;
+ break;
+ }
case KVM_SET_MEMORY_REGION: {
struct kvm_memory_region kvm_mem;
struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
break;
case KVM_CREATE_PIT:
- mutex_lock(&kvm->lock);
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ down_write(&kvm->slots_lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
r = -ENOMEM;
- kvm->arch.vpit = kvm_create_pit(kvm);
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- mutex_unlock(&kvm->lock);
+ up_write(&kvm->slots_lock);
break;
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
@@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
__s32 status;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->irq_lock);
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->irq_lock);
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
@@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
case KVM_REINJECT_CONTROL: {
struct kvm_reinject_control control;
r = -EFAULT;
@@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void)
num_msrs_to_save = j;
}
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
{
- struct kvm_io_device *dev;
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
- if (vcpu->arch.apic) {
- dev = &vcpu->arch.apic->dev;
- if (dev->in_range(dev, addr, len, is_write))
- return dev;
- }
- return NULL;
+ return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
}
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
{
- struct kvm_io_device *dev;
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
- dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
- if (dev == NULL)
- dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
- is_write);
- return dev;
+ return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- struct kvm_io_device *mmio_dev;
gpa_t gpa;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_phys_addr, *(u64 *)val);
vcpu->mmio_read_completed = 0;
return X86EMUL_CONTINUE;
}
@@ -2197,14 +2552,12 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- mutex_lock(&vcpu->kvm->lock);
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
- if (mmio_dev) {
- kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
+ if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
return X86EMUL_CONTINUE;
}
- mutex_unlock(&vcpu->kvm->lock);
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- struct kvm_io_device *mmio_dev;
gpa_t gpa;
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
return X86EMUL_CONTINUE;
mmio:
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
/*
* Is this MMIO handled locally?
*/
- mutex_lock(&vcpu->kvm->lock);
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
- if (mmio_dev) {
- kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
+ if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
return X86EMUL_CONTINUE;
- }
- mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -2343,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
{
- KVMTRACE_0D(CLTS, vcpu, handler);
kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
return X86EMUL_CONTINUE;
}
@@ -2420,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
/*
- * TODO: fix x86_emulate.c to use guest_read/write_register
+ * TODO: fix emulate.c to use guest_read/write_register
* instead of direct ->regs accesses, can save hundred cycles
* on Intel for instructions that don't read/change RSP, for
* for example.
@@ -2444,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- /* Reject the instructions other than VMCALL/VMMCALL when
- * try to emulate invalid opcode */
+ /* Only allow emulation of specific instructions on #UD
+ * (namely VMMCALL, sysenter, sysexit, syscall)*/
c = &vcpu->arch.emulate_ctxt.decode;
- if ((emulation_type & EMULTYPE_TRAP_UD) &&
- (!(c->twobyte && c->b == 0x01 &&
- (c->modrm_reg == 0 || c->modrm_reg == 3) &&
- c->modrm_mod == 3 && c->modrm_rm == 1)))
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_TRAP_UD) {
+ if (!c->twobyte)
+ return EMULATE_FAIL;
+ switch (c->b) {
+ case 0x01: /* VMMCALL */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ return EMULATE_FAIL;
+ break;
+ case 0x34: /* sysenter */
+ case 0x35: /* sysexit */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ case 0x05: /* syscall */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+
+ if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+ return EMULATE_FAIL;
+ }
++vcpu->stat.insn_emulation;
if (r) {
@@ -2571,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
return 0;
}
-static void kernel_pio(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu,
- void *pd)
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
/* TODO: String I/O for in kernel device */
+ int r;
- mutex_lock(&vcpu->kvm->lock);
if (vcpu->arch.pio.in)
- kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
- vcpu->arch.pio.size,
- pd);
+ r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
else
- kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
- vcpu->arch.pio.size,
- pd);
- mutex_unlock(&vcpu->kvm->lock);
+ r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ return r;
}
-static void pio_string_write(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu)
+static int pio_string_write(struct kvm_vcpu *vcpu)
{
struct kvm_pio_request *io = &vcpu->arch.pio;
void *pd = vcpu->arch.pio_data;
- int i;
+ int i, r = 0;
- mutex_lock(&vcpu->kvm->lock);
for (i = 0; i < io->cur_count; i++) {
- kvm_iodevice_write(pio_dev, io->port,
- io->size,
- pd);
+ if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+ io->port, io->size, pd)) {
+ r = -EOPNOTSUPP;
+ break;
+ }
pd += io->size;
}
- mutex_unlock(&vcpu->kvm->lock);
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
-{
- return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+ return r;
}
int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port)
{
- struct kvm_io_device *pio_dev;
unsigned long val;
vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2630,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, 1);
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
- pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
- if (pio_dev) {
- kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
complete_pio(vcpu);
return 1;
}
@@ -2656,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
{
unsigned now, in_page;
int ret = 0;
- struct kvm_io_device *pio_dev;
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2669,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, count);
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2704,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.guest_gva = address;
- pio_dev = vcpu_find_pio_dev(vcpu, port,
- vcpu->arch.pio.cur_count,
- !vcpu->arch.pio.in);
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
@@ -2714,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_inject_gp(vcpu, 0);
return 1;
}
- if (ret == 0 && pio_dev) {
- pio_string_write(pio_dev, vcpu);
+ if (ret == 0 && !pio_string_write(vcpu)) {
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
ret = 1;
}
- } else if (pio_dev)
- pr_unimpl(vcpu, "no string pio read support yet, "
- "port %x size %d count %ld\n",
- port, size, count);
+ }
+ /* no string PIO read support yet */
return ret;
}
@@ -2756,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
if (!kvm_request_guest_time_update(vcpu))
@@ -2852,7 +3185,6 @@ void kvm_arch_exit(void)
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
@@ -2883,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
- KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
@@ -2893,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
+ if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
@@ -2904,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS;
break;
}
+out:
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
@@ -2983,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
- (u32)((u64)value >> 32), handler);
return value;
}
@@ -2992,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
unsigned long *rflags)
{
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
- (u32)((u64)val >> 32), handler);
-
switch (cr) {
case 0:
kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3104,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
- KVMTRACE_5D(CPUID, vcpu, function,
- (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+ trace_kvm_cpuid(function,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ kvm_register_read(vcpu, VCPU_REGS_RBX),
+ kvm_register_read(vcpu, VCPU_REGS_RCX),
+ kvm_register_read(vcpu, VCPU_REGS_RDX));
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -3174,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops->update_cr8_intercept)
return;
+ if (!vcpu->arch.apic)
+ return;
+
if (!vcpu->arch.apic->vapic_addr)
max_irr = kvm_lapic_find_highest_irr(vcpu);
else
@@ -3187,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-
/* try to reinject previous events if any */
+ if (vcpu->arch.exception.pending) {
+ kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+ return;
+ }
+
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
return;
@@ -3266,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
smp_mb__after_clear_bit();
if (vcpu->requests || need_resched() || signal_pending(current)) {
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (vcpu->arch.exception.pending)
- __queue_exception(vcpu);
- else
- inject_pending_irq(vcpu, kvm_run);
+ inject_pending_event(vcpu, kvm_run);
/* enable NMI/IRQ window open exits if needed */
if (vcpu->arch.nmi_pending)
@@ -3292,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_guest_enter();
- get_debugreg(vcpu->arch.host_dr6, 6);
- get_debugreg(vcpu->arch.host_dr7, 7);
if (unlikely(vcpu->arch.switch_db_regs)) {
- get_debugreg(vcpu->arch.host_db[0], 0);
- get_debugreg(vcpu->arch.host_db[1], 1);
- get_debugreg(vcpu->arch.host_db[2], 2);
- get_debugreg(vcpu->arch.host_db[3], 3);
-
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -3307,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_debugreg(vcpu->arch.eff_db[3], 3);
}
- KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+ trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu, kvm_run);
- if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
- set_debugreg(vcpu->arch.host_db[0], 0);
- set_debugreg(vcpu->arch.host_db[1], 1);
- set_debugreg(vcpu->arch.host_db[2], 2);
- set_debugreg(vcpu->arch.host_db[3], 3);
+ if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
+ set_debugreg(current->thread.debugreg0, 0);
+ set_debugreg(current->thread.debugreg1, 1);
+ set_debugreg(current->thread.debugreg2, 2);
+ set_debugreg(current->thread.debugreg3, 3);
+ set_debugreg(current->thread.debugreg6, 6);
+ set_debugreg(current->thread.debugreg7, 7);
}
- set_debugreg(vcpu->arch.host_dr6, 6);
- set_debugreg(vcpu->arch.host_dr7, 7);
set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
@@ -3648,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu,
static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
struct kvm_segment *kvm_desct)
{
- kvm_desct->base = seg_desc->base0;
- kvm_desct->base |= seg_desc->base1 << 16;
- kvm_desct->base |= seg_desc->base2 << 24;
- kvm_desct->limit = seg_desc->limit0;
- kvm_desct->limit |= seg_desc->limit << 16;
+ kvm_desct->base = get_desc_base(seg_desc);
+ kvm_desct->limit = get_desc_limit(seg_desc);
if (seg_desc->g) {
kvm_desct->limit <<= 12;
kvm_desct->limit |= 0xfff;
@@ -3696,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct desc_struct *seg_desc)
{
- gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector >> 3;
@@ -3706,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
return 1;
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
- gpa += index * 8;
- return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
+ return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
}
/* allowed just for 8 bytes segments */
static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct desc_struct *seg_desc)
{
- gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector >> 3;
@@ -3723,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
if (dtable.limit < index * 8 + 7)
return 1;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
- gpa += index * 8;
- return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
}
static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
struct desc_struct *seg_desc)
{
- u32 base_addr;
-
- base_addr = seg_desc->base0;
- base_addr |= (seg_desc->base1 << 16);
- base_addr |= (seg_desc->base2 << 24);
+ u32 base_addr = get_desc_base(seg_desc);
return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
}
@@ -3780,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
return 0;
}
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+ return (seg != VCPU_SREG_LDTR) &&
+ (seg != VCPU_SREG_TR) &&
+ (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
+}
+
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg)
{
struct kvm_segment kvm_seg;
- if (!(vcpu->arch.cr0 & X86_CR0_PE))
+ if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
return 1;
@@ -4024,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
}
}
- if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+ if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
return 1;
}
@@ -4094,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-
- down_read(&vcpu->kvm->slots_lock);
- if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
- vcpu->arch.cr3 = sregs->cr3;
- else
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- up_read(&vcpu->kvm->slots_lock);
+ vcpu->arch.cr3 = sregs->cr3;
kvm_set_cr8(vcpu, sregs->cr8);
@@ -4142,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+ update_cr8_intercept(vcpu);
+
/* Older userspace won't unhalt the vcpu on reset. */
- if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!(vcpu->arch.cr0 & X86_CR0_PE))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4414,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm = vcpu->kvm;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+ if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4436,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_mmu_destroy;
}
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_mmu_destroy;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
return 0;
fail_mmu_destroy:
@@ -4483,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
+ struct kvm_vcpu *vcpu;
/*
* Unpin any mmu pages first.
*/
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_unload_vcpu_mmu(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -4573,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
return 0;
}
@@ -4587,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || vcpu->arch.nmi_pending;
+ || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+ || vcpu->arch.nmi_pending ||
+ (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_has_interrupt(vcpu));
}
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -4612,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
return kvm_x86_ops->interrupt_allowed(vcpu);
}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4c8e10af78e..5eadea585d2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
{
return (nr == BP_VECTOR) || (nr == OF_VECTOR);
}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+
#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index eefdeee8a87..9b5a9f59a47 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,9 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o gup.o
+ pat.o pgtable.o physaddr.o gup.o
+
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_physaddr.o := $(nostackp)
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1617958a380..63a6ba66cbe 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kmap_atomic_to_page);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index fe6f84ca121..84e236ce76b 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -21,7 +21,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
-int is_io_mapping_possible(resource_size_t base, unsigned long size)
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
#endif
return 1;
}
-EXPORT_SYMBOL_GPL(is_io_mapping_possible);
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+ unsigned long flag = _PAGE_CACHE_WC;
+ int ret;
+
+ if (!is_io_mapping_possible(base, size))
+ return -EINVAL;
+
+ ret = io_reserve_memtype(base, base + size, &flag);
+ if (ret)
+ return ret;
+
+ *prot = __pgprot(__PAGE_KERNEL | flag);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void
+iomap_free(resource_size_t base, unsigned long size)
+{
+ io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 8a450930834..334e63ca7b2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,77 +22,7 @@
#include <asm/pgalloc.h>
#include <asm/pat.h>
-static inline int phys_addr_valid(resource_size_t addr)
-{
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- return !(addr >> boot_cpu_data.x86_phys_bits);
-#else
- return 1;
-#endif
-}
-
-#ifdef CONFIG_X86_64
-
-unsigned long __phys_addr(unsigned long x)
-{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
- } else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
- }
- return x;
-}
-EXPORT_SYMBOL(__phys_addr);
-
-bool __virt_addr_valid(unsigned long x)
-{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
- return false;
- x += phys_base;
- } else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
- return false;
- }
-
- return pfn_valid(x >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(__virt_addr_valid);
-
-#else
-
-#ifdef CONFIG_DEBUG_VIRTUAL
-unsigned long __phys_addr(unsigned long x)
-{
- /* VMALLOC_* aren't constants */
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
-}
-EXPORT_SYMBOL(__phys_addr);
-#endif
-
-bool __virt_addr_valid(unsigned long x)
-{
- if (x < PAGE_OFFSET)
- return false;
- if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
- return false;
- if (x >= FIXADDR_START)
- return false;
- return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(__virt_addr_valid);
-
-#endif
+#include "physaddr.h"
int page_is_ram(unsigned long pagenr)
{
@@ -228,24 +158,14 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
- pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
return NULL;
}
if (prot_val != new_prot_val) {
- /*
- * Do not fallback to certain memory types with certain
- * requested type:
- * - request is uc-, return cannot be write-back
- * - request is uc-, return cannot be write-combine
- * - request is write-combine, return cannot be write-back
- */
- if ((prot_val == _PAGE_CACHE_UC_MINUS &&
- (new_prot_val == _PAGE_CACHE_WB ||
- new_prot_val == _PAGE_CACHE_WC)) ||
- (prot_val == _PAGE_CACHE_WC &&
- new_prot_val == _PAGE_CACHE_WB)) {
- pr_debug(
+ if (!is_new_memtype_allowed(phys_addr, size,
+ prot_val, new_prot_val)) {
+ printk(KERN_ERR
"ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 16582960056..c8191defc38 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -29,13 +29,26 @@
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched.h>
+#include <asm/elf.h>
+
+static unsigned int stack_maxrandom_size(void)
+{
+ unsigned int max = 0;
+ if ((current->flags & PF_RANDOMIZE) &&
+ !(current->personality & ADDR_NO_RANDOMIZE)) {
+ max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
+ }
+
+ return max;
+}
+
/*
* Top of mmap area (just below the process stack).
*
- * Leave an at least ~128 MB hole.
+ * Leave an at least ~128 MB hole with possible stack randomization.
*/
-#define MIN_GAP (128*1024*1024)
+#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
/*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962d..24952fdc7e4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
+#include <linux/percpu.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -686,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
- unsigned long vaddr, remapped;
+ unsigned long vaddr;
int ret;
if (cpa->pfn >= max_pfn_mapped)
@@ -744,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
}
#endif
- /*
- * If the PMD page was partially used for per-cpu remapping,
- * the recycled area needs to be split and modified. Because
- * the area is always proper subset of a PMD page
- * cpa->numpages is guaranteed to be 1 for these areas, so
- * there's no need to loop over and check for further remaps.
- */
- remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
- if (remapped) {
- WARN_ON(cpa->numpages > 1);
- alias_cpa = *cpa;
- alias_cpa.vaddr = &remapped;
- alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
- ret = __change_page_attr_set_clr(&alias_cpa, 0);
- if (ret)
- return ret;
- }
-
return 0;
}
@@ -822,6 +805,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
{
struct cpa_data cpa;
int ret, cache, checkalias;
+ unsigned long baddr = 0;
/*
* Check, if we are requested to change a not supported
@@ -853,6 +837,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
*/
WARN_ON_ONCE(1);
}
+ /*
+ * Save address for cache flush. *addr is modified in the call
+ * to __change_page_attr_set_clr() below.
+ */
+ baddr = *addr;
}
/* Must avoid aliasing mappings in the highmem code */
@@ -900,7 +889,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
- cpa_flush_range(*addr, numpages, cache);
+ cpa_flush_range(baddr, numpages, cache);
} else
cpa_flush_all(cache);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index b2f7d3e59b8..7257cf3decf 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/rbtree.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags)
* areas). All the aliases have the same cache attributes of course.
* Zero attributes are represented as holes.
*
- * Currently the data structure is a list because the number of mappings
- * are expected to be relatively small. If this should be a problem
- * it could be changed to a rbtree or similar.
+ * The data structure is a list that is also organized as an rbtree
+ * sorted on the start address of memtype range.
*
- * memtype_lock protects the whole list.
+ * memtype_lock protects both the linear list and rbtree.
*/
struct memtype {
@@ -160,11 +160,53 @@ struct memtype {
u64 end;
unsigned long type;
struct list_head nd;
+ struct rb_node rb;
};
+static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
+{
+ struct rb_node *node = root->rb_node;
+ struct memtype *last_lower = NULL;
+
+ while (node) {
+ struct memtype *data = container_of(node, struct memtype, rb);
+
+ if (data->start < start) {
+ last_lower = data;
+ node = node->rb_right;
+ } else if (data->start > start) {
+ node = node->rb_left;
+ } else
+ return data;
+ }
+
+ /* Will return NULL if there is no entry with its start <= start */
+ return last_lower;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
+{
+ struct rb_node **new = &(root->rb_node);
+ struct rb_node *parent = NULL;
+
+ while (*new) {
+ struct memtype *this = container_of(*new, struct memtype, rb);
+
+ parent = *new;
+ if (data->start <= this->start)
+ new = &((*new)->rb_left);
+ else if (data->start > this->start)
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->rb, parent, new);
+ rb_insert_color(&data->rb, root);
+}
+
/*
* Does intersection of PAT memory type and MTRR memory type and returns
* the resulting memory type as PAT understands it.
@@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
return -EBUSY;
}
-static struct memtype *cached_entry;
-static u64 cached_start;
-
static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
int ram_page = 0, not_rampage = 0;
@@ -249,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
}
/*
- * For RAM pages, mark the pages as non WB memory type using
- * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
- * set_memory_wc() on a RAM page at a time before marking it as WB again.
- * This is ok, because only one driver will be owning the page and
- * doing set_memory_*() calls.
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * Here we do two pass:
+ * - Find the memtype of all the pages in the range, look for any conflicts
+ * - In case of no conflicts, set the new memtype for pages in the range
*
- * For now, we use PageNonWB to track that the RAM page is being mapped
- * as non WB. In future, we will have to use one more flag
- * (or some other mechanism in page_struct) to distinguish between
- * UC and WC mapping.
+ * Caller must hold memtype_lock for atomicity.
*/
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
{
struct page *page;
- u64 pfn, end_pfn;
+ u64 pfn;
+
+ if (req_type == _PAGE_CACHE_UC) {
+ /* We do not support strong UC */
+ WARN_ON_ONCE(1);
+ req_type = _PAGE_CACHE_UC_MINUS;
+ }
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
- page = pfn_to_page(pfn);
- if (page_mapped(page) || PageNonWB(page))
- goto out;
+ unsigned long type;
- SetPageNonWB(page);
+ page = pfn_to_page(pfn);
+ type = get_page_memtype(page);
+ if (type != -1) {
+ printk(KERN_INFO "reserve_ram_pages_type failed "
+ "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
+ start, end, type, req_type);
+ if (new_type)
+ *new_type = type;
+
+ return -EBUSY;
+ }
}
- return 0;
-out:
- end_pfn = pfn;
- for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ if (new_type)
+ *new_type = req_type;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- ClearPageNonWB(page);
+ set_page_memtype(page, req_type);
}
-
- return -EINVAL;
+ return 0;
}
static int free_ram_pages_type(u64 start, u64 end)
{
struct page *page;
- u64 pfn, end_pfn;
+ u64 pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- if (page_mapped(page) || !PageNonWB(page))
- goto out;
-
- ClearPageNonWB(page);
+ set_page_memtype(page, -1);
}
return 0;
-
-out:
- end_pfn = pfn;
- for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
- page = pfn_to_page(pfn);
- SetPageNonWB(page);
- }
- return -EINVAL;
}
/*
@@ -339,6 +376,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (new_type) {
if (req_type == -1)
*new_type = _PAGE_CACHE_WB;
+ else if (req_type == _PAGE_CACHE_WC)
+ *new_type = _PAGE_CACHE_UC_MINUS;
else
*new_type = req_type & _PAGE_CACHE_MASK;
}
@@ -364,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
*new_type = actual_type;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return reserve_ram_pages_type(start, end, req_type,
- new_type);
- else if (is_range_ram < 0)
+ if (is_range_ram == 1) {
+
+ spin_lock(&memtype_lock);
+ err = reserve_ram_pages_type(start, end, req_type, new_type);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ } else if (is_range_ram < 0) {
return -EINVAL;
+ }
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
@@ -380,17 +424,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
spin_lock(&memtype_lock);
- if (cached_entry && start >= cached_start)
- entry = cached_entry;
- else
- entry = list_entry(&memtype_list, struct memtype, nd);
-
/* Search for existing mapping that overlaps the current range */
where = NULL;
- list_for_each_entry_continue(entry, &memtype_list, nd) {
+ list_for_each_entry(entry, &memtype_list, nd) {
if (end <= entry->start) {
where = entry->nd.prev;
- cached_entry = list_entry(where, struct memtype, nd);
break;
} else if (start <= entry->start) { /* end > entry->start */
err = chk_conflict(new, entry, new_type);
@@ -398,8 +436,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
where = entry->nd.prev;
- cached_entry = list_entry(where,
- struct memtype, nd);
}
break;
} else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +443,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (!err) {
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
- cached_entry = list_entry(entry->nd.prev,
- struct memtype, nd);
/*
* Move to right position in the linked
@@ -436,13 +470,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
return err;
}
- cached_start = start;
-
if (where)
list_add(&new->nd, where);
else
list_add_tail(&new->nd, &memtype_list);
+ memtype_rb_insert(&memtype_rbroot, new);
+
spin_unlock(&memtype_lock);
dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +488,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
int free_memtype(u64 start, u64 end)
{
- struct memtype *entry;
+ struct memtype *entry, *saved_entry;
int err = -EINVAL;
int is_range_ram;
@@ -466,23 +500,58 @@ int free_memtype(u64 start, u64 end)
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return free_ram_pages_type(start, end);
- else if (is_range_ram < 0)
+ if (is_range_ram == 1) {
+
+ spin_lock(&memtype_lock);
+ err = free_ram_pages_type(start, end);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ } else if (is_range_ram < 0) {
return -EINVAL;
+ }
spin_lock(&memtype_lock);
- list_for_each_entry(entry, &memtype_list, nd) {
+
+ entry = memtype_rb_search(&memtype_rbroot, start);
+ if (unlikely(entry == NULL))
+ goto unlock_ret;
+
+ /*
+ * Saved entry points to an entry with start same or less than what
+ * we searched for. Now go through the list in both directions to look
+ * for the entry that matches with both start and end, with list stored
+ * in sorted start address
+ */
+ saved_entry = entry;
+ list_for_each_entry_from(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
- if (cached_entry == entry || cached_start == start)
- cached_entry = NULL;
+ rb_erase(&entry->rb, &memtype_rbroot);
+ list_del(&entry->nd);
+ kfree(entry);
+ err = 0;
+ break;
+ } else if (entry->start > start) {
+ break;
+ }
+ }
+
+ if (!err)
+ goto unlock_ret;
+ entry = saved_entry;
+ list_for_each_entry_reverse(entry, &memtype_list, nd) {
+ if (entry->start == start && entry->end == end) {
+ rb_erase(&entry->rb, &memtype_rbroot);
list_del(&entry->nd);
kfree(entry);
err = 0;
break;
+ } else if (entry->start < start) {
+ break;
}
}
+unlock_ret:
spin_unlock(&memtype_lock);
if (err) {
@@ -496,6 +565,101 @@ int free_memtype(u64 start, u64 end)
}
+/**
+ * lookup_memtype - Looksup the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
+ * _PAGE_CACHE_UC
+ */
+static unsigned long lookup_memtype(u64 paddr)
+{
+ int rettype = _PAGE_CACHE_WB;
+ struct memtype *entry;
+
+ if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
+ return rettype;
+
+ if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+ struct page *page;
+ spin_lock(&memtype_lock);
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ rettype = get_page_memtype(page);
+ spin_unlock(&memtype_lock);
+ /*
+ * -1 from get_page_memtype() implies RAM page is in its
+ * default state and not reserved, and hence of type WB
+ */
+ if (rettype == -1)
+ rettype = _PAGE_CACHE_WB;
+
+ return rettype;
+ }
+
+ spin_lock(&memtype_lock);
+
+ entry = memtype_rb_search(&memtype_rbroot, paddr);
+ if (entry != NULL)
+ rettype = entry->type;
+ else
+ rettype = _PAGE_CACHE_UC_MINUS;
+
+ spin_unlock(&memtype_lock);
+ return rettype;
+}
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ unsigned long *type)
+{
+ resource_size_t size = end - start;
+ unsigned long req_type = *type;
+ unsigned long new_type;
+ int ret;
+
+ WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+ ret = reserve_memtype(start, end, req_type, &new_type);
+ if (ret)
+ goto out_err;
+
+ if (!is_new_memtype_allowed(start, size, req_type, new_type))
+ goto out_free;
+
+ if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ goto out_free;
+
+ *type = new_type;
+ return 0;
+
+out_free:
+ free_memtype(start, end);
+ ret = -EBUSY;
+out_err:
+ return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+ free_memtype(start, end);
+}
+
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -577,7 +741,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (!pat_enabled || base >= __pa(high_memory))
+ if (base >= __pa(high_memory))
return 0;
id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +776,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages. Maintain the current
- * behavior with RAM pages by returning success.
+ * reserve_pfn_range() for RAM pages. We do not refcount to keep
+ * track of number of mappings of RAM pages. We can assert that
+ * the type requested matches the type of first page in the range.
*/
- if (is_ram != 0)
+ if (is_ram) {
+ if (!pat_enabled)
+ return 0;
+
+ flags = lookup_memtype(paddr);
+ if (want_flags != flags) {
+ printk(KERN_WARNING
+ "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size),
+ cattr_name(flags));
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ flags);
+ }
return 0;
+ }
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
@@ -678,14 +860,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
- if (!pat_enabled)
- return 0;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
@@ -713,23 +887,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
+ unsigned long flags;
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (!pat_enabled)
- return 0;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
+ if (!pat_enabled)
+ return 0;
+
+ /* for vm_insert_pfn and friends, we set prot based on lookup */
+ flags = lookup_memtype(pfn << PAGE_SHIFT);
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ flags);
+
return 0;
}
@@ -744,14 +919,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (!pat_enabled)
- return;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
new file mode 100644
index 00000000000..d2e2735327b
--- /dev/null
+++ b/arch/x86/mm/physaddr.c
@@ -0,0 +1,70 @@
+#include <linux/mmdebug.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+
+#include "physaddr.h"
+
+#ifdef CONFIG_X86_64
+
+unsigned long __phys_addr(unsigned long x)
+{
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
+ x += phys_base;
+ } else {
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ x -= PAGE_OFFSET;
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ }
+ return x;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ if (x >= KERNEL_IMAGE_SIZE)
+ return false;
+ x += phys_base;
+ } else {
+ if (x < PAGE_OFFSET)
+ return false;
+ x -= PAGE_OFFSET;
+ if (!phys_addr_valid(x))
+ return false;
+ }
+
+ return pfn_valid(x >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#else
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ /* VMALLOC_* aren't constants */
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
+ return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x < PAGE_OFFSET)
+ return false;
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
+ return false;
+ return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
new file mode 100644
index 00000000000..a3cd5a0c97b
--- /dev/null
+++ b/arch/x86/mm/physaddr.h
@@ -0,0 +1,10 @@
+#include <asm/processor.h>
+
+static inline int phys_addr_valid(resource_size_t addr)
+{
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
+}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10df20b..572ee9782f2 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
* also get peer root bus resource for io,mmio
*/
-#ifdef CONFIG_NUMA
-
-#define BUS_NR 256
-
-#ifdef CONFIG_X86_64
-
-static int mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node = -1;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return node;
-
- node = mp_bus_to_node[busnum];
-
- /*
- * let numa_node_id to decide it later in dma_alloc_pages
- * if there is no ram on that node
- */
- if (node != -1 && !node_online(node))
- node = -1;
-
- return node;
-}
-
-#else /* CONFIG_X86_32 */
-
-static unsigned char mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = (unsigned char) node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return 0;
- node = mp_bus_to_node[busnum];
- return node;
-}
-
-#endif /* CONFIG_X86_32 */
-
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_X86_64
/*
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void)
u64 val;
u32 address;
-#ifdef CONFIG_NUMA
- for (i = 0; i < BUS_NR; i++)
- mp_bus_to_node[i] = -1;
-#endif
-
if (!early_pci_allowed())
return -1;
@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void)
node = (reg >> 4) & 0x07;
#ifdef CONFIG_NUMA
for (j = min_bus; j <= max_bus; j++)
- mp_bus_to_node[j] = (unsigned char) node;
+ set_mp_bus_to_node(j, node);
#endif
link = (reg >> 8) & 0x03;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b6257b8..5db96d4304d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
{
return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
}
+
+/*
+ * NUMA info for PCI busses
+ *
+ * Early arch code is responsible for filling in reasonable values here.
+ * A node id of "-1" means "use current node". In other words, if a bus
+ * has a -1 node id, it's not tightly coupled to any particular chunk
+ * of memory (as is the case on some Nehalem systems).
+ */
+#ifdef CONFIG_NUMA
+
+#define BUS_NR 256
+
+#ifdef CONFIG_X86_64
+
+static int mp_bus_to_node[BUS_NR] = {
+ [0 ... BUS_NR - 1] = -1
+};
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node = -1;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return node;
+
+ node = mp_bus_to_node[busnum];
+
+ /*
+ * let numa_node_id to decide it later in dma_alloc_pages
+ * if there is no ram on that node
+ */
+ if (node != -1 && !node_online(node))
+ node = -1;
+
+ return node;
+}
+
+#else /* CONFIG_X86_32 */
+
+static unsigned char mp_bus_to_node[BUS_NR] = {
+ [0 ... BUS_NR - 1] = -1
+};
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = (unsigned char) node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return 0;
+ node = mp_bus_to_node[busnum];
+ return node;
+}
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* CONFIG_NUMA */
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index b3d20b9cac6..417c9f5b4af 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
fix_processor_context();
do_fpu_end();
- mtrr_ap_init();
+ mtrr_bp_restore();
#ifdef CONFIG_X86_OLD_MCE
mcheck_init(&boot_cpu_data);
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 7410640db17..3bb4fc21f4f 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -8,6 +8,7 @@ endif
# Make sure early boot has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_enlighten.o := $(nostackp)
+CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
@@ -16,3 +17,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b62ccb840cf..0dd0c2c6cae 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -51,6 +51,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
+#include <asm/stackprotector.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -330,18 +331,28 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
unsigned long frames[pages];
int f;
- /* A GDT can be up to 64k in size, which corresponds to 8192
- 8-byte entries, or 16 4k pages.. */
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
int level;
- pte_t *ptep = lookup_address(va, &level);
+ pte_t *ptep;
unsigned long pfn, mfn;
void *virt;
+ /*
+ * The GDT is per-cpu and is in the percpu data area.
+ * That can be virtually mapped, so we need to do a
+ * page-walk to get the underlying MFN for the
+ * hypercall. The page can also be in the kernel's
+ * linear range, so we need to RO that mapping too.
+ */
+ ptep = lookup_address(va, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
@@ -358,6 +369,44 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
BUG();
}
+/*
+ * load_gdt for early boot, when the gdt is only mapped once
+ */
+static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+{
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
+ int f;
+
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ pte_t pte;
+ unsigned long pfn, mfn;
+
+ pfn = virt_to_pfn(va);
+ mfn = pfn_to_mfn(pfn);
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
+
+ frames[f] = mfn;
+ }
+
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
+}
+
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
@@ -581,6 +630,29 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
preempt_enable();
}
+/*
+ * Version of write_gdt_entry for use at early boot-time needed to
+ * update an entry as simply as possible.
+ */
+static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+ const void *desc, int type)
+{
+ switch (type) {
+ case DESC_LDT:
+ case DESC_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+
+ if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
+ dt[entry] = *(struct desc_struct *)desc;
+ }
+
+ }
+}
+
static void xen_load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -965,6 +1037,23 @@ static const struct machine_ops __initdata xen_machine_ops = {
.emergency_restart = xen_emergency_restart,
};
+/*
+ * Set up the GDT and segment registers for -fstack-protector. Until
+ * we do this, we have to be careful not to call any stack-protected
+ * function, which is most of the kernel.
+ */
+static void __init xen_setup_stackprotector(void)
+{
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
+ pv_cpu_ops.load_gdt = xen_load_gdt_boot;
+
+ setup_stack_canary_segment(0);
+ switch_to_new_gdt(0);
+
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
+ pv_cpu_ops.load_gdt = xen_load_gdt;
+}
+
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -983,13 +1072,28 @@ asmlinkage void __init xen_start_kernel(void)
pv_apic_ops = xen_apic_ops;
pv_mmu_ops = xen_mmu_ops;
-#ifdef CONFIG_X86_64
/*
- * Setup percpu state. We only need to do this for 64-bit
- * because 32-bit already has %fs set properly.
+ * Set up some pagetable state before starting to set any ptes.
*/
- load_percpu_segment(0);
-#endif
+
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!xen_initial_domain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
+ __supported_pte_mask |= _PAGE_IOMAP;
+
+ xen_setup_features();
+
+ /* Get mfn list */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_build_dynamic_phys_to_machine();
+
+ /*
+ * Set up kernel GDT and segment registers, mainly so that
+ * -fstack-protector code can be executed.
+ */
+ xen_setup_stackprotector();
xen_init_irq_ops();
xen_init_cpuid_mask();
@@ -1001,8 +1105,6 @@ asmlinkage void __init xen_start_kernel(void)
set_xen_basic_apic_ops();
#endif
- xen_setup_features();
-
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1019,17 +1121,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_smp_init();
- /* Get mfn list */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_build_dynamic_phys_to_machine();
-
pgd = (pgd_t *)xen_start_info->pt_base;
- /* Prevent unwanted bits from being set in PTEs. */
- __supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!xen_initial_domain())
- __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
#ifdef CONFIG_X86_64
/* Work out if we support NX */
check_efer();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 429834ec168..fe03eeed7b4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -236,6 +236,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 5601506f2dd..36a5141108d 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -187,7 +187,6 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
struct xen_spinlock *prev;
int irq = __get_cpu_var(lock_kicker_irq);
int ret;
- unsigned long flags;
u64 start;
/* If kicker interrupts not initialized yet, just spin */
@@ -199,16 +198,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
/* announce we're spinning */
prev = spinning_lock(xl);
- flags = __raw_local_save_flags();
- if (irq_enable) {
- ADD_STATS(taken_slow_irqenable, 1);
- raw_local_irq_enable();
- }
-
ADD_STATS(taken_slow, 1);
ADD_STATS(taken_slow_nested, prev != NULL);
do {
+ unsigned long flags;
+
/* clear pending */
xen_clear_irq_pending(irq);
@@ -228,6 +223,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
goto out;
}
+ flags = __raw_local_save_flags();
+ if (irq_enable) {
+ ADD_STATS(taken_slow_irqenable, 1);
+ raw_local_irq_enable();
+ }
+
/*
* Block until irq becomes pending. If we're
* interrupted at this point (after the trylock but
@@ -238,13 +239,15 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
* pending.
*/
xen_poll_irq(irq);
+
+ raw_local_irq_restore(flags);
+
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
- raw_local_irq_restore(flags);
unspinning_lock(xl, prev);
spin_time_accum_blocked(start);
@@ -323,8 +326,13 @@ static void xen_spin_unlock(struct raw_spinlock *lock)
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
- /* make sure unlock happens before kick */
- barrier();
+ /*
+ * Make sure unlock happens before checking for waiting
+ * spinners. We need a strong barrier to enforce the
+ * write-read ordering to different memory locations, as the
+ * CPU makes no implied guarantees about their ordering.
+ */
+ mb();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);