From 7c68af6e32c73992bad24107311f3433c89016e2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 19 Sep 2009 09:40:22 +0300 Subject: core, x86: Add user return notifiers Add a general per-cpu notifier that is called whenever the kernel is about to return to userspace. The notifier uses a thread_info flag and existing checks, so there is no impact on user return or context switch fast paths. This will be used initially to speed up KVM task switching by lazily updating MSRs. Signed-off-by: Avi Kivity LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/thread_info.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d27d0a2fec4..375c917c37d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,6 +108,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -142,13 +144,14 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ + _TIF_USER_RETURN_NOTIFY) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) #define PREEMPT_ACTIVE 0x10000000 -- cgit v1.2.3 From 851ba6922ac575b749f63dee0ae072808163ba6a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 24 Aug 2009 11:10:17 +0300 Subject: KVM: Don't pass kvm_run arguments They're just copies of vcpu->run, which is readily accessible. 
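For illustration, the change to an exit handler looks roughly like this (a sketch; the handler body is invented for the example, only the signature change mirrors the patch):

        /* Before: the kvm_run pointer was threaded through every handler. */
        static int handle_io(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        {
                kvm_run->exit_reason = KVM_EXIT_IO;     /* illustrative body */
                return 0;
        }

        /* After: handlers recover the run structure from the vcpu itself. */
        static int handle_io(struct kvm_vcpu *vcpu)
        {
                struct kvm_run *kvm_run = vcpu->run;

                kvm_run->exit_reason = KVM_EXIT_IO;     /* illustrative body */
                return 0;
        }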
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d83892226f7..0b113f2b58c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -506,8 +506,8 @@ struct kvm_x86_ops { void (*tlb_flush)(struct kvm_vcpu *vcpu); - void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); - int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*run)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); @@ -568,7 +568,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, +int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); @@ -585,9 +585,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, int size, unsigned long count, int down, gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 1a6e4a8c276e122dbeb6f9c610f29735e4236bfd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:19 +0300 Subject: KVM: Move irq sharing information to irqchip level This removes the assumption that the maximum number of GSIs is smaller than the number of pins. Sharing is tracked at the pin level, not the GSI level. [avi: no PIC on ia64] Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b113f2b58c..35d3236c9de 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -410,7 +410,6 @@ struct kvm_arch{ gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; u64 vm_init_tsc; }; -- cgit v1.2.3 From 3e71f88bc90792a187703860cf22fbed7c12cbd9 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:21 +0300 Subject: KVM: Maintain back mapping from irqchip/pin to gsi Maintain a back mapping from irqchip/pin to gsi to speed up interrupt acknowledgment notifications.
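The shape of the back mapping, as a sketch (the routing-table layout here is an assumption for illustration; the real structure lives in the generic irq routing code, not in this header):

        /* Assumed layout: for each irqchip, a pin -> gsi back map. */
        struct kvm_irq_routing_table {
                /* ... forward gsi -> routing-entry map elsewhere ... */
                int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
        };

        /* An ack notification can then resolve its gsi in O(1). */
        static int pin_to_gsi(struct kvm_irq_routing_table *rt,
                              int irqchip, int pin)
        {
                return rt->chip[irqchip][pin];  /* assumed -1 when unmapped */
        }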
[avi: build fix on non-x86/ia64] Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4a5fe914dc5..f02e87a5206 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -79,6 +79,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE 1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { -- cgit v1.2.3 From 136bdfeee7b5bc986fc94af3a40d7d13ea37bb95 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:23 +0300 Subject: KVM: Move irq ack notifier list to arch independent code The mask irq notifier list is already there. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35d3236c9de..a46e2dd9aca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -397,7 +397,6 @@ struct kvm_arch{ struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; - struct hlist_head irq_ack_notifier_list; int vapics_in_nmi_mode; unsigned int tss_addr; -- cgit v1.2.3 From 10474ae8945ce08622fd1f3464e55bd817bf2376 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 15 Sep 2009 11:37:46 +0200 Subject: KVM: Activate Virtualization On Demand X86 CPUs need some magic to enable the virtualization extensions on them. This magic can have unpleasant consequences for users, like blocking other VMMs from working (vmx) or using invalid TLB entries (svm). Currently KVM activates virtualization when the respective kernel module is loaded. This blocks us from autoloading KVM modules without breaking other VMMs. To mitigate this problem, this patch introduces on-demand activation of virtualization: virtualization is enabled on creation of the first virtual machine and disabled on destruction of the last one. Using this, KVM can be easily autoloaded while keeping other hypervisors usable. Signed-off-by: Alexander Graf Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a46e2dd9aca..295c7c4d9c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -459,7 +459,7 @@ struct descriptor_table { struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ - void (*hardware_enable)(void *dummy); /* __init */ + int (*hardware_enable)(void *dummy); void (*hardware_disable)(void *dummy); void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ -- cgit v1.2.3 From 355be0b9300579e02275d7d19374806a974ce622 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 3 Oct 2009 00:31:21 +0200 Subject: KVM: x86: Refactor guest debug IOCTL handling Much of the so-far vendor-specific code for setting up guest debug can actually be handled by the generic code. This also fixes a minor deficit in the SVM part with respect to processing KVM_GUESTDBG_ENABLE.
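Roughly, the split looks like this (a much-simplified sketch, not the literal patch; the generic function exists in x86.c, its body here is condensed):

        int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                                struct kvm_guest_debug *dbg)
        {
                /* Generic bookkeeping: only honor the other control bits
                 * when KVM_GUESTDBG_ENABLE is set (the SVM deficit). */
                if (dbg->control & KVM_GUESTDBG_ENABLE)
                        vcpu->guest_debug = dbg->control;
                else
                        vcpu->guest_debug = 0;

                /* Vendor hook now only programs the hardware, hence void. */
                kvm_x86_ops->set_guest_debug(vcpu, dbg);
                return 0;
        }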
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 295c7c4d9c9..e7f87083260 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -475,8 +475,8 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - int (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg); + void (*set_guest_debug)(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); -- cgit v1.2.3 From 91586a3b7d79432772a3cdcb81473cd08a237c79 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 5 Oct 2009 13:07:21 +0200 Subject: KVM: x86: Rework guest single-step flag injection and filtering Push TF and RF injection and filtering on guest single-stepping into the vendor get/set_rflags callbacks. This makes the whole mechanism more robust with respect to user-space IOCTL order and instruction emulation. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e7f87083260..179a919f53a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -614,6 +614,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, -- cgit v1.2.3 From 4b8d54f9726f1159330201c5ed2ea30bce7e63ea Mon Sep 17 00:00:00 2001 From: "Zhai, Edwin" Date: Fri, 9 Oct 2009 18:03:20 +0800 Subject: KVM: VMX: Add support for Pause-Loop Exiting New NHM processors will support Pause-Loop Exiting by adding two VM-execution control fields: PLE_Gap - upper bound on the amount of time between two successive executions of PAUSE in a loop. PLE_Window - upper bound on the amount of time a guest is allowed to execute in a PAUSE loop. If the time between this execution of PAUSE and the previous one exceeds PLE_Gap, the processor considers this PAUSE to belong to a new loop. Otherwise, the processor determines the total execution time of this loop (since the first PAUSE in the loop) and triggers a VM exit if the total time exceeds PLE_Window. Refer to SDM volume 3b, sections 21.6.13 & 22.1.3. Pause-Loop Exiting can be used to detect Lock-Holder Preemption, where one VP is scheduled out after holding a spinlock and other VPs contending for the same lock are scheduled in, wasting CPU time. Our tests indicate that most spinlocks are held for less than 212 cycles. Performance tests show that with 2x LP over-commitment we can get a +2% performance improvement for kernel build (with even more gain with more LPs).
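In VMX terms, enabling the feature reduces to programming the two new VMCS fields and one secondary control, roughly as below (a sketch using the constants added by this patch; the gap/window defaults are illustrative):

        static int ple_gap = 128;       /* illustrative defaults */
        static int ple_window = 4096;

        static void vmx_enable_ple(void)
        {
                u32 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

                exec_control |= SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
                vmcs_write32(PLE_GAP, ple_gap);
                vmcs_write32(PLE_WINDOW, ple_window);
        }

A PAUSE loop that overstays ple_window then exits with EXIT_REASON_PAUSE_INSTRUCTION, giving the host a chance to reschedule the spinning VCPU.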
Signed-off-by: Zhai Edwin Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 272514c2d45..2b4945419a8 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -56,6 +56,7 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -144,6 +145,8 @@ enum vmcs_field { VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, TPR_THRESHOLD = 0x0000401c, SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -248,6 +251,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 -- cgit v1.2.3 From 565d0998ecac8373b9a9ecd5991abe74318cd235 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 6 Oct 2009 14:25:02 -0500 Subject: KVM: SVM: Support Pause Filter in AMD processors New AMD processors (Family 0x10 models 8+) support the Pause Filter feature. This feature creates a new field in the VMCB called Pause Filter Count. If Pause Filter Count is greater than 0 and intercepting PAUSEs is enabled, the processor will increment an internal counter when a PAUSE instruction occurs instead of intercepting. When the internal counter reaches the Pause Filter Count value, a PAUSE intercept will occur. This feature can be used to detect contended spinlocks, especially when the lock-holding VCPU is not scheduled. Rescheduling another VCPU prevents the VCPU seeking the lock from wasting its quantum by spinning idly. Experimental results show that most spinlocks are held for less than 1000 PAUSE cycles or more than a few thousand. Default the Pause Filter Count to 3000 to detect the contended spinlocks. Processor support for this feature is indicated by a CPUID bit. On a 24-core system running 4 guests each with 16 VCPUs, this patch improved overall performance of each guest's 32-job kernbench by approximately 3-5% when combined with a scheduler algorithm that caused the VCPU to sleep for a brief period. Further performance improvement may be possible with a more sophisticated yield algorithm.
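On the VMCB side the setup is just two fields, roughly (a sketch; the exact feature check used by the driver may differ):

        /* Arm the pause filter when the CPU advertises it via CPUID. */
        if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
                svm->vmcb->control.pause_filter_count = 3000;
                svm->vmcb->control.intercept |= (1ULL << INTERCEPT_PAUSE);
        }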
Signed-off-by: Mark Langsdorf Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/svm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 85574b7c1bc..1fecb7e6113 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u16 intercept_dr_write; u32 intercept_exceptions; u64 intercept; - u8 reserved_1[44]; + u8 reserved_1[42]; + u16 pause_filter_count; u64 iopm_base_pa; u64 msrpm_base_pa; u64 tsc_offset; -- cgit v1.2.3 From ffde22ac53b6d6b1d7206f1172176a667eead778 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Thu, 15 Oct 2009 15:21:43 -0700 Subject: KVM: Xen PV-on-HVM guest support Support for Xen PV-on-HVM guests can be implemented almost entirely in userspace, except for handling one annoying MSR that maps a Xen hypercall blob into guest address space. A generic mechanism to delegate MSR writes to userspace seems overkill and risks encouraging similar MSR abuse in the future. Thus this patch adds special support for the Xen HVM MSR. I implemented a new ioctl, KVM_XEN_HVM_CONFIG, that lets userspace tell KVM which MSR the guest will write to, as well as the starting address and size of the hypercall blobs (one each for 32-bit and 64-bit) that userspace has loaded from files. When the guest writes to the MSR, KVM copies one page of the blob from userspace to the guest. I've tested this patch with a hacked-up version of Gerd's userspace code, booting a number of guests (CentOS 5.3 i386 and x86_64, and FreeBSD 8.0-RC1 amd64) and exercising PV network and block devices. [jan: fix i386 build warning] [avi: future proof abi with a flags field] Signed-off-by: Ed Swierk Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + arch/x86/include/asm/kvm_host.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index f02e87a5206..ef9b4b73cce 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -19,6 +19,7 @@ #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 +#define __KVM_HAVE_XEN_HVM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 179a919f53a..36f3b53f5c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -410,6 +410,8 @@ struct kvm_arch{ unsigned long irq_sources_bitmap; u64 vm_init_tsc; + + struct kvm_xen_hvm_config xen_hvm_config; }; struct kvm_vm_stat { -- cgit v1.2.3 From 94fe45da48f921d01d8ff02a0ad54ee9c326d7f0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 18 Oct 2009 13:24:44 +0200 Subject: KVM: x86: Fix guest single-stepping while interruptible Commit 705c5323 opened the doors of hell by unconditionally injecting single-step flags as long as guest_debug signaled this. This doesn't work when the guest branches into some interrupt or exception handler and triggers a vmexit with flag reloading. Fix it by saving cs:rip when user space requests single-stepping and restricting the trace flag injection to this guest code position. 
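In sketch form (using the fields added below; kvm_rip_read() is an existing helper, while get_segment_selector() stands in for whatever CS-selector accessor the code uses):

        /* When userspace requests single-stepping, remember where: */
        vcpu->arch.singlestep_cs = get_segment_selector(vcpu, VCPU_SREG_CS);
        vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);

        /* ...and only inject TF while the guest is still at that spot: */
        if (vcpu->arch.singlestep_rip == kvm_rip_read(vcpu) &&
            vcpu->arch.singlestep_cs ==
                        get_segment_selector(vcpu, VCPU_SREG_CS))
                rflags |= X86_EFLAGS_TF;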
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 36f3b53f5c2..2536fbd85b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -371,6 +371,10 @@ struct kvm_vcpu_arch { u64 mcg_status; u64 mcg_ctl; u64 *mce_banks; + + /* used for guest single stepping over the given code position */ + u16 singlestep_cs; + unsigned long singlestep_rip; }; struct kvm_mem_alias { -- cgit v1.2.3 From 6be7d3062b59af891be7e40c6802350de5f78cef Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 18 Oct 2009 13:24:54 +0200 Subject: KVM: SVM: Cleanup NMI singlestep Push the NMI-related singlestep variable into vcpu_svm. It's dealing with an AMD-specific deficit, nothing generic for x86. Acked-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2536fbd85b3..4d994ad5051 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -354,7 +354,6 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; - bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; -- cgit v1.2.3 From afbcf7ab8d1bc8c2d04792f6d9e786e0adeb328d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 16 Oct 2009 15:28:36 -0400 Subject: KVM: allow userspace to adjust kvmclock offset When we migrate a kvm guest that uses pvclock between two hosts, we may suffer a large skew. This is because there can be significant differences between the monotonic clocks of the hosts involved. When a new host with a much larger monotonic time starts running the guest, the view of time will be significantly impacted. The situation is much worse when we do the opposite, and migrate to a host with a smaller monotonic clock. This proposed ioctl allows userspace to inform us of the monotonic clock value in the source host, so we can keep the time skew short and, more importantly, ensure that the guest clock never goes backwards. Userspace may also need to query the current clock value, since from the first migration onwards it won't be reflected by a simple call to clock_gettime() anymore. [marcelo: future-proof abi with a flags field] [jan: fix KVM_GET_CLOCK by clearing flags field instead of checking it] Signed-off-by: Glauber Costa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d994ad5051..0558ff8c32a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -413,6 +413,7 @@ struct kvm_arch{ unsigned long irq_sources_bitmap; u64 vm_init_tsc; + s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; }; -- cgit v1.2.3 From 18863bdd60f895f3b3ba16b15e8331aee781e8ec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Sep 2009 11:12:18 +0300 Subject: KVM: x86 shared msr infrastructure The various syscall-related MSRs are fairly expensive to switch.
Currently we switch them on every vcpu preemption, which is far too often: - if we're switching to a kernel thread (idle task, threaded interrupt, kernel-mode virtio server (vhost-net), for example) and back, then there's no need to switch those MSRs since kernel threads won't be exiting to userspace. - if we're switching to another guest running an identical OS, most likely those MSRs will have the same value, so there's little point in reloading them. - if we're running the same OS on the guest and host, the MSRs will have identical values and reloading is unnecessary. This patch uses the new user return notifiers to implement last-minute switching, and checks the msr values to avoid unnecessary reloading. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0558ff8c32a..26a74b7bb6b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -809,4 +809,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +void kvm_define_shared_msr(unsigned index, u32 msr); +void kvm_set_shared_msr(unsigned index, u64 val); + #endif /* _ASM_X86_KVM_HOST_H */ -- cgit v1.2.3 From 3cfc3092f40bc37c57ba556cfd8de4218f2135ab Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 12 Nov 2009 01:04:25 +0100 Subject: KVM: x86: Add KVM_GET/SET_VCPU_EVENTS This new IOCTL exports all previously user-invisible state related to exceptions, interrupts, and NMIs. Together with appropriate user space changes, this fixes sporadic problems with vmsave/restore, live migration and system reset. [avi: future-proof abi by adding a flags field] Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 28 ++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ 2 files changed, 30 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index ef9b4b73cce..950df434763 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -20,6 +20,7 @@ #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 #define __KVM_HAVE_XEN_HVM +#define __KVM_HAVE_VCPU_EVENTS /* Architectural interrupt line count.
*/ #define KVM_NR_INTERRUPTS 256 @@ -252,4 +253,31 @@ struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; }; + +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 pad; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + __u32 reserved[10]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 26a74b7bb6b..06e085614da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -523,6 +523,8 @@ struct kvm_x86_ops { bool has_error_code, u32 error_code); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); + bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); + void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); -- cgit v1.2.3 From eb3c79e64a70fb8f7473e30fa07e89c1ecc2c9bb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 24 Nov 2009 15:20:15 +0200 Subject: KVM: x86 emulator: limit instructions to 15 bytes While we are never normally passed an instruction that exceeds 15 bytes, smp games can cause us to attempt to interpret one, which will cause large latencies in non-preempt hosts. Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7ed2c42311..7c18e1230f5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -129,7 +129,7 @@ struct decode_cache { u8 seg_override; unsigned int d; unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; + unsigned long eip, eip_orig; /* modrm */ u8 modrm; u8 modrm_mod; -- cgit v1.2.3 From d5696725b2a4c59503f5e0bc33adeee7f30cd45b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 Dec 2009 12:28:47 +0200 Subject: KVM: VMX: Fix comparison of guest efer with stale host value update_transition_efer() masks out some efer bits when deciding whether to switch the msr during guest entry; for example, NX is emulated using the mmu so we don't need to disable it, and LMA/LME are handled by the hardware. However, with shared msrs, the comparison is made against a stale value; at the time of the guest switch we may be running with another guest's efer. Fix by deferring the mask/compare to the actual point of guest entry. Noted by Marcelo. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 06e085614da..4f865e8b854 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -812,6 +812,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_define_shared_msr(unsigned index, u32 msr); -void kvm_set_shared_msr(unsigned index, u64 val); +void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); #endif /* _ASM_X86_KVM_HOST_H */ -- cgit v1.2.3
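A sketch of how a caller of the reworked API passes the mask (the function and field names other than kvm_set_shared_msr() are illustrative, not taken from these diffs):

        /* Defer the efer compare to the actual guest entry: only bits
         * outside 'ignore_bits' take part in the lazy compare/update.
         * NX is emulated by the mmu and LMA/LME are handled by the
         * hardware, so those bits may differ harmlessly. */
        static void update_efer_shared_msr(struct kvm_vcpu *vcpu,
                                           unsigned efer_slot)
        {
                u64 guest_efer = vcpu->arch.shadow_efer;
                u64 ignore_bits = EFER_NX | EFER_SCE;

                kvm_set_shared_msr(efer_slot, guest_efer & ~ignore_bits,
                                   ~ignore_bits);
        }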