diff options
author | Avi Kivity <avi@qumranet.com> | 2007-12-16 11:02:48 +0200 |
---|---|---|
committer | Avi Kivity <avi@qumranet.com> | 2008-01-30 18:01:18 +0200 |
commit | edf884172e9828c6234b254208af04655855038d (patch) | |
tree | f5e5d1eecaed9737eced6ba60d09fe93149751c1 /drivers | |
parent | 9584bf2c93f56656dba0de8f6c75b54ca7995143 (diff) |
KVM: Move arch dependent files to new directory arch/x86/kvm/
This paves the way for multiple architecture support. Note that while
ioapic.c could potentially be shared with ia64, it is also moved.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/Kconfig | 2 | ||||
-rw-r--r-- | drivers/Makefile | 1 | ||||
-rw-r--r-- | drivers/kvm/Kconfig | 57 | ||||
-rw-r--r-- | drivers/kvm/Makefile | 10 | ||||
-rw-r--r-- | drivers/kvm/i8259.c | 449 | ||||
-rw-r--r-- | drivers/kvm/ioapic.c | 402 | ||||
-rw-r--r-- | drivers/kvm/iodev.h | 2 | ||||
-rw-r--r-- | drivers/kvm/irq.c | 99 | ||||
-rw-r--r-- | drivers/kvm/irq.h | 196 | ||||
-rw-r--r-- | drivers/kvm/kvm.h | 289 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 2 | ||||
-rw-r--r-- | drivers/kvm/kvm_svm.h | 45 | ||||
-rw-r--r-- | drivers/kvm/lapic.c | 1087 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 1806 | ||||
-rw-r--r-- | drivers/kvm/mmu.h | 44 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 461 | ||||
-rw-r--r-- | drivers/kvm/segment_descriptor.h | 29 | ||||
-rw-r--r-- | drivers/kvm/svm.c | 1725 | ||||
-rw-r--r-- | drivers/kvm/svm.h | 325 | ||||
-rw-r--r-- | drivers/kvm/types.h | 54 | ||||
-rw-r--r-- | drivers/kvm/vmx.c | 2673 | ||||
-rw-r--r-- | drivers/kvm/vmx.h | 324 | ||||
-rw-r--r-- | drivers/kvm/x86.c | 3148 | ||||
-rw-r--r-- | drivers/kvm/x86.h | 602 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.c | 1913 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.h | 186 |
26 files changed, 2 insertions, 15929 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig index f4076d9e990..08d4ae20159 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -90,8 +90,6 @@ source "drivers/dca/Kconfig" source "drivers/auxdisplay/Kconfig" -source "drivers/kvm/Kconfig" - source "drivers/uio/Kconfig" source "drivers/virtio/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index d92d4d82d00..9e1f808e43c 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/ obj-$(CONFIG_PCCARD) += pcmcia/ obj-$(CONFIG_DIO) += dio/ obj-$(CONFIG_SBUS) += sbus/ -obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig deleted file mode 100644 index c83e1c9b512..00000000000 --- a/drivers/kvm/Kconfig +++ /dev/null @@ -1,57 +0,0 @@ -# -# KVM configuration -# -config HAVE_KVM - bool - -menuconfig VIRTUALIZATION - bool "Virtualization" - depends on HAVE_KVM || X86 - default y - ---help--- - Say Y here to get to see options for using your Linux host to run other - operating systems inside virtual machines (guests). - This option alone does not add any kernel code. - - If you say N, all options in this submenu will be skipped and disabled. - -if VIRTUALIZATION - -config KVM - tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM && EXPERIMENTAL - select PREEMPT_NOTIFIERS - select ANON_INODES - ---help--- - Support hosting fully virtualized guest machines using hardware - virtualization extensions. You will need a fairly recent - processor equipped with virtualization extensions. You will also - need to select one or more of the processor modules below. - - This module provides access to the hardware capabilities through - a character device node named /dev/kvm. - - To compile this as a module, choose M here: the module - will be called kvm. - - If unsure, say N. - -config KVM_INTEL - tristate "KVM for Intel processors support" - depends on KVM - ---help--- - Provides support for KVM on Intel processors equipped with the VT - extensions. - -config KVM_AMD - tristate "KVM for AMD processors support" - depends on KVM - ---help--- - Provides support for KVM on AMD processors equipped with the AMD-V - (SVM) extensions. - -# OK, it's a little counter-intuitive to do this, but it puts it neatly under -# the virtualization menu. -source drivers/lguest/Kconfig - -endif # VIRTUALIZATION diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile deleted file mode 100644 index cf18ad46e98..00000000000 --- a/drivers/kvm/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Makefile for Kernel-based Virtual Machine module -# - -kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o -obj-$(CONFIG_KVM) += kvm.o -kvm-intel-objs = vmx.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -kvm-amd-objs = svm.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c deleted file mode 100644 index b3cad632f3d..00000000000 --- a/drivers/kvm/i8259.c +++ /dev/null @@ -1,449 +0,0 @@ -/* - * 8259 interrupt controller emulation - * - * Copyright (c) 2003-2004 Fabrice Bellard - * Copyright (c) 2007 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * Port from Qemu. - */ -#include <linux/mm.h> -#include "irq.h" -#include "kvm.h" - -/* - * set irq level. If an edge is detected, then the IRR is set to 1 - */ -static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) -{ - int mask; - mask = 1 << irq; - if (s->elcr & mask) /* level triggered */ - if (level) { - s->irr |= mask; - s->last_irr |= mask; - } else { - s->irr &= ~mask; - s->last_irr &= ~mask; - } - else /* edge triggered */ - if (level) { - if ((s->last_irr & mask) == 0) - s->irr |= mask; - s->last_irr |= mask; - } else - s->last_irr &= ~mask; -} - -/* - * return the highest priority found in mask (highest = smallest - * number). Return 8 if no irq - */ -static inline int get_priority(struct kvm_kpic_state *s, int mask) -{ - int priority; - if (mask == 0) - return 8; - priority = 0; - while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) - priority++; - return priority; -} - -/* - * return the pic wanted interrupt. return -1 if none - */ -static int pic_get_irq(struct kvm_kpic_state *s) -{ - int mask, cur_priority, priority; - - mask = s->irr & ~s->imr; - priority = get_priority(s, mask); - if (priority == 8) - return -1; - /* - * compute current priority. If special fully nested mode on the - * master, the IRQ coming from the slave is not taken into account - * for the priority computation. - */ - mask = s->isr; - if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) - mask &= ~(1 << 2); - cur_priority = get_priority(s, mask); - if (priority < cur_priority) - /* - * higher priority found: an irq should be generated - */ - return (priority + s->priority_add) & 7; - else - return -1; -} - -/* - * raise irq to CPU if necessary. must be called every time the active - * irq may change - */ -static void pic_update_irq(struct kvm_pic *s) -{ - int irq2, irq; - - irq2 = pic_get_irq(&s->pics[1]); - if (irq2 >= 0) { - /* - * if irq request by slave pic, signal master PIC - */ - pic_set_irq1(&s->pics[0], 2, 1); - pic_set_irq1(&s->pics[0], 2, 0); - } - irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) - s->irq_request(s->irq_request_opaque, 1); - else - s->irq_request(s->irq_request_opaque, 0); -} - -void kvm_pic_update_irq(struct kvm_pic *s) -{ - pic_update_irq(s); -} - -void kvm_pic_set_irq(void *opaque, int irq, int level) -{ - struct kvm_pic *s = opaque; - - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); - pic_update_irq(s); -} - -/* - * acknowledge interrupt 'irq' - */ -static inline void pic_intack(struct kvm_kpic_state *s, int irq) -{ - if (s->auto_eoi) { - if (s->rotate_on_auto_eoi) - s->priority_add = (irq + 1) & 7; - } else - s->isr |= (1 << irq); - /* - * We don't clear a level sensitive interrupt here - */ - if (!(s->elcr & (1 << irq))) - s->irr &= ~(1 << irq); -} - -int kvm_pic_read_irq(struct kvm_pic *s) -{ - int irq, irq2, intno; - - irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) { - pic_intack(&s->pics[0], irq); - if (irq == 2) { - irq2 = pic_get_irq(&s->pics[1]); - if (irq2 >= 0) - pic_intack(&s->pics[1], irq2); - else - /* - * spurious IRQ on slave controller - */ - irq2 = 7; - intno = s->pics[1].irq_base + irq2; - irq = irq2 + 8; - } else - intno = s->pics[0].irq_base + irq; - } else { - /* - * spurious IRQ on host controller - */ - irq = 7; - intno = s->pics[0].irq_base + irq; - } - pic_update_irq(s); - - return intno; -} - -void kvm_pic_reset(struct kvm_kpic_state *s) -{ - s->last_irr = 0; - s->irr = 0; - s->imr = 0; - s->isr = 0; - s->priority_add = 0; - s->irq_base = 0; - s->read_reg_select = 0; - s->poll = 0; - s->special_mask = 0; - s->init_state = 0; - s->auto_eoi = 0; - s->rotate_on_auto_eoi = 0; - s->special_fully_nested_mode = 0; - s->init4 = 0; -} - -static void pic_ioport_write(void *opaque, u32 addr, u32 val) -{ - struct kvm_kpic_state *s = opaque; - int priority, cmd, irq; - - addr &= 1; - if (addr == 0) { - if (val & 0x10) { - kvm_pic_reset(s); /* init */ - /* - * deassert a pending interrupt - */ - s->pics_state->irq_request(s->pics_state-> - irq_request_opaque, 0); - s->init_state = 1; - s->init4 = val & 1; - if (val & 0x02) - printk(KERN_ERR "single mode not supported"); - if (val & 0x08) - printk(KERN_ERR - "level sensitive irq not supported"); - } else if (val & 0x08) { - if (val & 0x04) - s->poll = 1; - if (val & 0x02) - s->read_reg_select = val & 1; - if (val & 0x40) - s->special_mask = (val >> 5) & 1; - } else { - cmd = val >> 5; - switch (cmd) { - case 0: - case 4: - s->rotate_on_auto_eoi = cmd >> 2; - break; - case 1: /* end of interrupt */ - case 5: - priority = get_priority(s, s->isr); - if (priority != 8) { - irq = (priority + s->priority_add) & 7; - s->isr &= ~(1 << irq); - if (cmd == 5) - s->priority_add = (irq + 1) & 7; - pic_update_irq(s->pics_state); - } - break; - case 3: - irq = val & 7; - s->isr &= ~(1 << irq); - pic_update_irq(s->pics_state); - break; - case 6: - s->priority_add = (val + 1) & 7; - pic_update_irq(s->pics_state); - break; - case 7: - irq = val & 7; - s->isr &= ~(1 << irq); - s->priority_add = (irq + 1) & 7; - pic_update_irq(s->pics_state); - break; - default: - break; /* no operation */ - } - } - } else - switch (s->init_state) { - case 0: /* normal mode */ - s->imr = val; - pic_update_irq(s->pics_state); - break; - case 1: - s->irq_base = val & 0xf8; - s->init_state = 2; - break; - case 2: - if (s->init4) - s->init_state = 3; - else - s->init_state = 0; - break; - case 3: - s->special_fully_nested_mode = (val >> 4) & 1; - s->auto_eoi = (val >> 1) & 1; - s->init_state = 0; - break; - } -} - -static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) -{ - int ret; - - ret = pic_get_irq(s); - if (ret >= 0) { - if (addr1 >> 7) { - s->pics_state->pics[0].isr &= ~(1 << 2); - s->pics_state->pics[0].irr &= ~(1 << 2); - } - s->irr &= ~(1 << ret); - s->isr &= ~(1 << ret); - if (addr1 >> 7 || ret != 2) - pic_update_irq(s->pics_state); - } else { - ret = 0x07; - pic_update_irq(s->pics_state); - } - - return ret; -} - -static u32 pic_ioport_read(void *opaque, u32 addr1) -{ - struct kvm_kpic_state *s = opaque; - unsigned int addr; - int ret; - - addr = addr1; - addr &= 1; - if (s->poll) { - ret = pic_poll_read(s, addr1); - s->poll = 0; - } else - if (addr == 0) - if (s->read_reg_select) - ret = s->isr; - else - ret = s->irr; - else - ret = s->imr; - return ret; -} - -static void elcr_ioport_write(void *opaque, u32 addr, u32 val) -{ - struct kvm_kpic_state *s = opaque; - s->elcr = val & s->elcr_mask; -} - -static u32 elcr_ioport_read(void *opaque, u32 addr1) -{ - struct kvm_kpic_state *s = opaque; - return s->elcr; -} - -static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - -static void picdev_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *val) -{ - struct kvm_pic *s = this->private; - unsigned char data = *(unsigned char *)val; - - if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte write\n"); - return; - } - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - pic_ioport_write(&s->pics[addr >> 7], addr, data); - break; - case 0x4d0: - case 0x4d1: - elcr_ioport_write(&s->pics[addr & 1], addr, data); - break; - } -} - -static void picdev_read(struct kvm_io_device *this, - gpa_t addr, int len, void *val) -{ - struct kvm_pic *s = this->private; - unsigned char data = 0; - - if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte read\n"); - return; - } - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - data = pic_ioport_read(&s->pics[addr >> 7], addr); - break; - case 0x4d0: - case 0x4d1: - data = elcr_ioport_read(&s->pics[addr & 1], addr); - break; - } - *(unsigned char *)val = data; -} - -/* - * callback when PIC0 irq status changed - */ -static void pic_irq_request(void *opaque, int level) -{ - struct kvm *kvm = opaque; - struct kvm_vcpu *vcpu = kvm->vcpus[0]; - - pic_irqchip(kvm)->output = level; - if (vcpu) - kvm_vcpu_kick(vcpu); -} - -struct kvm_pic *kvm_create_pic(struct kvm *kvm) -{ - struct kvm_pic *s; - s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); - if (!s) - return NULL; - s->pics[0].elcr_mask = 0xf8; - s->pics[1].elcr_mask = 0xde; - s->irq_request = pic_irq_request; - s->irq_request_opaque = kvm; - s->pics[0].pics_state = s; - s->pics[1].pics_state = s; - - /* - * Initialize PIO device - */ - s->dev.read = picdev_read; - s->dev.write = picdev_write; - s->dev.in_range = picdev_in_range; - s->dev.private = s; - kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); - return s; -} diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c deleted file mode 100644 index f8236774c1b..00000000000 --- a/drivers/kvm/ioapic.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Copyright (C) 2001 MandrakeSoft S.A. - * - * MandrakeSoft S.A. - * 43, rue d'Aboukir - * 75002 Paris - France - * http://www.linux-mandrake.com/ - * http://www.mandrakesoft.com/ - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Yunhong Jiang <yunhong.jiang@intel.com> - * Yaozu (Eddie) Dong <eddie.dong@intel.com> - * Based on Xen 3.1 code. - */ - -#include "kvm.h" -#include "x86.h" - -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/current.h> -#include "irq.h" -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) -#else -#define ioapic_debug(fmt, arg...) -#endif -static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); - -static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, - unsigned long addr, - unsigned long length) -{ - unsigned long result = 0; - - switch (ioapic->ioregsel) { - case IOAPIC_REG_VERSION: - result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) - | (IOAPIC_VERSION_ID & 0xff)); - break; - - case IOAPIC_REG_APIC_ID: - case IOAPIC_REG_ARB_ID: - result = ((ioapic->id & 0xf) << 24); - break; - - default: - { - u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; - u64 redir_content; - - ASSERT(redir_index < IOAPIC_NUM_PINS); - - redir_content = ioapic->redirtbl[redir_index].bits; - result = (ioapic->ioregsel & 0x1) ? - (redir_content >> 32) & 0xffffffff : - redir_content & 0xffffffff; - break; - } - } - - return result; -} - -static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) -{ - union ioapic_redir_entry *pent; - - pent = &ioapic->redirtbl[idx]; - - if (!pent->fields.mask) { - ioapic_deliver(ioapic, idx); - if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) - pent->fields.remote_irr = 1; - } - if (!pent->fields.trig_mode) - ioapic->irr &= ~(1 << idx); -} - -static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) -{ - unsigned index; - - switch (ioapic->ioregsel) { - case IOAPIC_REG_VERSION: - /* Writes are ignored. */ - break; - - case IOAPIC_REG_APIC_ID: - ioapic->id = (val >> 24) & 0xf; - break; - - case IOAPIC_REG_ARB_ID: - break; - - default: - index = (ioapic->ioregsel - 0x10) >> 1; - - ioapic_debug("change redir index %x val %x\n", index, val); - if (index >= IOAPIC_NUM_PINS) - return; - if (ioapic->ioregsel & 1) { - ioapic->redirtbl[index].bits &= 0xffffffff; - ioapic->redirtbl[index].bits |= (u64) val << 32; - } else { - ioapic->redirtbl[index].bits &= ~0xffffffffULL; - ioapic->redirtbl[index].bits |= (u32) val; - ioapic->redirtbl[index].fields.remote_irr = 0; - } - if (ioapic->irr & (1 << index)) - ioapic_service(ioapic, index); - break; - } -} - -static void ioapic_inj_irq(struct kvm_ioapic *ioapic, - struct kvm_vcpu *vcpu, - u8 vector, u8 trig_mode, u8 delivery_mode) -{ - ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, - delivery_mode); - - ASSERT((delivery_mode == IOAPIC_FIXED) || - (delivery_mode == IOAPIC_LOWEST_PRIORITY)); - - kvm_apic_set_irq(vcpu, vector, trig_mode); -} - -static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode) -{ - u32 mask = 0; - int i; - struct kvm *kvm = ioapic->kvm; - struct kvm_vcpu *vcpu; - - ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); - - if (dest_mode == 0) { /* Physical mode. */ - if (dest == 0xFF) { /* Broadcast. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) - mask |= 1 << i; - return mask; - } - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { - if (vcpu->arch.apic) - mask = 1 << i; - break; - } - } - } else if (dest != 0) /* Logical mode, MDA non-zero. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (vcpu->arch.apic && - kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) - mask |= 1 << vcpu->vcpu_id; - } - ioapic_debug("mask %x\n", mask); - return mask; -} - -static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) -{ - u8 dest = ioapic->redirtbl[irq].fields.dest_id; - u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; - u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; - u8 vector = ioapic->redirtbl[irq].fields.vector; - u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; - u32 deliver_bitmask; - struct kvm_vcpu *vcpu; - int vcpu_id; - - ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x\n", - dest, dest_mode, delivery_mode, vector, trig_mode); - - deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); - if (!deliver_bitmask) { - ioapic_debug("no target on destination\n"); - return; - } - - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - else - ioapic_debug("null lowest prio vcpu: " - "mask=%x vector=%x delivery_mode=%x\n", - deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); - break; - case IOAPIC_FIXED: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - } - } - break; - - /* TODO: NMI */ - default: - printk(KERN_WARNING "Unsupported delivery mode %d\n", - delivery_mode); - break; - } -} - -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) -{ - u32 old_irr = ioapic->irr; - u32 mask = 1 << irq; - union ioapic_redir_entry entry; - - if (irq >= 0 && irq < IOAPIC_NUM_PINS) { - entry = ioapic->redirtbl[irq]; - level ^= entry.fields.polarity; - if (!level) - ioapic->irr &= ~mask; - else { - ioapic->irr |= mask; - if ((!entry.fields.trig_mode && old_irr != ioapic->irr) - || !entry.fields.remote_irr) - ioapic_service(ioapic, irq); - } - } -} - -static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) -{ - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - if (ioapic->redirtbl[i].fields.vector == vector) - return i; - return -1; -} - -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - union ioapic_redir_entry *ent; - int gsi; - - gsi = get_eoi_gsi(ioapic, vector); - if (gsi == -1) { - printk(KERN_WARNING "Can't find redir item for %d EOI\n", - vector); - return; - } - - ent = &ioapic->redirtbl[gsi]; - ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); - - ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) - ioapic_deliver(ioapic, gsi); -} - -static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) -{ - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; - - return ((addr >= ioapic->base_address && - (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); -} - -static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, - void *val) -{ - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; - u32 result; - - ioapic_debug("addr %lx\n", (unsigned long)addr); - ASSERT(!(addr & 0xf)); /* check alignment */ - - addr &= 0xff; - switch (addr) { - case IOAPIC_REG_SELECT: - result = ioapic->ioregsel; - break; - - case IOAPIC_REG_WINDOW: - result = ioapic_read_indirect(ioapic, addr, len); - break; - - default: - result = 0; - break; - } - switch (len) { - case 8: - *(u64 *) val = result; - break; - case 1: - case 2: - case 4: - memcpy(val, (char *)&result, len); - break; - default: - printk(KERN_WARNING "ioapic: wrong length %d\n", len); - } -} - -static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, - const void *val) -{ - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; - u32 data; - - ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", - (void*)addr, len, val); - ASSERT(!(addr & 0xf)); /* check alignment */ - if (len == 4 || len == 8) - data = *(u32 *) val; - else { - printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); - return; - } - - addr &= 0xff; - switch (addr) { - case IOAPIC_REG_SELECT: - ioapic->ioregsel = data; - break; - - case IOAPIC_REG_WINDOW: - ioapic_write_indirect(ioapic, data); - break; -#ifdef CONFIG_IA64 - case IOAPIC_REG_EOI: - kvm_ioapic_update_eoi(ioapic, data); - break; -#endif - - default: - break; - } -} - -void kvm_ioapic_reset(struct kvm_ioapic *ioapic) -{ - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - ioapic->redirtbl[i].fields.mask = 1; - ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; - ioapic->ioregsel = 0; - ioapic->irr = 0; - ioapic->id = 0; -} - -int kvm_ioapic_init(struct kvm *kvm) -{ - struct kvm_ioapic *ioapic; - - ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); - if (!ioapic) - return -ENOMEM; - kvm->arch.vioapic = ioapic; - kvm_ioapic_reset(ioapic); - ioapic->dev.read = ioapic_mmio_read; - ioapic->dev.write = ioapic_mmio_write; - ioapic->dev.in_range = ioapic_in_range; - ioapic->dev.private = ioapic; - ioapic->kvm = kvm; - kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); - return 0; -} diff --git a/drivers/kvm/iodev.h b/drivers/kvm/iodev.h index eb9e8a71843..c14e642027b 100644 --- a/drivers/kvm/iodev.h +++ b/drivers/kvm/iodev.h @@ -16,7 +16,7 @@ #ifndef __KVM_IODEV_H__ #define __KVM_IODEV_H__ -#include "types.h" +#include <linux/kvm_types.h> struct kvm_io_device { void (*read)(struct kvm_io_device *this, diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c deleted file mode 100644 index 59b47c55fc7..00000000000 --- a/drivers/kvm/irq.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * irq.c: API for in kernel interrupt controller - * Copyright (c) 2007, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * - */ - -#include <linux/module.h> - -#include "kvm.h" -#include "x86.h" -#include "irq.h" - -/* - * check if there is pending interrupt without - * intack. - */ -int kvm_cpu_has_interrupt(struct kvm_vcpu *v) -{ - struct kvm_pic *s; - - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); /* PIC */ - return s->output; - } else - return 0; - } - return 1; -} -EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); - -/* - * Read pending interrupt vector and intack. - */ -int kvm_cpu_get_interrupt(struct kvm_vcpu *v) -{ - struct kvm_pic *s; - int vector; - - vector = kvm_get_apic_interrupt(v); /* APIC */ - if (vector == -1) { - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); - s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(s); - } - } - return vector; -} -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); - -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif -} - -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int ipi_pcpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - if (vcpu->guest_mode) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); -} - -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) -{ - kvm_inject_apic_timer_irqs(vcpu); - /* TODO: PIT, RTC etc. */ -} -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); - -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - kvm_apic_timer_intr_post(vcpu, vec); - /* TODO: PIT, RTC etc. */ -} -EXPORT_SYMBOL_GPL(kvm_timer_intr_post); diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h deleted file mode 100644 index 6e023dc3f84..00000000000 --- a/drivers/kvm/irq.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2007, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include <linux/mm_types.h> -#include <linux/hrtimer.h> -#include <asm/kvm.h> -#include "iodev.h" -#include "kvm.h" - -struct kvm; -struct kvm_vcpu; - -typedef void irq_request_func(void *opaque, int level); - -struct kvm_kpic_state { - u8 last_irr; /* edge detection */ - u8 irr; /* interrupt request register */ - u8 imr; /* interrupt mask register */ - u8 isr; /* interrupt service register */ - u8 priority_add; /* highest irq priority */ - u8 irq_base; - u8 read_reg_select; - u8 poll; - u8 special_mask; - u8 init_state; - u8 auto_eoi; - u8 rotate_on_auto_eoi; - u8 special_fully_nested_mode; - u8 init4; /* true if 4 byte init */ - u8 elcr; /* PIIX edge/trigger selection */ - u8 elcr_mask; - struct kvm_pic *pics_state; -}; - -struct kvm_pic { - struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; - int output; /* intr from master PIC */ - struct kvm_io_device dev; -}; - -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); -void kvm_pic_update_irq(struct kvm_pic *s); - -#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ -#define IOAPIC_EDGE_TRIG 0 -#define IOAPIC_LEVEL_TRIG 1 - -#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 -#define IOAPIC_MEM_LENGTH 0x100 - -/* Direct registers. */ -#define IOAPIC_REG_SELECT 0x00 -#define IOAPIC_REG_WINDOW 0x10 -#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ - -/* Indirect registers. */ -#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ -#define IOAPIC_REG_VERSION 0x01 -#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ - -/*ioapic delivery mode*/ -#define IOAPIC_FIXED 0x0 -#define IOAPIC_LOWEST_PRIORITY 0x1 -#define IOAPIC_PMI 0x2 -#define IOAPIC_NMI 0x4 -#define IOAPIC_INIT 0x5 -#define IOAPIC_EXTINT 0x7 - -struct kvm_ioapic { - u64 base_address; - u32 ioregsel; - u32 id; - u32 irr; - u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; - struct kvm_io_device dev; - struct kvm *kvm; -}; - -struct kvm_lapic { - unsigned long base_address; - struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - ktime_t last_update; - struct hrtimer dev; - } timer; - struct kvm_vcpu *vcpu; - struct page *regs_page; - void *regs; -}; - -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return pic_irqchip(kvm) != NULL; -} - -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -int kvm_create_lapic(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); -void kvm_pic_reset(struct kvm_kpic_state *s); -void kvm_ioapic_reset(struct kvm_ioapic *ioapic); -void kvm_free_lapic(struct kvm_vcpu *vcpu); -u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_ioapic_init(struct kvm *kvm); -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); - -#endif diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h deleted file mode 100644 index bf6a3b330a3..00000000000 --- a/drivers/kvm/kvm.h +++ /dev/null @@ -1,289 +0,0 @@ -#ifndef __KVM_H -#define __KVM_H - -/* - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include <linux/types.h> -#include <linux/hardirq.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/preempt.h> -#include <asm/signal.h> - -#include <linux/kvm.h> -#include <linux/kvm_para.h> - -#include "types.h" - -#include "x86.h" - -#define KVM_MAX_VCPUS 4 -#define KVM_MEMORY_SLOTS 8 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 4 - -#define KVM_PIO_PAGE_OFFSET 1 - -/* - * vcpu->requests bit members - */ -#define KVM_REQ_TLB_FLUSH 0 - - -struct kvm_vcpu; -extern struct kmem_cache *kvm_vcpu_cache; - -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - -/* - * It would be nice to use something smarter than a linear search, TBD... - * Thankfully we dont expect many devices to register (famous last words :), - * so until then it will suffice. At least its abstracted so we can change - * in one place. - */ -struct kvm_io_bus { - int dev_count; -#define NR_IOBUS_DEVS 6 - struct kvm_io_device *devs[NR_IOBUS_DEVS]; -}; - -void kvm_io_bus_init(struct kvm_io_bus *bus); -void kvm_io_bus_destroy(struct kvm_io_bus *bus); -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); - -struct kvm_vcpu { - struct kvm *kvm; - struct preempt_notifier preempt_notifier; - int vcpu_id; - struct mutex mutex; - int cpu; - struct kvm_run *run; - int guest_mode; - unsigned long requests; - struct kvm_guest_debug guest_debug; - int fpu_active; - int guest_fpu_loaded; - wait_queue_head_t wq; - int sigset_active; - sigset_t sigset; - struct kvm_vcpu_stat stat; - -#ifdef CONFIG_HAS_IOMEM - int mmio_needed; - int mmio_read_completed; - int mmio_is_write; - int mmio_size; - unsigned char mmio_data[8]; - gpa_t mmio_phys_addr; -#endif - - struct kvm_vcpu_arch arch; -}; - -struct kvm_memory_slot { - gfn_t base_gfn; - unsigned long npages; - unsigned long flags; - unsigned long *rmap; - unsigned long *dirty_bitmap; - unsigned long userspace_addr; - int user_alloc; -}; - -struct kvm { - struct mutex lock; /* protects everything except vcpus */ - struct mm_struct *mm; /* userspace tied to this vm */ - int nmemslots; - struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + - KVM_PRIVATE_MEM_SLOTS]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; - struct list_head vm_list; - struct file *filp; - struct kvm_io_bus mmio_bus; - struct kvm_io_bus pio_bus; - struct kvm_vm_stat stat; - struct kvm_arch arch; -}; - -/* The guest did something we don't support. */ -#define pr_unimpl(vcpu, fmt, ...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ - } while (0) - -#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) -#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); - -void vcpu_load(struct kvm_vcpu *vcpu); -void vcpu_put(struct kvm_vcpu *vcpu); - -void decache_vcpus_on_cpu(int cpu); - - -int kvm_init(void *opaque, unsigned int vcpu_size, - struct module *module); -void kvm_exit(void); - -#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) -#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) -static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); - -extern struct page *bad_page; - -int is_error_page(struct page *page); -int kvm_is_error_hva(unsigned long addr); -int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc); -int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc); -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc); -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); -void kvm_release_page_clean(struct page *page); -void kvm_release_page_dirty(struct page *page); -int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, - int len); -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, - int offset, int len); -int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len); -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn); - -void kvm_vcpu_block(struct kvm_vcpu *vcpu); -void kvm_resched(struct kvm_vcpu *vcpu); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_flush_remote_tlbs(struct kvm *kvm); - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); - -int kvm_dev_ioctl_check_extension(long ext); - -int kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty); -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log); - -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - int user_alloc); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -void kvm_arch_destroy_vm(struct kvm *kvm); - -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); - -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr); - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs); -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs); -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); - -int kvm_arch_init(void *opaque); -void kvm_arch_exit(void); - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); - -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); - -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); -void kvm_arch_hardware_enable(void *garbage); -void kvm_arch_hardware_disable(void *garbage); -int kvm_arch_hardware_setup(void); -void kvm_arch_hardware_unsetup(void); -void kvm_arch_check_processor_compat(void *rtn); -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); - -void kvm_free_physmem(struct kvm *kvm); - -struct kvm *kvm_arch_create_vm(void); -void kvm_arch_destroy_vm(struct kvm *kvm); - -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -int kvm_cpu_has_interrupt(struct kvm_vcpu *v); - -static inline void kvm_guest_enter(void) -{ - account_system_vtime(current); - current->flags |= PF_VCPU; -} - -static inline void kvm_guest_exit(void) -{ - account_system_vtime(current); - current->flags &= ~PF_VCPU; -} - -static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - return slot - kvm->memslots; -} - -static inline gpa_t gfn_to_gpa(gfn_t gfn) -{ - return (gpa_t)gfn << PAGE_SHIFT; -} - -enum kvm_stat_kind { - KVM_STAT_VM, - KVM_STAT_VCPU, -}; - -struct kvm_stats_debugfs_item { - const char *name; - int offset; - enum kvm_stat_kind kind; - struct dentry *dentry; -}; -extern struct kvm_stats_debugfs_item debugfs_entries[]; - -#endif diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ae2a1bf640b..4026d7d6429 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -15,9 +15,9 @@ * */ -#include "kvm.h" #include "iodev.h" +#include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/module.h> #include <linux/errno.h> diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h deleted file mode 100644 index a0e415daef5..00000000000 --- a/drivers/kvm/kvm_svm.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef __KVM_SVM_H -#define __KVM_SVM_H - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/list.h> -#include <asm/msr.h> - -#include "svm.h" -#include "kvm.h" - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) -#define NUM_DB_REGS 4 - -struct kvm_vcpu; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - - unsigned long db_regs[NUM_DB_REGS]; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - u64 host_gs_base; - unsigned long host_cr2; - unsigned long host_db_regs[NUM_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; -}; - -#endif - diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c deleted file mode 100644 index 8c74bf184a0..00000000000 --- a/drivers/kvm/lapic.c +++ /dev/null @@ -1,1087 +0,0 @@ - -/* - * Local APIC virtualization - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2007 Novell - * Copyright (C) 2007 Intel - * - * Authors: - * Dor Laor <dor.laor@qumranet.com> - * Gregory Haskins <ghaskins@novell.com> - * Yaozu (Eddie) Dong <eddie.dong@intel.com> - * - * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include "kvm.h" -#include "x86.h" - -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/page.h> -#include <asm/current.h> -#include <asm/apicdef.h> -#include <asm/atomic.h> -#include <asm/div64.h> -#include "irq.h" - -#define PRId64 "d" -#define PRIx64 "llx" -#define PRIu64 "u" -#define PRIo64 "o" - -#define APIC_BUS_CYCLE_NS 1 - -/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) - -#define APIC_LVT_NUM 6 -/* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) -#define LAPIC_MMIO_LENGTH (1 << 12) -/* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 -#define MAX_APIC_VECTOR 256 - -#define VEC_POS(v) ((v) & (32 - 1)) -#define REG_POS(v) (((v) >> 5) << 4) - -static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) -{ - return *((u32 *) (apic->regs + reg_off)); -} - -static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) -{ - *((u32 *) (apic->regs + reg_off)) = val; -} - -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline void apic_set_vector(int vec, void *bitmap) -{ - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline void apic_clear_vector(int vec, void *bitmap) -{ - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_hw_enabled(struct kvm_lapic *apic) -{ - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; -} - -static inline int apic_sw_enabled(struct kvm_lapic *apic) -{ - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; -} - -static inline int apic_enabled(struct kvm_lapic *apic) -{ - return apic_sw_enabled(apic) && apic_hw_enabled(apic); -} - -#define LVT_MASK \ - (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) - -#define LINT_MASK \ - (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ - APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) - -static inline int kvm_apic_id(struct kvm_lapic *apic) -{ - return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; -} - -static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) -{ - return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); -} - -static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) -{ - return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; -} - -static inline int apic_lvtt_period(struct kvm_lapic *apic) -{ - return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; -} - -static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ -}; - -static int find_highest_vector(void *bitmap) -{ - u32 *word = bitmap; - int word_offset = MAX_APIC_VECTOR >> 5; - - while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) - continue; - - if (likely(!word_offset && !word[0])) - return -1; - else - return fls(word[word_offset << 2]) - 1 + (word_offset << 5); -} - -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) -{ - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); -} - -static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) -{ - apic_clear_vector(vec, apic->regs + APIC_IRR); -} - -static inline int apic_find_highest_irr(struct kvm_lapic *apic) -{ - int result; - - result = find_highest_vector(apic->regs + APIC_IRR); - ASSERT(result == -1 || result >= 16); - - return result; -} - -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; - - if (!apic) - return 0; - highest_irr = apic_find_highest_irr(apic); - - return highest_irr; -} -EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); - -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic_test_and_set_irr(vec, apic)) { - /* a new pending irq is set in IRR */ - if (trig) - apic_set_vector(vec, apic->regs + APIC_TMR); - else - apic_clear_vector(vec, apic->regs + APIC_TMR); - kvm_vcpu_kick(apic->vcpu); - return 1; - } - return 0; -} - -static inline int apic_find_highest_isr(struct kvm_lapic *apic) -{ - int result; - - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; -} - -static void apic_update_ppr(struct kvm_lapic *apic) -{ - u32 tpr, isrv, ppr; - int isr; - - tpr = apic_get_reg(apic, APIC_TASKPRI); - isr = apic_find_highest_isr(apic); - isrv = (isr != -1) ? isr : 0; - - if ((tpr & 0xf0) >= (isrv & 0xf0)) - ppr = tpr & 0xff; - else - ppr = isrv & 0xf0; - - apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", - apic, ppr, isr, isrv); - - apic_set_reg(apic, APIC_PROCPRI, ppr); -} - -static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) -{ - apic_set_reg(apic, APIC_TASKPRI, tpr); - apic_update_ppr(apic); -} - -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) -{ - return kvm_apic_id(apic) == dest; -} - -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) -{ - int result = 0; - u8 logical_id; - - logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); - - switch (apic_get_reg(apic, APIC_DFR)) { - case APIC_DFR_FLAT: - if (logical_id & mda) - result = 1; - break; - case APIC_DFR_CLUSTER: - if (((logical_id >> 4) == (mda >> 0x4)) - && (logical_id & mda & 0xf)) - result = 1; - break; - default: - printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); - break; - } - - return result; -} - -static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, int dest, int dest_mode) -{ - int result = 0; - struct kvm_lapic *target = vcpu->arch.apic; - - apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x", - target, source, dest, dest_mode, short_hand); - - ASSERT(!target); - switch (short_hand) { - case APIC_DEST_NOSHORT: - if (dest_mode == 0) { - /* Physical mode. */ - if ((dest == 0xFF) || (dest == kvm_apic_id(target))) - result = 1; - } else - /* Logical mode. */ - result = kvm_apic_match_logical_addr(target, dest); - break; - case APIC_DEST_SELF: - if (target == source) - result = 1; - break; - case APIC_DEST_ALLINC: - result = 1; - break; - case APIC_DEST_ALLBUT: - if (target != source) - result = 1; - break; - default: - printk(KERN_WARNING "Bad dest shorthand value %x\n", - short_hand); - break; - } - - return result; -} - -/* - * Add a pending IRQ into lapic. - * Return 1 if successfully added and 0 if discarded. - */ -static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode) -{ - int orig_irr, result = 0; - struct kvm_vcpu *vcpu = apic->vcpu; - - switch (delivery_mode) { - case APIC_DM_FIXED: - case APIC_DM_LOWEST: - /* FIXME add logic for vcpu on reset */ - if (unlikely(!apic_enabled(apic))) - break; - - orig_irr = apic_test_and_set_irr(vector, apic); - if (orig_irr && trig_mode) { - apic_debug("level trig mode repeatedly for vector %d", - vector); - break; - } - - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); - - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) - kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - } - - result = (orig_irr == 0); - break; - - case APIC_DM_REMRD: - printk(KERN_DEBUG "Ignoring delivery mode 3\n"); - break; - - case APIC_DM_SMI: - printk(KERN_DEBUG "Ignoring guest SMI\n"); - break; - case APIC_DM_NMI: - printk(KERN_DEBUG "Ignoring guest NMI\n"); - break; - - case APIC_DM_INIT: - if (level) { - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); - vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; - kvm_vcpu_kick(vcpu); - } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); - } - - break; - - case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { - vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - } - break; - - default: - printk(KERN_ERR "TODO: unsupported delivery mode %x\n", - delivery_mode); - break; - } - return result; -} - -static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - int last; - int next; - struct kvm_lapic *apic = NULL; - - last = kvm->arch.round_robin_prev_vcpu; - next = last; - - do { - if (++next == KVM_MAX_VCPUS) - next = 0; - if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) - continue; - apic = kvm->vcpus[next]->arch.apic; - if (apic && apic_enabled(apic)) - break; - apic = NULL; - } while (next != last); - kvm->arch.round_robin_prev_vcpu = next; - - if (!apic) - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); - - return apic; -} - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - struct kvm_lapic *apic; - - apic = kvm_apic_round_robin(kvm, vector, bitmap); - if (apic) - return apic->vcpu; - return NULL; -} - -static void apic_set_eoi(struct kvm_lapic *apic) -{ - int vector = apic_find_highest_isr(apic); - - /* - * Not every write EOI will has corresponding ISR, - * one example is when Kernel check timer on setup_IO_APIC - */ - if (vector == -1) - return; - - apic_clear_vector(vector, apic->regs + APIC_ISR); - apic_update_ppr(apic); - - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); -} - -static void apic_send_ipi(struct kvm_lapic *apic) -{ - u32 icr_low = apic_get_reg(apic, APIC_ICR); - u32 icr_high = apic_get_reg(apic, APIC_ICR2); - - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); - unsigned int short_hand = icr_low & APIC_SHORT_MASK; - unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; - unsigned int level = icr_low & APIC_INT_ASSERT; - unsigned int dest_mode = icr_low & APIC_DEST_MASK; - unsigned int delivery_mode = icr_low & APIC_MODE_MASK; - unsigned int vector = icr_low & APIC_VECTOR_MASK; - - struct kvm_vcpu *target; - struct kvm_vcpu *vcpu; - unsigned long lpr_map = 0; - int i; - - apic_debug("icr_high 0x%x, icr_low 0x%x, " - "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", - icr_high, icr_low, short_hand, dest, - trig_mode, level, dest_mode, delivery_mode, vector); - - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = apic->vcpu->kvm->vcpus[i]; - if (!vcpu) - continue; - - if (vcpu->arch.apic && - apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { - if (delivery_mode == APIC_DM_LOWEST) - set_bit(vcpu->vcpu_id, &lpr_map); - else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, - vector, level, trig_mode); - } - } - - if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); - if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, - vector, level, trig_mode); - } -} - -static u32 apic_get_tmcct(struct kvm_lapic *apic) -{ - u64 counter_passed; - ktime_t passed, now; - u32 tmcct; - - ASSERT(apic != NULL); - - now = apic->timer.dev.base->get_time(); - tmcct = apic_get_reg(apic, APIC_TMICT); - - /* if initial count is 0, current count should also be 0 */ - if (tmcct == 0) - return 0; - - if (unlikely(ktime_to_ns(now) <= - ktime_to_ns(apic->timer.last_update))) { - /* Wrap around */ - passed = ktime_add(( { - (ktime_t) { - .tv64 = KTIME_MAX - - (apic->timer.last_update).tv64}; } - ), now); - apic_debug("time elapsed\n"); - } else - passed = ktime_sub(now, apic->timer.last_update); - - counter_passed = div64_64(ktime_to_ns(passed), - (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); - - if (counter_passed > tmcct) { - if (unlikely(!apic_lvtt_period(apic))) { - /* one-shot timers stick at 0 until reset */ - tmcct = 0; - } else { - /* - * periodic timers reset to APIC_TMICT when they - * hit 0. The while loop simulates this happening N - * times. (counter_passed %= tmcct) would also work, - * but might be slower or not work on 32-bit?? - */ - while (counter_passed > tmcct) - counter_passed -= tmcct; - tmcct -= counter_passed; - } - } else { - tmcct -= counter_passed; - } - - return tmcct; -} - -static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) -{ - u32 val = 0; - - if (offset >= LAPIC_MMIO_LENGTH) - return 0; - - switch (offset) { - case APIC_ARBPRI: - printk(KERN_WARNING "Access APIC ARBPRI register " - "which is for P6\n"); - break; - - case APIC_TMCCT: /* Timer CCR */ - val = apic_get_tmcct(apic); - break; - - default: - apic_update_ppr(apic); - val = apic_get_reg(apic, offset); - break; - } - - return val; -} - -static void apic_mmio_read(struct kvm_io_device *this, - gpa_t address, int len, void *data) -{ - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - unsigned int offset = address - apic->base_address; - unsigned char alignment = offset & 0xf; - u32 result; - - if ((alignment + len) > 4) { - printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", - (unsigned long)address, len); - return; - } - result = __apic_read(apic, offset & ~0xf); - - switch (len) { - case 1: - case 2: - case 4: - memcpy(data, (char *)&result + alignment, len); - break; - default: - printk(KERN_ERR "Local APIC read with len = %x, " - "should be 1,2, or 4 instead\n", len); - break; - } -} - -static void update_divide_count(struct kvm_lapic *apic) -{ - u32 tmp1, tmp2, tdcr; - - tdcr = apic_get_reg(apic, APIC_TDCR); - tmp1 = tdcr & 0xf; - tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; - apic->timer.divide_count = 0x1 << (tmp2 & 0x7); - - apic_debug("timer divide count is 0x%x\n", - apic->timer.divide_count); -} - -static void start_apic_timer(struct kvm_lapic *apic) -{ - ktime_t now = apic->timer.dev.base->get_time(); - - apic->timer.last_update = now; - - apic->timer.period = apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->timer.divide_count; - atomic_set(&apic->timer.pending, 0); - hrtimer_start(&apic->timer.dev, - ktime_add_ns(now, apic->timer.period), - HRTIMER_MODE_ABS); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - apic_get_reg(apic, APIC_TMICT), - apic->timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->timer.period))); -} - -static void apic_mmio_write(struct kvm_io_device *this, - gpa_t address, int len, const void *data) -{ - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - unsigned int offset = address - apic->base_address; - unsigned char alignment = offset & 0xf; - u32 val; - - /* - * APIC register must be aligned on 128-bits boundary. - * 32/64/128 bits registers must be accessed thru 32 bits. - * Refer SDM 8.4.1 - */ - if (len != 4 || alignment) { - if (printk_ratelimit()) - printk(KERN_ERR "apic write: bad size=%d %lx\n", - len, (long)address); - return; - } - - val = *(u32 *) data; - - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __FUNCTION__, offset, len, val); - - offset &= 0xff0; - - switch (offset) { - case APIC_ID: /* Local APIC ID */ - apic_set_reg(apic, APIC_ID, val); - break; - - case APIC_TASKPRI: - apic_set_tpr(apic, val & 0xff); - break; - - case APIC_EOI: - apic_set_eoi(apic); - break; - - case APIC_LDR: - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); - break; - - case APIC_DFR: - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - break; - - case APIC_SPIV: - apic_set_reg(apic, APIC_SPIV, val & 0x3ff); - if (!(val & APIC_SPIV_APIC_ENABLED)) { - int i; - u32 lvt_val; - - for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = apic_get_reg(apic, - APIC_LVTT + 0x10 * i); - apic_set_reg(apic, APIC_LVTT + 0x10 * i, - lvt_val | APIC_LVT_MASKED); - } - atomic_set(&apic->timer.pending, 0); - - } - break; - - case APIC_ICR: - /* No delay here, so we always clear the pending bit */ - apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); - apic_send_ipi(apic); - break; - - case APIC_ICR2: - apic_set_reg(apic, APIC_ICR2, val & 0xff000000); - break; - - case APIC_LVTT: - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVT0: - case APIC_LVT1: - case APIC_LVTERR: - /* TODO: Check vector */ - if (!apic_sw_enabled(apic)) - val |= APIC_LVT_MASKED; - - val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; - apic_set_reg(apic, offset, val); - - break; - - case APIC_TMICT: - hrtimer_cancel(&apic->timer.dev); - apic_set_reg(apic, APIC_TMICT, val); - start_apic_timer(apic); - return; - - case APIC_TDCR: - if (val & 4) - printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); - apic_set_reg(apic, APIC_TDCR, val); - update_divide_count(apic); - break; - - default: - apic_debug("Local APIC Write to read-only register %x\n", - offset); - break; - } - -} - -static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) -{ - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - int ret = 0; - - - if (apic_hw_enabled(apic) && - (addr >= apic->base_address) && - (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) - ret = 1; - - return ret; -} - -void kvm_free_lapic(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.apic) - return; - - hrtimer_cancel(&vcpu->arch.apic->timer.dev); - - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); - - kfree(vcpu->arch.apic); -} - -/* - *---------------------------------------------------------------------- - * LAPIC interface - *---------------------------------------------------------------------- - */ - -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic) - return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); -} - -u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u64 tpr; - - if (!apic) - return 0; - tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); - - return (tpr & 0xf0) >> 4; -} -EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); - -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic) { - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } - if (apic->vcpu->vcpu_id) - value &= ~MSR_IA32_APICBASE_BSP; - - vcpu->arch.apic_base = value; - apic->base_address = apic->vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; - - /* with FSB delivery interrupt, we can restart APIC functionality */ - apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); - -} - -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_lapic_get_base); - -void kvm_lapic_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic; - int i; - - apic_debug("%s\n", __FUNCTION__); - - ASSERT(vcpu); - apic = vcpu->arch.apic; - ASSERT(apic != NULL); - - /* Stop the timer in case it's a reset to an active apic */ - hrtimer_cancel(&apic->timer.dev); - - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); - apic_set_reg(apic, APIC_LVR, APIC_VERSION); - - for (i = 0; i < APIC_LVT_NUM; i++) - apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); - apic_set_reg(apic, APIC_LVT0, - SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); - - apic_set_reg(apic, APIC_DFR, 0xffffffffU); - apic_set_reg(apic, APIC_SPIV, 0xff); - apic_set_reg(apic, APIC_TASKPRI, 0); - apic_set_reg(apic, APIC_LDR, 0); - apic_set_reg(apic, APIC_ESR, 0); - apic_set_reg(apic, APIC_ICR, 0); - apic_set_reg(apic, APIC_ICR2, 0); - apic_set_reg(apic, APIC_TDCR, 0); - apic_set_reg(apic, APIC_TMICT, 0); - for (i = 0; i < 8; i++) { - apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); - apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); - apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); - } - update_divide_count(apic); - atomic_set(&apic->timer.pending, 0); - if (vcpu->vcpu_id == 0) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - apic_update_ppr(apic); - - apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" - "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, - vcpu, kvm_apic_id(apic), - vcpu->arch.apic_base, apic->base_address); -} -EXPORT_SYMBOL_GPL(kvm_lapic_reset); - -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int ret = 0; - - if (!apic) - return 0; - ret = apic_enabled(apic); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_lapic_enabled); - -/* - *---------------------------------------------------------------------- - * timer interface - *---------------------------------------------------------------------- - */ - -/* TODO: make sure __apic_timer_fn runs in current pCPU */ -static int __apic_timer_fn(struct kvm_lapic *apic) -{ - int result = 0; - wait_queue_head_t *q = &apic->vcpu->wq; - - atomic_inc(&apic->timer.pending); - if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - wake_up_interruptible(q); - } - if (apic_lvtt_period(apic)) { - result = 1; - apic->timer.dev.expires = ktime_add_ns( - apic->timer.dev.expires, - apic->timer.period); - } - return result; -} - -static int __inject_apic_timer_irq(struct kvm_lapic *apic) -{ - int vector; - - vector = apic_lvt_vector(apic, APIC_LVTT); - return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); -} - -static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) -{ - struct kvm_lapic *apic; - int restart_timer = 0; - - apic = container_of(data, struct kvm_lapic, timer.dev); - - restart_timer = __apic_timer_fn(apic); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} - -int kvm_create_lapic(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic; - - ASSERT(vcpu != NULL); - apic_debug("apic_init %d\n", vcpu->vcpu_id); - - apic = kzalloc(sizeof(*apic), GFP_KERNEL); - if (!apic) - goto nomem; - - vcpu->arch.apic = apic; - - apic->regs_page = alloc_page(GFP_KERNEL); - if (apic->regs_page == NULL) { - printk(KERN_ERR "malloc apic regs error for vcpu %x\n", - vcpu->vcpu_id); - goto nomem_free_apic; - } - apic->regs = page_address(apic->regs_page); - memset(apic->regs, 0, PAGE_SIZE); - apic->vcpu = vcpu; - - hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->timer.dev.function = apic_timer_fn; - apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; - - kvm_lapic_reset(vcpu); - apic->dev.read = apic_mmio_read; - apic->dev.write = apic_mmio_write; - apic->dev.in_range = apic_mmio_range; - apic->dev.private = apic; - - return 0; -nomem_free_apic: - kfree(apic); -nomem: - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(kvm_create_lapic); - -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; - - if (!apic || !apic_enabled(apic)) - return -1; - - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; -} - -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) -{ - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); - int r = 0; - - if (vcpu->vcpu_id == 0) { - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - } - return r; -} - -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic && apic_lvt_enabled(apic, APIC_LVTT) && - atomic_read(&apic->timer.pending) > 0) { - if (__inject_apic_timer_irq(apic)) - atomic_dec(&apic->timer.pending); - } -} - -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) - apic->timer.last_update = ktime_add_ns( - apic->timer.last_update, - apic->timer.period); -} - -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) -{ - int vector = kvm_apic_has_interrupt(vcpu); - struct kvm_lapic *apic = vcpu->arch.apic; - - if (vector == -1) - return -1; - - apic_set_vector(vector, apic->regs + APIC_ISR); - apic_update_ppr(apic); - apic_clear_irr(vector, apic); - return vector; -} - -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - apic->base_address = vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; - apic_set_reg(apic, APIC_LVR, APIC_VERSION); - apic_update_ppr(apic); - hrtimer_cancel(&apic->timer.dev); - update_divide_count(apic); - start_apic_timer(apic); -} - -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct hrtimer *timer; - - if (!apic) - return; - - timer = &apic->timer.dev; - if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); -} -EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c deleted file mode 100644 index c26d83f86a3..00000000000 --- a/drivers/kvm/mmu.c +++ /dev/null @@ -1,1806 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "vmx.h" -#include "kvm.h" -#include "x86.h" -#include "mmu.h" - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/swap.h> - -#include <asm/page.h> -#include <asm/cmpxchg.h> -#include <asm/io.h> - -#undef MMU_DEBUG - -#undef AUDIT - -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif - -#ifdef MMU_DEBUG - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - -#else - -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) - -#endif - -#if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } -#endif - -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_SHIFT 63 -#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - - -#define PT_FIRST_AVAIL_BITS_SHIFT 9 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 - -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) - -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) - -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - -#define RMAP_EXT 4 - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; - struct kvm_rmap_desc *more; -}; - -static struct kmem_cache *pte_chain_cache; -static struct kmem_cache *rmap_desc_cache; -static struct kmem_cache *mmu_page_header_cache; - -static u64 __read_mostly shadow_trap_nonpresent_pte; -static u64 __read_mostly shadow_notrap_nonpresent_pte; - -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) -{ - shadow_trap_nonpresent_pte = trap_pte; - shadow_notrap_nonpresent_pte = notrap_pte; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); - -static int is_write_protection(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_WP; -} - -static int is_cpuid_PSE36(void) -{ - return 1; -} - -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.shadow_efer & EFER_NX; -} - -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - -static int is_shadow_present_pte(u64 pte) -{ - pte &= ~PT_SHADOW_IO_MARK; - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; -} - -static int is_writeble_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -static int is_dirty_pte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - -static int is_io_pte(unsigned long pte) -{ - return pte & PT_SHADOW_IO_MARK; -} - -static int is_rmap_pte(u64 pte) -{ - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; -} - -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - -static void set_shadow_pte(u64 *sptep, u64 spte) -{ -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); - if (!obj) - return -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - struct page *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - set_page_private(page, 0); - cache->objects[cache->nobjs++] = page_address(page); - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) -{ - int r; - - kvm_mmu_free_some_pages(vcpu); - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, - pte_chain_cache, 4); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 1); - if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; -} - -static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - memset(p, 0, size); - return p; -} - -static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, - sizeof(struct kvm_pte_chain)); -} - -static void mmu_free_pte_chain(struct kvm_pte_chain *pc) -{ - kfree(pc); -} - -static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, - sizeof(struct kvm_rmap_desc)); -} - -static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) -{ - kfree(rd); -} - -/* - * Take gfn and return the reverse mapping to it. - * Note: gfn must be unaliased before this function get called - */ - -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); - return &slot->rmap[gfn - slot->base_gfn]; -} - -/* - * Reverse mapping data structures: - * - * If rmapp bit zero is zero, then rmapp point to the shadw page table entry - * that points to page_address(page). - * - * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc - * containing more mappings. - */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct kvm_rmap_desc *desc; - unsigned long *rmapp; - int i; - - if (!is_rmap_pte(*spte)) - return; - gfn = unalias_gfn(vcpu->kvm, gfn); - sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn); - if (!*rmapp) { - rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - *rmapp = (unsigned long)spte; - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; - *rmapp = (unsigned long)desc | 1; - } else { - rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) - desc = desc->more; - if (desc->shadow_ptes[RMAP_EXT-1]) { - desc->more = mmu_alloc_rmap_desc(vcpu); - desc = desc->more; - } - for (i = 0; desc->shadow_ptes[i]; ++i) - ; - desc->shadow_ptes[i] = spte; - } -} - -static void rmap_desc_remove_entry(unsigned long *rmapp, - struct kvm_rmap_desc *desc, - int i, - struct kvm_rmap_desc *prev_desc) -{ - int j; - - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) - ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; - if (j != 0) - return; - if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; - else - if (prev_desc) - prev_desc->more = desc->more; - else - *rmapp = (unsigned long)desc->more | 1; - mmu_free_rmap_desc(desc); -} - -static void rmap_remove(struct kvm *kvm, u64 *spte) -{ - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - struct kvm_mmu_page *sp; - struct page *page; - unsigned long *rmapp; - int i; - - if (!is_rmap_pte(*spte)) - return; - sp = page_header(__pa(spte)); - page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - mark_page_accessed(page); - if (is_writeble_pte(*spte)) - kvm_release_page_dirty(page); - else - kvm_release_page_clean(page); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); - if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); - BUG(); - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); - if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); - BUG(); - } - *rmapp = 0; - } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { - rmap_desc_remove_entry(rmapp, - desc, i, - prev_desc); - return; - } - prev_desc = desc; - desc = desc->more; - } - BUG(); - } -} - -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -{ - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - u64 *prev_spte; - int i; - - if (!*rmapp) - return NULL; - else if (!(*rmapp & 1)) { - if (!spte) - return (u64 *)*rmapp; - return NULL; - } - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; - prev_spte = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { - if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; - } - desc = desc->more; - } - return NULL; -} - -static void rmap_write_protect(struct kvm *kvm, u64 gfn) -{ - unsigned long *rmapp; - u64 *spte; - - gfn = unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn); - - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!spte); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); - kvm_flush_remote_tlbs(kvm); - spte = rmap_next(kvm, rmapp, spte); - } -} - -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) -{ - u64 *pos; - u64 *end; - - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { - printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, - pos, *pos); - return 0; - } - return 1; -} -#endif - -static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - ASSERT(is_empty_shadow_page(sp->spt)); - list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); - kfree(sp); - ++kvm->arch.n_free_mmu_pages; -} - -static unsigned kvm_page_table_hashfn(gfn_t gfn) -{ - return gfn; -} - -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) -{ - struct kvm_mmu_page *sp; - - if (!vcpu->kvm->arch.n_free_mmu_pages) - return NULL; - - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; - sp->multimapped = 0; - sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; - return sp; -} - -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!parent_pte) - return; - if (!sp->multimapped) { - u64 *old = sp->parent_pte; - - if (!old) { - sp->parent_pte = parent_pte; - return; - } - sp->multimapped = 1; - pte_chain = mmu_alloc_pte_chain(vcpu); - INIT_HLIST_HEAD(&sp->parent_ptes); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = old; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { - if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) - continue; - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) - if (!pte_chain->parent_ptes[i]) { - pte_chain->parent_ptes[i] = parent_pte; - return; - } - } - pte_chain = mmu_alloc_pte_chain(vcpu); - BUG_ON(!pte_chain); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = parent_pte; -} - -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, - u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!sp->multimapped) { - BUG_ON(sp->parent_pte != parent_pte); - sp->parent_pte = NULL; - return; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - if (pte_chain->parent_ptes[i] != parent_pte) - continue; - while (i + 1 < NR_PTE_CHAIN_ENTRIES - && pte_chain->parent_ptes[i + 1]) { - pte_chain->parent_ptes[i] - = pte_chain->parent_ptes[i + 1]; - ++i; - } - pte_chain->parent_ptes[i] = NULL; - if (i == 0) { - hlist_del(&pte_chain->link); - mmu_free_pte_chain(pte_chain); - if (hlist_empty(&sp->parent_ptes)) { - sp->multimapped = 0; - sp->parent_pte = NULL; - } - } - return; - } - BUG(); -} - -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: found role %x\n", - __FUNCTION__, sp->role.word); - return sp; - } - return NULL; -} - -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int metaphysical, - unsigned access, - u64 *parent_pte, - bool *new_page) -{ - union kvm_mmu_page_role role; - unsigned index; - unsigned quadrant; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; - role.level = level; - role.metaphysical = metaphysical; - role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, - gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - pgprintk("%s: found\n", __FUNCTION__); - return sp; - } - sp = kvm_mmu_alloc_page(vcpu, parent_pte); - if (!sp) - return sp; - pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); - sp->gfn = gfn; - sp->role = role; - hlist_add_head(&sp->hash_link, bucket); - vcpu->arch.mmu.prefetch_page(vcpu, sp); - if (!metaphysical) - rmap_write_protect(vcpu->kvm, gfn); - if (new_page) - *new_page = 1; - return sp; -} - -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) -{ - unsigned i; - u64 *pt; - u64 ent; - - pt = sp->spt; - - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - kvm_flush_remote_tlbs(kvm); - return; - } - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - ent = pt[i]; - - pt[i] = shadow_trap_nonpresent_pte; - if (!is_shadow_present_pte(ent)) - continue; - ent &= PT64_BASE_ADDR_MASK; - mmu_page_remove_parent_pte(page_header(ent), &pt[i]); - } - kvm_flush_remote_tlbs(kvm); -} - -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - -static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) -{ - int i; - - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; -} - -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - u64 *parent_pte; - - ++kvm->stat.mmu_shadow_zapped; - while (sp->multimapped || sp->parent_pte) { - if (!sp->multimapped) - parent_pte = sp->parent_pte; - else { - struct kvm_pte_chain *chain; - - chain = container_of(sp->parent_ptes.first, - struct kvm_pte_chain, link); - parent_pte = chain->parent_ptes[0]; - } - BUG_ON(!parent_pte); - kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); - } - kvm_mmu_page_unlink_children(kvm, sp); - if (!sp->root_count) { - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); - } else - list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_mmu_reset_last_pte_updated(kvm); -} - -/* - * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock - */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) -{ - /* - * If we set the number of mmu pages to be smaller be than the - * number of actived pages , we must to free some mmu pages before we - * change the value - */ - - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { - struct kvm_mmu_page *page; - - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; - } - kvm->arch.n_free_mmu_pages = 0; - } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; - - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; -} - -static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - int r; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, - sp->role.word); - kvm_mmu_zap_page(kvm, sp); - r = 1; - } - return r; -} - -static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); - } -} - -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, &sp->slot_bitmap); -} - -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return NULL; - return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); -} - -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, gfn_t gfn) -{ - u64 spte; - int was_rmapped = is_rmap_pte(*shadow_pte); - struct page *page; - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __FUNCTION__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); - - /* - * We don't set the accessed bit, since we sometimes want to see - * whether the guest actually used the pte (in order to detect - * demand paging). - */ - spte = PT_PRESENT_MASK | PT_DIRTY_MASK; - if (!dirty) - pte_access &= ~ACC_WRITE_MASK; - if (!(pte_access & ACC_EXEC_MASK)) - spte |= PT64_NX_MASK; - - page = gfn_to_page(vcpu->kvm, gfn); - - spte |= PT_PRESENT_MASK; - if (pte_access & ACC_USER_MASK) - spte |= PT_USER_MASK; - - if (is_error_page(page)) { - set_shadow_pte(shadow_pte, - shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); - kvm_release_page_clean(page); - return; - } - - spte |= page_to_phys(page); - - if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; - - spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } - - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { - spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; - } - } - -unshadowed: - - if (pte_access & ACC_WRITE_MASK) - mark_page_dirty(vcpu->kvm, gfn); - - pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); - set_shadow_pte(shadow_pte, spte); - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); - if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn); - if (!is_rmap_pte(*shadow_pte)) - kvm_release_page_clean(page); - } - else - kvm_release_page_clean(page); - if (!ptwrite || !*ptwrite) - vcpu->arch.last_pte_updated = shadow_pte; -} - -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ -} - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -{ - int level = PT32E_ROOT_LEVEL; - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); - - if (level == 1) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, gfn); - return pt_write || is_io_pte(table[index]); - } - - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index], - NULL); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - return -ENOMEM; - } - - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; - } -} - -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; -} - -static void mmu_free_roots(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - sp = page_header(root); - --sp->root_count; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - --sp->root_count; - } - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; -} - -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - int i; - gfn_t root_gfn; - struct kvm_mmu_page *sp; - - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - ASSERT(!VALID_PAGE(root)); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - vcpu->arch.mmu.root_hpa = root; - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { - vcpu->arch.mmu.pae_root[i] = 0; - continue; - } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), - ACC_ALL, NULL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); -} - -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - return vaddr; -} - -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code) -{ - gfn_t gfn; - int r; - - pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - gfn = gva >> PAGE_SHIFT; - - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn); -} - -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - context->new_cr3 = nonpaging_new_cr3; - context->page_fault = nonpaging_page_fault; - context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; - context->prefetch_page = nonpaging_prefetch_page; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); -} - -static void paging_new_cr3(struct kvm_vcpu *vcpu) -{ - pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); - mmu_free_roots(vcpu); -} - -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) -{ - kvm_inject_page_fault(vcpu, addr, err_code); -} - -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - -#define PTTYPE 64 -#include "paging_tmpl.h" -#undef PTTYPE - -#define PTTYPE 32 -#include "paging_tmpl.h" -#undef PTTYPE - -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; - context->page_fault = paging64_page_fault; - context->gva_to_gpa = paging64_gva_to_gpa; - context->prefetch_page = paging64_prefetch_page; - context->free = paging_free; - context->root_level = level; - context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging64_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); -} - -static int paging32_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - context->new_cr3 = paging_new_cr3; - context->page_fault = paging32_page_fault; - context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; - context->prefetch_page = paging32_prefetch_page; - context->root_level = PT32_ROOT_LEVEL; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging32E_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); -} - -static int init_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); - else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); - else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); - else - return paging32_init_context(vcpu); -} - -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { - vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } -} - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); - -int kvm_mmu_load(struct kvm_vcpu *vcpu) -{ - int r; - - mutex_lock(&vcpu->kvm->lock); - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - mmu_alloc_roots(vcpu); - kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); -out: - mutex_unlock(&vcpu->kvm->lock); - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_load); - -void kvm_mmu_unload(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte) -{ - u64 pte; - struct kvm_mmu_page *child; - - pte = *spte; - if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL) - rmap_remove(vcpu->kvm, spte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); - } - } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); -} - -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, - const void *new, int bytes, - int offset_in_pte) -{ - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } - - ++vcpu->kvm->stat.mmu_pte_updated; - if (sp->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); - else - paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); -} - -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) - return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; - return (old & ~new & PT64_PERM_MASK) != 0; -} - -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) -{ - if (need_remote_flush(old, new)) - kvm_flush_remote_tlbs(vcpu->kvm); - else - kvm_mmu_flush_tlb(vcpu); -} - -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) -{ - u64 *spte = vcpu->arch.last_pte_updated; - - return !!(spte && (*spte & PT_ACCESSED_MASK)); -} - -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; - u64 entry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - - pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); - ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) - continue; - pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; - if (misaligned || flooded) { - /* - * Misaligned accesses are too much trouble to fix - * up; also, they usually indicate a page is not used - * as a page table. - * - * If we're seeing too many writes to a page, - * it may no longer be a page table, or we may be - * forking, in which case it is better to unmap the - * page. - */ - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); - ++vcpu->kvm->stat.mmu_flooded; - continue; - } - page_offset = offset; - level = sp->role.level; - npte = 1; - if (sp->role.glevels == PT32_ROOT_LEVEL) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. - */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - npte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != sp->role.quadrant) - continue; - } - spte = &sp->spt[page_offset / sizeof(*spte)]; - while (npte--) { - entry = *spte; - mmu_pte_write_zap_pte(vcpu, sp, spte); - mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, - page_offset & (pte_size - 1)); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); - ++spte; - } - } - kvm_mmu_audit(vcpu, "post pte write"); -} - -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - - return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); -} - -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { - struct kvm_mmu_page *sp; - - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - ++vcpu->kvm->stat.mmu_recycled; - } -} - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) -{ - int r; - enum emulation_result er; - - mutex_lock(&vcpu->kvm->lock); - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); - if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } - - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - - er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); - mutex_unlock(&vcpu->kvm->lock); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - return 0; - case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; - default: - BUG(); - } -out: - mutex_unlock(&vcpu->kvm->lock); - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); - -static void free_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - } - free_page((unsigned long)vcpu->arch.mmu.pae_root); -} - -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct page *page; - int i; - - ASSERT(vcpu); - - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. - */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - goto error_1; - vcpu->arch.mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - - return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; -} - -int kvm_mmu_create(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return alloc_mmu_pages(vcpu); -} - -int kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return init_kvm_mmu(vcpu); -} - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; - - if (!test_bit(slot, &sp->slot_bitmap)) - continue; - - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) - pt[i] &= ~PT_WRITABLE_MASK; - } -} - -void kvm_mmu_zap_all(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); - - kvm_flush_remote_tlbs(kvm); -} - -void kvm_mmu_module_exit(void) -{ - if (pte_chain_cache) - kmem_cache_destroy(pte_chain_cache); - if (rmap_desc_cache) - kmem_cache_destroy(rmap_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); - if (!pte_chain_cache) - goto nomem; - rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), - 0, 0, NULL); - if (!rmap_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - return 0; - -nomem: - kvm_mmu_module_exit(); - return -ENOMEM; -} - -/* - * Caculate mmu pages needed for kvm. - */ -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) -{ - int i; - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; - - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - -#ifdef AUDIT - -static const char *audit_msg; - -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; -#endif - return gva; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; - - if (ent == shadow_trap_nonpresent_pte) - continue; - - va = canonicalize(va); - if (level > 1) { - if (ent == shadow_notrap_nonpresent_pte) - printk(KERN_ERR "audit: (%s) nontrapping pte" - " in nonleaf level: levels %d gva %lx" - " level %d pte %llx\n", audit_msg, - vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); - } else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - struct page *page = gpa_to_page(vcpu, gpa); - hpa_t hpa = page_to_phys(page); - - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_page_clean(page); - - } - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - int i, j, k; - - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - return nmaps; -} - -static int count_writable_mappings(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!(ent & PT_WRITABLE_MASK)) - continue; - ++nmaps; - } - } - return nmaps; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __FUNCTION__, audit_msg, n_rmap, n_actual); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct kvm_memory_slot *slot; - unsigned long *rmapp; - gfn_t gfn; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", - __FUNCTION__, audit_msg, sp->gfn, - sp->role.word); - } -} - -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) -{ - int olddbg = dbg; - - dbg = 0; - audit_msg = msg; - audit_rmap(vcpu); - audit_write_protection(vcpu); - audit_mappings(vcpu); - dbg = olddbg; -} - -#endif diff --git a/drivers/kvm/mmu.h b/drivers/kvm/mmu.h deleted file mode 100644 index cbfc272262d..00000000000 --- a/drivers/kvm/mmu.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef __KVM_X86_MMU_H -#define __KVM_X86_MMU_H - -#include "kvm.h" - -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); -} - -static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) -{ - if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) - return 0; - - return kvm_mmu_load(vcpu); -} - -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LME; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - -#endif diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h deleted file mode 100644 index 56b88f7e83e..00000000000 --- a/drivers/kvm/paging_tmpl.h +++ /dev/null @@ -1,461 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -/* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. - */ - -#if PTTYPE == 64 - #define pt_element_t u64 - #define guest_walker guest_walker64 - #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS - #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 - #define CMPXCHG cmpxchg - #else - #define CMPXCHG cmpxchg64 - #define PT_MAX_FULL_LEVELS 2 - #endif -#elif PTTYPE == 32 - #define pt_element_t u32 - #define guest_walker guest_walker32 - #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS - #define PT_MAX_FULL_LEVELS 2 - #define CMPXCHG cmpxchg -#else - #error Invalid PTTYPE value -#endif - -#define gpte_to_gfn FNAME(gpte_to_gfn) -#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) - -/* - * The guest_walker structure emulates the behavior of the hardware page - * table walker. - */ -struct guest_walker { - int level; - gfn_t table_gfn[PT_MAX_FULL_LEVELS]; - pt_element_t ptes[PT_MAX_FULL_LEVELS]; - gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; - gfn_t gfn; - u32 error_code; -}; - -static gfn_t gpte_to_gfn(pt_element_t gpte) -{ - return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t gpte_to_gfn_pde(pt_element_t gpte) -{ - return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - pt_element_t ret; - pt_element_t *table; - struct page *page; - - page = gfn_to_page(kvm, table_gfn); - table = kmap_atomic(page, KM_USER0); - - ret = CMPXCHG(&table[index], orig_pte, new_pte); - - kunmap_atomic(table, KM_USER0); - - kvm_release_page_dirty(page); - - return (ret != orig_pte); -} - -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; -#if PTTYPE == 64 - if (is_nx(vcpu)) - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - -/* - * Fetch a guest pte for a guest virtual address - */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) -{ - pt_element_t pte; - gfn_t table_gfn; - unsigned index, pt_access, pte_access; - gpa_t pte_gpa; - - pgprintk("%s: addr %lx\n", __FUNCTION__, addr); -walk: - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; -#if PTTYPE == 64 - if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; - if (!is_present_pte(pte)) - goto not_present; - --walker->level; - } -#endif - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); - - pt_access = ACC_ALL; - - for (;;) { - index = PT_INDEX(addr, walker->level); - - table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); - walker->table_gfn[walker->level - 1] = table_gfn; - walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, - walker->level - 1, table_gfn); - - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); - - if (!is_present_pte(pte)) - goto not_present; - - if (write_fault && !is_writeble_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - goto access_error; - - if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; - -#if PTTYPE == 64 - if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) - goto access_error; -#endif - - if (!(pte & PT_ACCESSED_MASK)) { - mark_page_dirty(vcpu->kvm, table_gfn); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) - goto walk; - pte |= PT_ACCESSED_MASK; - } - - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - - walker->ptes[walker->level - 1] = pte; - - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = gpte_to_gfn(pte); - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (pte & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = gpte_to_gfn_pde(pte); - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - if (PTTYPE == 32 && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); - break; - } - - pt_access = pte_access; - --walker->level; - } - - if (write_fault && !is_dirty_pte(pte)) { - bool ret; - - mark_page_dirty(vcpu->kvm, table_gfn); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret) - goto walk; - pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); - walker->ptes[walker->level - 1] = pte; - } - - walker->pt_access = pt_access; - walker->pte_access = pte_access; - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __FUNCTION__, (u64)pte, pt_access, pte_access); - return 1; - -not_present: - walker->error_code = 0; - goto err; - -access_error: - walker->error_code = PFERR_PRESENT_MASK; - -err: - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) - walker->error_code |= PFERR_FETCH_MASK; - return 0; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes, - int offset_in_pte) -{ - pt_element_t gpte; - unsigned pte_access; - - gpte = *(const pt_element_t *)pte; - if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!offset_in_pte && !is_present_pte(gpte)) - set_shadow_pte(spte, shadow_notrap_nonpresent_pte); - return; - } - if (bytes < sizeof(pt_element_t)) - return; - pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte)); -} - -/* - * Fetch a shadow pte for a specific level in the paging hierarchy. - */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int *ptwrite) -{ - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; - } - - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - bool new_page = 0; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (is_shadow_present_pte(*shadow_ent)) { - if (level == PT_PAGE_TABLE_LEVEL) - break; - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) - break; - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent, &new_page); - if (new_page && !metaphysical) { - pt_element_t curr_pte; - kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (curr_pte != walker->ptes[level - 2]) - return NULL; - } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; - } - - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, walker->gfn); - - return shadow_ent; -} - -/* - * Page fault handler. There are several causes for a page fault: - * - there is no shadow pte for the guest pte - * - write access through a shadow pte marked read only so that we can set - * the dirty bit - * - write access to a shadow pte marked read only so we can update the page - * dirty bitmap, when userspace requests it - * - mmio access; in this case we will never install a present shadow pte - * - normal guest page fault due to the guest pte marked not present, not - * writable, or not executable - * - * Returns: 1 if we need to emulate the instruction, 0 otherwise, or - * a negative value on error. - */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, - u32 error_code) -{ - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; - struct guest_walker walker; - u64 *shadow_pte; - int write_pt = 0; - int r; - - pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - /* - * Look up the shadow pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); - - /* - * The page is not mapped by the guest. Let the guest handle it. - */ - if (!r) { - pgprintk("%s: guest page fault\n", __FUNCTION__); - inject_page_fault(vcpu, addr, walker.error_code); - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - return 0; - } - - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - &write_pt); - pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, - shadow_pte, *shadow_pte, write_pt); - - if (!write_pt) - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - - /* - * mmio: emulate if accessible, otherwise its a guest fault. - */ - if (shadow_pte && is_io_pte(*shadow_pte)) - return 1; - - ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); - - return write_pt; -} - -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; - } - - return gpa; -} - -static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i, offset = 0; - pt_element_t *gpt; - struct page *page; - - if (sp->role.metaphysical - || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { - nonpaging_prefetch_page(vcpu, sp); - return; - } - - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - page = gfn_to_page(vcpu->kvm, sp->gfn); - gpt = kmap_atomic(page, KM_USER0); - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - if (is_present_pte(gpt[offset + i])) - sp->spt[i] = shadow_trap_nonpresent_pte; - else - sp->spt[i] = shadow_notrap_nonpresent_pte; - kunmap_atomic(gpt, KM_USER0); - kvm_release_page_clean(page); -} - -#undef pt_element_t -#undef guest_walker -#undef FNAME -#undef PT_BASE_ADDR_MASK -#undef PT_INDEX -#undef SHADOW_PT_INDEX -#undef PT_LEVEL_MASK -#undef PT_DIR_BASE_ADDR_MASK -#undef PT_LEVEL_BITS -#undef PT_MAX_FULL_LEVELS -#undef gpte_to_gfn -#undef gpte_to_gfn_pde -#undef CMPXCHG diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h deleted file mode 100644 index 56fc4c87338..00000000000 --- a/drivers/kvm/segment_descriptor.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef __SEGMENT_DESCRIPTOR_H -#define __SEGMENT_DESCRIPTOR_H - -struct segment_descriptor { - u16 limit_low; - u16 base_low; - u8 base_mid; - u8 type : 4; - u8 system : 1; - u8 dpl : 2; - u8 present : 1; - u8 limit_high : 4; - u8 avl : 1; - u8 long_mode : 1; - u8 default_op : 1; - u8 granularity : 1; - u8 base_high; -} __attribute__((packed)); - -#ifdef CONFIG_X86_64 -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct segment_descriptor_64 { - struct segment_descriptor s; - u32 base_higher; - u32 pad_zero; -}; - -#endif -#endif diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c deleted file mode 100644 index e606f6d1866..00000000000 --- a/drivers/kvm/svm.c +++ /dev/null @@ -1,1725 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * AMD SVM support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ -#include "x86.h" -#include "kvm_svm.h" -#include "x86_emulate.h" -#include "irq.h" -#include "mmu.h" - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <linux/sched.h> - -#include <asm/desc.h> - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -#define IOPM_ALLOC_ORDER 2 -#define MSRPM_ALLOC_ORDER 1 - -#define DB_VECTOR 1 -#define UD_VECTOR 6 -#define GP_VECTOR 13 - -#define DR7_GD_MASK (1 << 13) -#define DR6_BD_MASK (1 << 13) - -#define SEG_TYPE_LDT 2 -#define SEG_TYPE_BUSY_TSS16 3 - -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_DEATURE_SVML (1 << 2) - -static void kvm_reput_irq(struct vcpu_svm *svm); - -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_svm, vcpu); -} - -unsigned long iopm_base; -unsigned long msrpm_base; - -struct kvm_ldttss_desc { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; -} __attribute__((packed)); - -struct svm_cpu_data { - int cpu; - - u64 asid_generation; - u32 max_asid; - u32 next_asid; - struct kvm_ldttss_desc *tss_desc; - - struct page *save_area; -}; - -static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -static uint32_t svm_features; - -struct svm_init_data { - int cpu; - int r; -}; - -static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; - -#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -#define MSRS_RANGE_SIZE 2048 -#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) - -#define MAX_INST_SIZE 15 - -static inline u32 svm_has(u32 feat) -{ - return svm_features & feat; -} - -static inline u8 pop_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - return irq; -} - -static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) -{ - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); -} - -static inline void clgi(void) -{ - asm volatile (SVM_CLGI); -} - -static inline void stgi(void) -{ - asm volatile (SVM_STGI); -} - -static inline void invlpga(unsigned long addr, u32 asid) -{ - asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); -} - -static inline unsigned long kvm_read_cr2(void) -{ - unsigned long cr2; - - asm volatile ("mov %%cr2, %0" : "=r" (cr2)); - return cr2; -} - -static inline void kvm_write_cr2(unsigned long val) -{ - asm volatile ("mov %0, %%cr2" :: "r" (val)); -} - -static inline unsigned long read_dr6(void) -{ - unsigned long dr6; - - asm volatile ("mov %%dr6, %0" : "=r" (dr6)); - return dr6; -} - -static inline void write_dr6(unsigned long val) -{ - asm volatile ("mov %0, %%dr6" :: "r" (val)); -} - -static inline unsigned long read_dr7(void) -{ - unsigned long dr7; - - asm volatile ("mov %%dr7, %0" : "=r" (dr7)); - return dr7; -} - -static inline void write_dr7(unsigned long val) -{ - asm volatile ("mov %0, %%dr7" :: "r" (val)); -} - -static inline void force_new_asid(struct kvm_vcpu *vcpu) -{ - to_svm(vcpu)->asid_generation--; -} - -static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) -{ - force_new_asid(vcpu); -} - -static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - if (!(efer & EFER_LMA)) - efer &= ~EFER_LME; - - to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; - vcpu->arch.shadow_efer = efer; -} - -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; -} - -static bool svm_exception_injected(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); -} - -static int is_external_interrupt(u32 info) -{ - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); -} - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (!svm->next_rip) { - printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); - return; - } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", - __FUNCTION__, - svm->vmcb->save.rip, - svm->next_rip); - - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; - svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - - vcpu->arch.interrupt_window_open = 1; -} - -static int has_svm(void) -{ - uint32_t eax, ebx, ecx, edx; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - printk(KERN_INFO "has_svm: not amd\n"); - return 0; - } - - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { - printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); - return 0; - } - - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { - printk(KERN_DEBUG "has_svm: svm not available\n"); - return 0; - } - return 1; -} - -static void svm_hardware_disable(void *garbage) -{ - struct svm_cpu_data *svm_data - = per_cpu(svm_data, raw_smp_processor_id()); - - if (svm_data) { - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); - per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(svm_data->save_area); - kfree(svm_data); - } -} - -static void svm_hardware_enable(void *garbage) -{ - - struct svm_cpu_data *svm_data; - uint64_t efer; -#ifdef CONFIG_X86_64 - struct desc_ptr gdt_descr; -#else - struct desc_ptr gdt_descr; -#endif - struct desc_struct *gdt; - int me = raw_smp_processor_id(); - - if (!has_svm()) { - printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); - return; - } - svm_data = per_cpu(svm_data, me); - - if (!svm_data) { - printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", - me); - return; - } - - svm_data->asid_generation = 1; - svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; - svm_data->next_asid = svm_data->max_asid + 1; - svm_features = cpuid_edx(SVM_CPUID_FUNC); - - asm volatile ("sgdt %0" : "=m"(gdt_descr)); - gdt = (struct desc_struct *)gdt_descr.address; - svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); - - wrmsrl(MSR_VM_HSAVE_PA, - page_to_pfn(svm_data->save_area) << PAGE_SHIFT); -} - -static int svm_cpu_init(int cpu) -{ - struct svm_cpu_data *svm_data; - int r; - - svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!svm_data) - return -ENOMEM; - svm_data->cpu = cpu; - svm_data->save_area = alloc_page(GFP_KERNEL); - r = -ENOMEM; - if (!svm_data->save_area) - goto err_1; - - per_cpu(svm_data, cpu) = svm_data; - - return 0; - -err_1: - kfree(svm_data); - return r; - -} - -static void set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) -{ - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr >= msrpm_ranges[i] && - msr < msrpm_ranges[i] + MSRS_IN_RANGE) { - u32 msr_offset = (i * MSRS_IN_RANGE + msr - - msrpm_ranges[i]) * 2; - - u32 *base = msrpm + (msr_offset / 32); - u32 msr_shift = msr_offset % 32; - u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); - *base = (*base & ~(0x3 << msr_shift)) | - (mask << msr_shift); - return; - } - } - BUG(); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - struct page *msrpm_pages; - void *iopm_va, *msrpm_va; - int r; - - iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); - clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - - r = -ENOMEM; - if (!msrpm_pages) - goto err_1; - - msrpm_va = page_address(msrpm_pages); - memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); -#endif - set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); - - for_each_online_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err_2; - } - return 0; - -err_2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); - msrpm_base = 0; -err_1: - __free_pages(iopm_pages, IOPM_ALLOC_ORDER); - iopm_base = 0; - return r; -} - -static __exit void svm_hardware_unsetup(void) -{ - __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); - iopm_base = msrpm_base = 0; -} - -static void init_seg(struct vmcb_seg *seg) -{ - seg->selector = 0; - seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | - SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ - seg->limit = 0xffff; - seg->base = 0; -} - -static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) -{ - seg->selector = 0; - seg->attrib = SVM_SELECTOR_P_MASK | type; - seg->limit = 0xffff; - seg->base = 0; -} - -static void init_vmcb(struct vmcb *vmcb) -{ - struct vmcb_control_area *control = &vmcb->control; - struct vmcb_save_area *save = &vmcb->save; - - control->intercept_cr_read = INTERCEPT_CR0_MASK | - INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; - - control->intercept_cr_write = INTERCEPT_CR0_MASK | - INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; - - control->intercept_dr_read = INTERCEPT_DR0_MASK | - INTERCEPT_DR1_MASK | - INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; - - control->intercept_dr_write = INTERCEPT_DR0_MASK | - INTERCEPT_DR1_MASK | - INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK | - INTERCEPT_DR5_MASK | - INTERCEPT_DR7_MASK; - - control->intercept_exceptions = (1 << PF_VECTOR) | - (1 << UD_VECTOR); - - - control->intercept = (1ULL << INTERCEPT_INTR) | - (1ULL << INTERCEPT_NMI) | - (1ULL << INTERCEPT_SMI) | - /* - * selective cr0 intercept bug? - * 0: 0f 22 d8 mov %eax,%cr3 - * 3: 0f 20 c0 mov %cr0,%eax - * 6: 0d 00 00 00 80 or $0x80000000,%eax - * b: 0f 22 c0 mov %eax,%cr0 - * set cr3 ->interception - * get cr0 ->interception - * set cr0 -> no interception - */ - /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ - (1ULL << INTERCEPT_CPUID) | - (1ULL << INTERCEPT_INVD) | - (1ULL << INTERCEPT_HLT) | - (1ULL << INTERCEPT_INVLPGA) | - (1ULL << INTERCEPT_IOIO_PROT) | - (1ULL << INTERCEPT_MSR_PROT) | - (1ULL << INTERCEPT_TASK_SWITCH) | - (1ULL << INTERCEPT_SHUTDOWN) | - (1ULL << INTERCEPT_VMRUN) | - (1ULL << INTERCEPT_VMMCALL) | - (1ULL << INTERCEPT_VMLOAD) | - (1ULL << INTERCEPT_VMSAVE) | - (1ULL << INTERCEPT_STGI) | - (1ULL << INTERCEPT_CLGI) | - (1ULL << INTERCEPT_SKINIT) | - (1ULL << INTERCEPT_WBINVD) | - (1ULL << INTERCEPT_MONITOR) | - (1ULL << INTERCEPT_MWAIT); - - control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = msrpm_base; - control->tsc_offset = 0; - control->int_ctl = V_INTR_MASKING_MASK; - - init_seg(&save->es); - init_seg(&save->ss); - init_seg(&save->ds); - init_seg(&save->fs); - init_seg(&save->gs); - - save->cs.selector = 0xf000; - /* Executable/Readable Code Segment */ - save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | - SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; - save->cs.limit = 0xffff; - /* - * cs.base should really be 0xffff0000, but vmx can't handle that, so - * be consistent with it. - * - * Replace when we have real mode working for vmx. - */ - save->cs.base = 0xf0000; - - save->gdtr.limit = 0xffff; - save->idtr.limit = 0xffff; - - init_sys_seg(&save->ldtr, SEG_TYPE_LDT); - init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - - save->efer = MSR_EFER_SVME_MASK; - save->dr6 = 0xffff0ff0; - save->dr7 = 0x400; - save->rflags = 2; - save->rip = 0x0000fff0; - - /* - * cr0 val on cpu init should be 0x60000010, we enable cpu - * cache by default. the orderly way is to enable cache in bios. - */ - save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; - save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ -} - -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - init_vmcb(svm->vmcb); - - if (vcpu->vcpu_id != 0) { - svm->vmcb->save.rip = 0; - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; - } - - return 0; -} - -static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) -{ - struct vcpu_svm *svm; - struct page *page; - int err; - - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!svm) { - err = -ENOMEM; - goto out; - } - - err = kvm_vcpu_init(&svm->vcpu, kvm, id); - if (err) - goto free_svm; - - page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; - goto uninit; - } - - svm->vmcb = page_address(page); - clear_page(svm->vmcb); - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; - svm->asid_generation = 0; - memset(svm->db_regs, 0, sizeof(svm->db_regs)); - init_vmcb(svm->vmcb); - - fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (svm->vcpu.vcpu_id == 0) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; - - return &svm->vcpu; - -uninit: - kvm_vcpu_uninit(&svm->vcpu); -free_svm: - kmem_cache_free(kvm_vcpu_cache, svm); -out: - return ERR_PTR(err); -} - -static void svm_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, svm); -} - -static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int i; - - if (unlikely(cpu != vcpu->cpu)) { - u64 tsc_this, delta; - - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - svm->vmcb->control.tsc_offset += delta; - vcpu->cpu = cpu; - kvm_migrate_apic_timer(vcpu); - } - - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); -} - -static void svm_vcpu_put(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int i; - - ++vcpu->stat.host_state_reload; - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - rdtscll(vcpu->arch.host_tsc); -} - -static void svm_vcpu_decache(struct kvm_vcpu *vcpu) -{ -} - -static void svm_cache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; -} - -static void svm_decache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; -} - -static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) -{ - return to_svm(vcpu)->vmcb->save.rflags; -} - -static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - to_svm(vcpu)->vmcb->save.rflags = rflags; -} - -static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) -{ - struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; - - switch (seg) { - case VCPU_SREG_CS: return &save->cs; - case VCPU_SREG_DS: return &save->ds; - case VCPU_SREG_ES: return &save->es; - case VCPU_SREG_FS: return &save->fs; - case VCPU_SREG_GS: return &save->gs; - case VCPU_SREG_SS: return &save->ss; - case VCPU_SREG_TR: return &save->tr; - case VCPU_SREG_LDTR: return &save->ldtr; - } - BUG(); - return NULL; -} - -static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct vmcb_seg *s = svm_seg(vcpu, seg); - - return s->base; -} - -static void svm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vmcb_seg *s = svm_seg(vcpu, seg); - - var->base = s->base; - var->limit = s->limit; - var->selector = s->selector; - var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; - var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; - var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; - var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; - var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; - var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; - var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - var->unusable = !var->present; -} - -static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - dt->limit = svm->vmcb->save.idtr.limit; - dt->base = svm->vmcb->save.idtr.base; -} - -static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.idtr.limit = dt->limit; - svm->vmcb->save.idtr.base = dt->base ; -} - -static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - dt->limit = svm->vmcb->save.gdtr.limit; - dt->base = svm->vmcb->save.gdtr.base; -} - -static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.gdtr.limit = dt->limit; - svm->vmcb->save.gdtr.base = dt->base ; -} - -static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ -} - -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - struct vcpu_svm *svm = to_svm(vcpu); - -#ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; - } - - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); - } - } -#endif - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } - - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - svm->vmcb->save.cr0 = cr0; -} - -static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - vcpu->arch.cr4 = cr4; - to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; -} - -static void svm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_seg *s = svm_seg(vcpu, seg); - - s->base = var->base; - s->limit = var->limit; - s->selector = var->selector; - if (var->unusable) - s->attrib = 0; - else { - s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); - s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; - s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; - s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; - s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; - s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; - s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; - s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; - } - if (seg == VCPU_SREG_CS) - svm->vmcb->save.cpl - = (svm->vmcb->save.cs.attrib - >> SVM_SELECTOR_DPL_SHIFT) & 3; - -} - -/* FIXME: - - svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; - svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); - -*/ - -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) -{ - return -EOPNOTSUPP; -} - -static int svm_get_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 exit_int_info = svm->vmcb->control.exit_int_info; - - if (is_external_interrupt(exit_int_info)) - return exit_int_info & SVM_EVTINJ_VEC_MASK; - return -1; -} - -static void load_host_msrs(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); -#endif -} - -static void save_host_msrs(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); -#endif -} - -static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) -{ - if (svm_data->next_asid > svm_data->max_asid) { - ++svm_data->asid_generation; - svm_data->next_asid = 1; - svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; - } - - svm->vcpu.cpu = svm_data->cpu; - svm->asid_generation = svm_data->asid_generation; - svm->vmcb->control.asid = svm_data->next_asid++; -} - -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) -{ - return to_svm(vcpu)->db_regs[dr]; -} - -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - *exception = 0; - - if (svm->vmcb->save.dr7 & DR7_GD_MASK) { - svm->vmcb->save.dr7 &= ~DR7_GD_MASK; - svm->vmcb->save.dr6 |= DR6_BD_MASK; - *exception = DB_VECTOR; - return; - } - - switch (dr) { - case 0 ... 3: - svm->db_regs[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) { - *exception = UD_VECTOR; - return; - } - case 7: { - if (value & ~((1ULL << 32) - 1)) { - *exception = GP_VECTOR; - return; - } - svm->vmcb->save.dr7 = value; - return; - } - default: - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __FUNCTION__, dr); - *exception = UD_VECTOR; - return; - } -} - -static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - u32 exit_int_info = svm->vmcb->control.exit_int_info; - struct kvm *kvm = svm->vcpu.kvm; - u64 fault_address; - u32 error_code; - - if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) - push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - - fault_address = svm->vmcb->control.exit_info_2; - error_code = svm->vmcb->control.exit_info_1; - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); -} - -static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - int er; - - er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0); - if (er != EMULATE_DONE) - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - -static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; - svm->vcpu.fpu_active = 1; - - return 1; -} - -static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. - */ - clear_page(svm->vmcb); - init_vmcb(svm->vmcb); - - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; - return 0; -} - -static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, down, in, string, rep; - unsigned port; - - ++svm->vcpu.stat.io_exits; - - svm->next_rip = svm->vmcb->control.exit_info_2; - - string = (io_info & SVM_IOIO_STR_MASK) != 0; - - if (string) { - if (emulate_instruction(&svm->vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) - return 0; - return 1; - } - - in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - port = io_info >> 16; - size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - rep = (io_info & SVM_IOIO_REP_MASK) != 0; - down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; - - return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); -} - -static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - return 1; -} - -static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - svm->next_rip = svm->vmcb->save.rip + 1; - skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_halt(&svm->vcpu); -} - -static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - svm->next_rip = svm->vmcb->save.rip + 3; - skip_emulated_instruction(&svm->vcpu); - kvm_emulate_hypercall(&svm->vcpu); - return 1; -} - -static int invalid_op_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - -static int task_switch_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) -{ - pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - return 0; -} - -static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - svm->next_rip = svm->vmcb->save.rip + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; -} - -static int emulate_on_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) -{ - if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); - return 1; -} - -static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); - if (irqchip_in_kernel(svm->vcpu.kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; -} - -static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { - u64 tsc; - - rdtscll(tsc); - *data = svm->vmcb->control.tsc_offset + tsc; - break; - } - case MSR_K6_STAR: - *data = svm->vmcb->save.star; - break; -#ifdef CONFIG_X86_64 - case MSR_LSTAR: - *data = svm->vmcb->save.lstar; - break; - case MSR_CSTAR: - *data = svm->vmcb->save.cstar; - break; - case MSR_KERNEL_GS_BASE: - *data = svm->vmcb->save.kernel_gs_base; - break; - case MSR_SYSCALL_MASK: - *data = svm->vmcb->save.sfmask; - break; -#endif - case MSR_IA32_SYSENTER_CS: - *data = svm->vmcb->save.sysenter_cs; - break; - case MSR_IA32_SYSENTER_EIP: - *data = svm->vmcb->save.sysenter_eip; - break; - case MSR_IA32_SYSENTER_ESP: - *data = svm->vmcb->save.sysenter_esp; - break; - default: - return kvm_get_msr_common(vcpu, ecx, data); - } - return 0; -} - -static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data; - - if (svm_get_msr(&svm->vcpu, ecx, &data)) - kvm_inject_gp(&svm->vcpu, 0); - else { - svm->vmcb->save.rax = data & 0xffffffff; - svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; - svm->next_rip = svm->vmcb->save.rip + 2; - skip_emulated_instruction(&svm->vcpu); - } - return 1; -} - -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { - u64 tsc; - - rdtscll(tsc); - svm->vmcb->control.tsc_offset = data - tsc; - break; - } - case MSR_K6_STAR: - svm->vmcb->save.star = data; - break; -#ifdef CONFIG_X86_64 - case MSR_LSTAR: - svm->vmcb->save.lstar = data; - break; - case MSR_CSTAR: - svm->vmcb->save.cstar = data; - break; - case MSR_KERNEL_GS_BASE: - svm->vmcb->save.kernel_gs_base = data; - break; - case MSR_SYSCALL_MASK: - svm->vmcb->save.sfmask = data; - break; -#endif - case MSR_IA32_SYSENTER_CS: - svm->vmcb->save.sysenter_cs = data; - break; - case MSR_IA32_SYSENTER_EIP: - svm->vmcb->save.sysenter_eip = data; - break; - case MSR_IA32_SYSENTER_ESP: - svm->vmcb->save.sysenter_esp = data; - break; - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - /* - * only support writing 0 to the performance counters for now - * to make Windows happy. Should be replaced by a real - * performance counter emulation later. - */ - if (data != 0) - goto unhandled; - break; - default: - unhandled: - return kvm_set_msr_common(vcpu, ecx, data); - } - return 0; -} - -static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vmcb->save.rax & -1u) - | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - svm->next_rip = svm->vmcb->save.rip + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) - kvm_inject_gp(&svm->vcpu, 0); - else - skip_emulated_instruction(&svm->vcpu); - return 1; -} - -static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm, kvm_run); - else - return rdmsr_interception(svm, kvm_run); -} - -static int interrupt_window_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) -{ - svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (kvm_run->request_interrupt_window && - !svm->vcpu.arch.irq_summary) { - ++svm->vcpu.stat.irq_window_exits; - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - - return 1; -} - -static int (*svm_exit_handlers[])(struct vcpu_svm *svm, - struct kvm_run *kvm_run) = { - [SVM_EXIT_READ_CR0] = emulate_on_interception, - [SVM_EXIT_READ_CR3] = emulate_on_interception, - [SVM_EXIT_READ_CR4] = emulate_on_interception, - [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, - [SVM_EXIT_WRITE_CR3] = emulate_on_interception, - [SVM_EXIT_WRITE_CR4] = emulate_on_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, - [SVM_EXIT_READ_DR0] = emulate_on_interception, - [SVM_EXIT_READ_DR1] = emulate_on_interception, - [SVM_EXIT_READ_DR2] = emulate_on_interception, - [SVM_EXIT_READ_DR3] = emulate_on_interception, - [SVM_EXIT_WRITE_DR0] = emulate_on_interception, - [SVM_EXIT_WRITE_DR1] = emulate_on_interception, - [SVM_EXIT_WRITE_DR2] = emulate_on_interception, - [SVM_EXIT_WRITE_DR3] = emulate_on_interception, - [SVM_EXIT_WRITE_DR5] = emulate_on_interception, - [SVM_EXIT_WRITE_DR7] = emulate_on_interception, - [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, - [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, - [SVM_EXIT_INTR] = nop_on_interception, - [SVM_EXIT_NMI] = nop_on_interception, - [SVM_EXIT_SMI] = nop_on_interception, - [SVM_EXIT_INIT] = nop_on_interception, - [SVM_EXIT_VINTR] = interrupt_window_interception, - /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ - [SVM_EXIT_CPUID] = cpuid_interception, - [SVM_EXIT_INVD] = emulate_on_interception, - [SVM_EXIT_HLT] = halt_interception, - [SVM_EXIT_INVLPG] = emulate_on_interception, - [SVM_EXIT_INVLPGA] = invalid_op_interception, - [SVM_EXIT_IOIO] = io_interception, - [SVM_EXIT_MSR] = msr_interception, - [SVM_EXIT_TASK_SWITCH] = task_switch_interception, - [SVM_EXIT_SHUTDOWN] = shutdown_interception, - [SVM_EXIT_VMRUN] = invalid_op_interception, - [SVM_EXIT_VMMCALL] = vmmcall_interception, - [SVM_EXIT_VMLOAD] = invalid_op_interception, - [SVM_EXIT_VMSAVE] = invalid_op_interception, - [SVM_EXIT_STGI] = invalid_op_interception, - [SVM_EXIT_CLGI] = invalid_op_interception, - [SVM_EXIT_SKINIT] = invalid_op_interception, - [SVM_EXIT_WBINVD] = emulate_on_interception, - [SVM_EXIT_MONITOR] = invalid_op_interception, - [SVM_EXIT_MWAIT] = invalid_op_interception, -}; - - -static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 exit_code = svm->vmcb->control.exit_code; - - kvm_reput_irq(svm); - - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason - = svm->vmcb->control.exit_code; - return 0; - } - - if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) - printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " - "exit_code 0x%x\n", - __FUNCTION__, svm->vmcb->control.exit_int_info, - exit_code); - - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || !svm_exit_handlers[exit_code]) { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_code; - return 0; - } - - return svm_exit_handlers[exit_code](svm, kvm_run); -} - -static void reload_tss(struct kvm_vcpu *vcpu) -{ - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ - load_TR_desc(); -} - -static void pre_svm_run(struct vcpu_svm *svm) -{ - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - if (svm->vcpu.cpu != cpu || - svm->asid_generation != svm_data->asid_generation) - new_asid(svm, svm_data); -} - - -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) -{ - struct vmcb_control_area *control; - - control = &svm->vmcb->control; - control->int_vector = irq; - control->int_ctl &= ~V_INTR_PRIO_MASK; - control->int_ctl |= V_IRQ_MASK | - ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); -} - -static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm_inject_irq(svm, irq); -} - -static void svm_intr_assist(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - int intr_vector = -1; - - if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && - ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { - intr_vector = vmcb->control.exit_int_info & - SVM_EVTINJ_VEC_MASK; - vmcb->control.exit_int_info = 0; - svm_inject_irq(svm, intr_vector); - return; - } - - if (vmcb->control.int_ctl & V_IRQ_MASK) - return; - - if (!kvm_cpu_has_interrupt(vcpu)) - return; - - if (!(vmcb->save.rflags & X86_EFLAGS_IF) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { - /* unable to deliver irq, set pending irq */ - vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); - svm_inject_irq(svm, 0x0); - return; - } - /* Okay, we can deliver the interrupt: grab it and update PIC state. */ - intr_vector = kvm_cpu_get_interrupt(vcpu); - svm_inject_irq(svm, intr_vector); - kvm_timer_intr_post(vcpu, intr_vector); -} - -static void kvm_reput_irq(struct vcpu_svm *svm) -{ - struct vmcb_control_area *control = &svm->vmcb->control; - - if ((control->int_ctl & V_IRQ_MASK) - && !irqchip_in_kernel(svm->vcpu.kvm)) { - control->int_ctl &= ~V_IRQ_MASK; - push_irq(&svm->vcpu, control->int_vector); - } - - svm->vcpu.arch.interrupt_window_open = - !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); -} - -static void svm_do_inject_vector(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - svm_inject_irq(svm, irq); -} - -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - - svm->vcpu.arch.interrupt_window_open = - (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vmcb->save.rflags & X86_EFLAGS_IF)); - - if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ - svm_do_inject_vector(svm); - - /* - * Interrupts blocked. Wait for unblock. - */ - if (!svm->vcpu.arch.interrupt_window_open && - (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) - control->intercept |= 1ULL << INTERCEPT_VINTR; - else - control->intercept &= ~(1ULL << INTERCEPT_VINTR); -} - -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static void save_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); - asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); - asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); - asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); -} - -static void load_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); - asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); - asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); - asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); -} - -static void svm_flush_tlb(struct kvm_vcpu *vcpu) -{ - force_new_asid(vcpu); -} - -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) -{ -} - -static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u16 fs_selector; - u16 gs_selector; - u16 ldt_selector; - - pre_svm_run(svm); - - save_host_msrs(vcpu); - fs_selector = read_fs(); - gs_selector = read_gs(); - ldt_selector = read_ldt(); - svm->host_cr2 = kvm_read_cr2(); - svm->host_dr6 = read_dr6(); - svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->arch.cr2; - - if (svm->vmcb->save.dr7 & 0xff) { - write_dr7(0); - save_db_regs(svm->host_db_regs); - load_db_regs(svm->db_regs); - } - - clgi(); - - local_irq_enable(); - - asm volatile ( -#ifdef CONFIG_X86_64 - "push %%rbp; \n\t" -#else - "push %%ebp; \n\t" -#endif - -#ifdef CONFIG_X86_64 - "mov %c[rbx](%[svm]), %%rbx \n\t" - "mov %c[rcx](%[svm]), %%rcx \n\t" - "mov %c[rdx](%[svm]), %%rdx \n\t" - "mov %c[rsi](%[svm]), %%rsi \n\t" - "mov %c[rdi](%[svm]), %%rdi \n\t" - "mov %c[rbp](%[svm]), %%rbp \n\t" - "mov %c[r8](%[svm]), %%r8 \n\t" - "mov %c[r9](%[svm]), %%r9 \n\t" - "mov %c[r10](%[svm]), %%r10 \n\t" - "mov %c[r11](%[svm]), %%r11 \n\t" - "mov %c[r12](%[svm]), %%r12 \n\t" - "mov %c[r13](%[svm]), %%r13 \n\t" - "mov %c[r14](%[svm]), %%r14 \n\t" - "mov %c[r15](%[svm]), %%r15 \n\t" -#else - "mov %c[rbx](%[svm]), %%ebx \n\t" - "mov %c[rcx](%[svm]), %%ecx \n\t" - "mov %c[rdx](%[svm]), %%edx \n\t" - "mov %c[rsi](%[svm]), %%esi \n\t" - "mov %c[rdi](%[svm]), %%edi \n\t" - "mov %c[rbp](%[svm]), %%ebp \n\t" -#endif - -#ifdef CONFIG_X86_64 - /* Enter guest mode */ - "push %%rax \n\t" - "mov %c[vmcb](%[svm]), %%rax \n\t" - SVM_VMLOAD "\n\t" - SVM_VMRUN "\n\t" - SVM_VMSAVE "\n\t" - "pop %%rax \n\t" -#else - /* Enter guest mode */ - "push %%eax \n\t" - "mov %c[vmcb](%[svm]), %%eax \n\t" - SVM_VMLOAD "\n\t" - SVM_VMRUN "\n\t" - SVM_VMSAVE "\n\t" - "pop %%eax \n\t" -#endif - - /* Save guest registers, load host registers */ -#ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[svm]) \n\t" - "mov %%rcx, %c[rcx](%[svm]) \n\t" - "mov %%rdx, %c[rdx](%[svm]) \n\t" - "mov %%rsi, %c[rsi](%[svm]) \n\t" - "mov %%rdi, %c[rdi](%[svm]) \n\t" - "mov %%rbp, %c[rbp](%[svm]) \n\t" - "mov %%r8, %c[r8](%[svm]) \n\t" - "mov %%r9, %c[r9](%[svm]) \n\t" - "mov %%r10, %c[r10](%[svm]) \n\t" - "mov %%r11, %c[r11](%[svm]) \n\t" - "mov %%r12, %c[r12](%[svm]) \n\t" - "mov %%r13, %c[r13](%[svm]) \n\t" - "mov %%r14, %c[r14](%[svm]) \n\t" - "mov %%r15, %c[r15](%[svm]) \n\t" - - "pop %%rbp; \n\t" -#else - "mov %%ebx, %c[rbx](%[svm]) \n\t" - "mov %%ecx, %c[rcx](%[svm]) \n\t" - "mov %%edx, %c[rdx](%[svm]) \n\t" - "mov %%esi, %c[rsi](%[svm]) \n\t" - "mov %%edi, %c[rdi](%[svm]) \n\t" - "mov %%ebp, %c[rbp](%[svm]) \n\t" - - "pop %%ebp; \n\t" -#endif - : - : [svm]"a"(svm), - [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) -#ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" -#endif - ); - - if ((svm->vmcb->save.dr7 & 0xff)) - load_db_regs(svm->host_db_regs); - - vcpu->arch.cr2 = svm->vmcb->save.cr2; - - write_dr6(svm->host_dr6); - write_dr7(svm->host_dr7); - kvm_write_cr2(svm->host_cr2); - - load_fs(fs_selector); - load_gs(gs_selector); - load_ldt(ldt_selector); - load_host_msrs(vcpu); - - reload_tss(vcpu); - - local_irq_disable(); - - stgi(); - - svm->next_rip = 0; -} - -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.cr3 = root; - force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } -} - -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - -static void -svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xd9; -} - -static void svm_check_processor_compat(void *rtn) -{ - *(int *)rtn = 0; -} - -static struct kvm_x86_ops svm_x86_ops = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, - .hardware_setup = svm_hardware_setup, - .hardware_unsetup = svm_hardware_unsetup, - .check_processor_compatibility = svm_check_processor_compat, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, - - .vcpu_create = svm_create_vcpu, - .vcpu_free = svm_free_vcpu, - .vcpu_reset = svm_vcpu_reset, - - .prepare_guest_switch = svm_prepare_guest_switch, - .vcpu_load = svm_vcpu_load, - .vcpu_put = svm_vcpu_put, - .vcpu_decache = svm_vcpu_decache, - - .set_guest_debug = svm_guest_debug, - .get_msr = svm_get_msr, - .set_msr = svm_set_msr, - .get_segment_base = svm_get_segment_base, - .get_segment = svm_get_segment, - .set_segment = svm_set_segment, - .get_cs_db_l_bits = kvm_get_cs_db_l_bits, - .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, - .set_cr0 = svm_set_cr0, - .set_cr3 = svm_set_cr3, - .set_cr4 = svm_set_cr4, - .set_efer = svm_set_efer, - .get_idt = svm_get_idt, - .set_idt = svm_set_idt, - .get_gdt = svm_get_gdt, - .set_gdt = svm_set_gdt, - .get_dr = svm_get_dr, - .set_dr = svm_set_dr, - .cache_regs = svm_cache_regs, - .decache_regs = svm_decache_regs, - .get_rflags = svm_get_rflags, - .set_rflags = svm_set_rflags, - - .tlb_flush = svm_flush_tlb, - - .run = svm_vcpu_run, - .handle_exit = handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, - .patch_hypercall = svm_patch_hypercall, - .get_irq = svm_get_irq, - .set_irq = svm_set_irq, - .queue_exception = svm_queue_exception, - .exception_injected = svm_exception_injected, - .inject_pending_irq = svm_intr_assist, - .inject_pending_vectors = do_interrupt_requests, - - .set_tss_addr = svm_set_tss_addr, -}; - -static int __init svm_init(void) -{ - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - THIS_MODULE); -} - -static void __exit svm_exit(void) -{ - kvm_exit(); -} - -module_init(svm_init) -module_exit(svm_exit) diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h deleted file mode 100644 index 5fd50491b55..00000000000 --- a/drivers/kvm/svm.h +++ /dev/null @@ -1,325 +0,0 @@ -#ifndef __SVM_H -#define __SVM_H - -enum { - INTERCEPT_INTR, - INTERCEPT_NMI, - INTERCEPT_SMI, - INTERCEPT_INIT, - INTERCEPT_VINTR, - INTERCEPT_SELECTIVE_CR0, - INTERCEPT_STORE_IDTR, - INTERCEPT_STORE_GDTR, - INTERCEPT_STORE_LDTR, - INTERCEPT_STORE_TR, - INTERCEPT_LOAD_IDTR, - INTERCEPT_LOAD_GDTR, - INTERCEPT_LOAD_LDTR, - INTERCEPT_LOAD_TR, - INTERCEPT_RDTSC, - INTERCEPT_RDPMC, - INTERCEPT_PUSHF, - INTERCEPT_POPF, - INTERCEPT_CPUID, - INTERCEPT_RSM, - INTERCEPT_IRET, - INTERCEPT_INTn, - INTERCEPT_INVD, - INTERCEPT_PAUSE, - INTERCEPT_HLT, - INTERCEPT_INVLPG, - INTERCEPT_INVLPGA, - INTERCEPT_IOIO_PROT, - INTERCEPT_MSR_PROT, - INTERCEPT_TASK_SWITCH, - INTERCEPT_FERR_FREEZE, - INTERCEPT_SHUTDOWN, - INTERCEPT_VMRUN, - INTERCEPT_VMMCALL, - INTERCEPT_VMLOAD, - INTERCEPT_VMSAVE, - INTERCEPT_STGI, - INTERCEPT_CLGI, - INTERCEPT_SKINIT, - INTERCEPT_RDTSCP, - INTERCEPT_ICEBP, - INTERCEPT_WBINVD, - INTERCEPT_MONITOR, - INTERCEPT_MWAIT, - INTERCEPT_MWAIT_COND, -}; - - -struct __attribute__ ((__packed__)) vmcb_control_area { - u16 intercept_cr_read; - u16 intercept_cr_write; - u16 intercept_dr_read; - u16 intercept_dr_write; - u32 intercept_exceptions; - u64 intercept; - u8 reserved_1[44]; - u64 iopm_base_pa; - u64 msrpm_base_pa; - u64 tsc_offset; - u32 asid; - u8 tlb_ctl; - u8 reserved_2[3]; - u32 int_ctl; - u32 int_vector; - u32 int_state; - u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; - u64 exit_info_1; - u64 exit_info_2; - u32 exit_int_info; - u32 exit_int_info_err; - u64 nested_ctl; - u8 reserved_4[16]; - u32 event_inj; - u32 event_inj_err; - u64 nested_cr3; - u64 lbr_ctl; - u8 reserved_5[832]; -}; - - -#define TLB_CONTROL_DO_NOTHING 0 -#define TLB_CONTROL_FLUSH_ALL_ASID 1 - -#define V_TPR_MASK 0x0f - -#define V_IRQ_SHIFT 8 -#define V_IRQ_MASK (1 << V_IRQ_SHIFT) - -#define V_INTR_PRIO_SHIFT 16 -#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) - -#define V_IGN_TPR_SHIFT 20 -#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) - -#define V_INTR_MASKING_SHIFT 24 -#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) - -#define SVM_INTERRUPT_SHADOW_MASK 1 - -#define SVM_IOIO_STR_SHIFT 2 -#define SVM_IOIO_REP_SHIFT 3 -#define SVM_IOIO_SIZE_SHIFT 4 -#define SVM_IOIO_ASIZE_SHIFT 7 - -#define SVM_IOIO_TYPE_MASK 1 -#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) -#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) -#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) -#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) - -struct __attribute__ ((__packed__)) vmcb_seg { - u16 selector; - u16 attrib; - u32 limit; - u64 base; -}; - -struct __attribute__ ((__packed__)) vmcb_save_area { - struct vmcb_seg es; - struct vmcb_seg cs; - struct vmcb_seg ss; - struct vmcb_seg ds; - struct vmcb_seg fs; - struct vmcb_seg gs; - struct vmcb_seg gdtr; - struct vmcb_seg ldtr; - struct vmcb_seg idtr; - struct vmcb_seg tr; - u8 reserved_1[43]; - u8 cpl; - u8 reserved_2[4]; - u64 efer; - u8 reserved_3[112]; - u64 cr4; - u64 cr3; - u64 cr0; - u64 dr7; - u64 dr6; - u64 rflags; - u64 rip; - u8 reserved_4[88]; - u64 rsp; - u8 reserved_5[24]; - u64 rax; - u64 star; - u64 lstar; - u64 cstar; - u64 sfmask; - u64 kernel_gs_base; - u64 sysenter_cs; - u64 sysenter_esp; - u64 sysenter_eip; - u64 cr2; - u8 reserved_6[32]; - u64 g_pat; - u64 dbgctl; - u64 br_from; - u64 br_to; - u64 last_excp_from; - u64 last_excp_to; -}; - -struct __attribute__ ((__packed__)) vmcb { - struct vmcb_control_area control; - struct vmcb_save_area save; -}; - -#define SVM_CPUID_FEATURE_SHIFT 2 -#define SVM_CPUID_FUNC 0x8000000a - -#define MSR_EFER_SVME_MASK (1ULL << 12) -#define MSR_VM_CR 0xc0010114 -#define MSR_VM_HSAVE_PA 0xc0010117ULL - -#define SVM_VM_CR_SVM_DISABLE 4 - -#define SVM_SELECTOR_S_SHIFT 4 -#define SVM_SELECTOR_DPL_SHIFT 5 -#define SVM_SELECTOR_P_SHIFT 7 -#define SVM_SELECTOR_AVL_SHIFT 8 -#define SVM_SELECTOR_L_SHIFT 9 -#define SVM_SELECTOR_DB_SHIFT 10 -#define SVM_SELECTOR_G_SHIFT 11 - -#define SVM_SELECTOR_TYPE_MASK (0xf) -#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) -#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) -#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) -#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) -#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) -#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) -#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) - -#define SVM_SELECTOR_WRITE_MASK (1 << 1) -#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK -#define SVM_SELECTOR_CODE_MASK (1 << 3) - -#define INTERCEPT_CR0_MASK 1 -#define INTERCEPT_CR3_MASK (1 << 3) -#define INTERCEPT_CR4_MASK (1 << 4) -#define INTERCEPT_CR8_MASK (1 << 8) - -#define INTERCEPT_DR0_MASK 1 -#define INTERCEPT_DR1_MASK (1 << 1) -#define INTERCEPT_DR2_MASK (1 << 2) -#define INTERCEPT_DR3_MASK (1 << 3) -#define INTERCEPT_DR4_MASK (1 << 4) -#define INTERCEPT_DR5_MASK (1 << 5) -#define INTERCEPT_DR6_MASK (1 << 6) -#define INTERCEPT_DR7_MASK (1 << 7) - -#define SVM_EVTINJ_VEC_MASK 0xff - -#define SVM_EVTINJ_TYPE_SHIFT 8 -#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) - -#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) -#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) -#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) -#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) - -#define SVM_EVTINJ_VALID (1 << 31) -#define SVM_EVTINJ_VALID_ERR (1 << 11) - -#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK - -#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR -#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI -#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT -#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT - -#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID -#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR - -#define SVM_EXIT_READ_CR0 0x000 -#define SVM_EXIT_READ_CR3 0x003 -#define SVM_EXIT_READ_CR4 0x004 -#define SVM_EXIT_READ_CR8 0x008 -#define SVM_EXIT_WRITE_CR0 0x010 -#define SVM_EXIT_WRITE_CR3 0x013 -#define SVM_EXIT_WRITE_CR4 0x014 -#define SVM_EXIT_WRITE_CR8 0x018 -#define SVM_EXIT_READ_DR0 0x020 -#define SVM_EXIT_READ_DR1 0x021 -#define SVM_EXIT_READ_DR2 0x022 -#define SVM_EXIT_READ_DR3 0x023 -#define SVM_EXIT_READ_DR4 0x024 -#define SVM_EXIT_READ_DR5 0x025 -#define SVM_EXIT_READ_DR6 0x026 -#define SVM_EXIT_READ_DR7 0x027 -#define SVM_EXIT_WRITE_DR0 0x030 -#define SVM_EXIT_WRITE_DR1 0x031 -#define SVM_EXIT_WRITE_DR2 0x032 -#define SVM_EXIT_WRITE_DR3 0x033 -#define SVM_EXIT_WRITE_DR4 0x034 -#define SVM_EXIT_WRITE_DR5 0x035 -#define SVM_EXIT_WRITE_DR6 0x036 -#define SVM_EXIT_WRITE_DR7 0x037 -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_INTR 0x060 -#define SVM_EXIT_NMI 0x061 -#define SVM_EXIT_SMI 0x062 -#define SVM_EXIT_INIT 0x063 -#define SVM_EXIT_VINTR 0x064 -#define SVM_EXIT_CR0_SEL_WRITE 0x065 -#define SVM_EXIT_IDTR_READ 0x066 -#define SVM_EXIT_GDTR_READ 0x067 -#define SVM_EXIT_LDTR_READ 0x068 -#define SVM_EXIT_TR_READ 0x069 -#define SVM_EXIT_IDTR_WRITE 0x06a -#define SVM_EXIT_GDTR_WRITE 0x06b -#define SVM_EXIT_LDTR_WRITE 0x06c -#define SVM_EXIT_TR_WRITE 0x06d -#define SVM_EXIT_RDTSC 0x06e -#define SVM_EXIT_RDPMC 0x06f -#define SVM_EXIT_PUSHF 0x070 -#define SVM_EXIT_POPF 0x071 -#define SVM_EXIT_CPUID 0x072 -#define SVM_EXIT_RSM 0x073 -#define SVM_EXIT_IRET 0x074 -#define SVM_EXIT_SWINT 0x075 -#define SVM_EXIT_INVD 0x076 -#define SVM_EXIT_PAUSE 0x077 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_INVLPG 0x079 -#define SVM_EXIT_INVLPGA 0x07a -#define SVM_EXIT_IOIO 0x07b -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_TASK_SWITCH 0x07d -#define SVM_EXIT_FERR_FREEZE 0x07e -#define SVM_EXIT_SHUTDOWN 0x07f -#define SVM_EXIT_VMRUN 0x080 -#define SVM_EXIT_VMMCALL 0x081 -#define SVM_EXIT_VMLOAD 0x082 -#define SVM_EXIT_VMSAVE 0x083 -#define SVM_EXIT_STGI 0x084 -#define SVM_EXIT_CLGI 0x085 -#define SVM_EXIT_SKINIT 0x086 -#define SVM_EXIT_RDTSCP 0x087 -#define SVM_EXIT_ICEBP 0x088 -#define SVM_EXIT_WBINVD 0x089 -#define SVM_EXIT_MONITOR 0x08a -#define SVM_EXIT_MWAIT 0x08b -#define SVM_EXIT_MWAIT_COND 0x08c -#define SVM_EXIT_NPF 0x400 - -#define SVM_EXIT_ERR -1 - -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ - -#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" -#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" -#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" -#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" -#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" -#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" - -#endif - diff --git a/drivers/kvm/types.h b/drivers/kvm/types.h deleted file mode 100644 index 1c4e46decb2..00000000000 --- a/drivers/kvm/types.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - */ - -#ifndef __KVM_TYPES_H__ -#define __KVM_TYPES_H__ - -#include <asm/types.h> - -/* - * Address types: - * - * gva - guest virtual address - * gpa - guest physical address - * gfn - guest frame number - * hva - host virtual address - * hpa - host physical address - * hfn - host frame number - */ - -typedef unsigned long gva_t; -typedef u64 gpa_t; -typedef unsigned long gfn_t; - -typedef unsigned long hva_t; -typedef u64 hpa_t; -typedef unsigned long hfn_t; - -struct kvm_pio_request { - unsigned long count; - int cur_count; - struct page *guest_pages[2]; - unsigned guest_page_offset; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - -#endif /* __KVM_TYPES_H__ */ diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c deleted file mode 100644 index 11ca2340d38..00000000000 --- a/drivers/kvm/vmx.c +++ /dev/null @@ -1,2673 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "kvm.h" -#include "x86.h" -#include "x86_emulate.h" -#include "irq.h" -#include "vmx.h" -#include "segment_descriptor.h" -#include "mmu.h" - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/moduleparam.h> - -#include <asm/io.h> -#include <asm/desc.h> - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static int bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, 0); - -struct vmcs { - u32 revision_id; - u32 abort; - char data[0]; -}; - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - int launched; - u8 fail; - u32 idt_vectoring_info; - struct kvm_msr_entry *guest_msrs; - struct kvm_msr_entry *host_msrs; - int nmsrs; - int save_nmsrs; - int msr_offset_efer; -#ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; -#endif - struct vmcs *vmcs; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; - int gs_ldt_reload_needed; - int fs_reload_needed; - int guest_efer_loaded; - } host_state; - struct { - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; - } rmode; -}; - -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - -static int init_rmode_tss(struct kvm *kvm); - -static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); - -static struct page *vmx_io_bitmap_a; -static struct page *vmx_io_bitmap_b; - -static struct vmcs_config { - int size; - int order; - u32 revision_id; - u32 pin_based_exec_ctrl; - u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; - u32 vmexit_ctrl; - u32 vmentry_ctrl; -} vmcs_config; - -#define VMX_SEGMENT_FIELD(seg) \ - [VCPU_SREG_##seg] = { \ - .selector = GUEST_##seg##_SELECTOR, \ - .base = GUEST_##seg##_BASE, \ - .limit = GUEST_##seg##_LIMIT, \ - .ar_bytes = GUEST_##seg##_AR_BYTES, \ - } - -static struct kvm_vmx_segment_field { - unsigned selector; - unsigned base; - unsigned limit; - unsigned ar_bytes; -} kvm_vmx_segment_fields[] = { - VMX_SEGMENT_FIELD(CS), - VMX_SEGMENT_FIELD(DS), - VMX_SEGMENT_FIELD(ES), - VMX_SEGMENT_FIELD(FS), - VMX_SEGMENT_FIELD(GS), - VMX_SEGMENT_FIELD(SS), - VMX_SEGMENT_FIELD(TR), - VMX_SEGMENT_FIELD(LDTR), -}; - -/* - * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. - */ -static const u32 vmx_msr_index[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, -#endif - MSR_EFER, MSR_K6_STAR, -}; -#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) - -static void load_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} - -static void save_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} - -static inline int is_page_fault(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline int is_no_device(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline int is_invalid_opcode(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline int is_external_interrupt(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); -} - -static inline int cpu_has_vmx_tpr_shadow(void) -{ - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); -} - -static inline int vm_need_tpr_shadow(struct kvm *kvm) -{ - return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); -} - -static inline int cpu_has_secondary_exec_ctrls(void) -{ - return (vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); -} - -static inline int cpu_has_vmx_virtualize_apic_accesses(void) -{ - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); -} - -static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) -{ - return ((cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm))); -} - -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx->guest_msrs[i].index == msr) - return i; - return -1; -} - -static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - i = __find_msr_index(vmx, msr); - if (i >= 0) - return &vmx->guest_msrs[i]; - return NULL; -} - -static void vmcs_clear(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - u8 error; - - asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); -} - -static void __vcpu_clear(void *arg) -{ - struct vcpu_vmx *vmx = arg; - int cpu = raw_smp_processor_id(); - - if (vmx->vcpu.cpu == cpu) - vmcs_clear(vmx->vmcs); - if (per_cpu(current_vmcs, cpu) == vmx->vmcs) - per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); -} - -static void vcpu_clear(struct vcpu_vmx *vmx) -{ - if (vmx->vcpu.cpu == -1) - return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); - vmx->launched = 0; -} - -static unsigned long vmcs_readl(unsigned long field) -{ - unsigned long value; - - asm volatile (ASM_VMX_VMREAD_RDX_RAX - : "=a"(value) : "d"(field) : "cc"); - return value; -} - -static u16 vmcs_read16(unsigned long field) -{ - return vmcs_readl(field); -} - -static u32 vmcs_read32(unsigned long field) -{ - return vmcs_readl(field); -} - -static u64 vmcs_read64(unsigned long field) -{ -#ifdef CONFIG_X86_64 - return vmcs_readl(field); -#else - return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); -#endif -} - -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} - -static void vmcs_writel(unsigned long field, unsigned long value) -{ - u8 error; - - asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); - if (unlikely(error)) - vmwrite_error(field, value); -} - -static void vmcs_write16(unsigned long field, u16 value) -{ - vmcs_writel(field, value); -} - -static void vmcs_write32(unsigned long field, u32 value) -{ - vmcs_writel(field, value); -} - -static void vmcs_write64(unsigned long field, u64 value) -{ -#ifdef CONFIG_X86_64 - vmcs_writel(field, value); -#else - vmcs_writel(field, value); - asm volatile (""); - vmcs_writel(field+1, value >> 32); -#endif -} - -static void vmcs_clear_bits(unsigned long field, u32 mask) -{ - vmcs_writel(field, vmcs_readl(field) & ~mask); -} - -static void vmcs_set_bits(unsigned long field, u32 mask) -{ - vmcs_writel(field, vmcs_readl(field) | mask); -} - -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - u32 eb; - - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; - if (vcpu->guest_debug.enabled) - eb |= 1u << 1; - if (vcpu->arch.rmode.active) - eb = ~0; - vmcs_write32(EXCEPTION_BITMAP, eb); -} - -static void reload_tss(void) -{ -#ifndef CONFIG_X86_64 - - /* - * VT restores TR but not its size. Useless. - */ - struct descriptor_table gdt; - struct segment_descriptor *descs; - - get_gdt(&gdt); - descs = (void *)gdt.base; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -#endif -} - -static void load_transition_efer(struct vcpu_vmx *vmx) -{ - int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; - u64 ignore_bits; - - if (efer_offset < 0) - return; - /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless - * outside long mode - */ - ignore_bits = EFER_NX | EFER_SCE; -#ifdef CONFIG_X86_64 - ignore_bits |= EFER_LMA | EFER_LME; - /* SCE is meaningful only in long mode on Intel */ - if (guest_efer & EFER_LMA) - ignore_bits &= ~(u64)EFER_SCE; -#endif - if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; - - vmx->host_state.guest_efer_loaded = 1; - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - wrmsrl(MSR_EFER, guest_efer); - vmx->vcpu.stat.efer_reload++; -} - -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } -} - -static void vmx_save_host_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->host_state.loaded) - return; - - vmx->host_state.loaded = 1; - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. - */ - vmx->host_state.ldt_sel = read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = read_fs(); - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } - vmx->host_state.gs_sel = read_gs(); - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; - } - -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif - -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - save_msrs(vmx->host_msrs + - vmx->msr_offset_kernel_gs_base, 1); - -#endif - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); -} - -static void vmx_load_host_state(struct vcpu_vmx *vmx) -{ - unsigned long flags; - - if (!vmx->host_state.loaded) - return; - - ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; - if (vmx->host_state.fs_reload_needed) - load_fs(vmx->host_state.fs_sel); - if (vmx->host_state.gs_ldt_reload_needed) { - load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - load_gs(vmx->host_state.gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_restore(flags); - } - reload_tss(); - save_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); -} - -/* - * Switches to specified vcpu, until a matching vcpu_put(), but assumes - * vcpu mutex is already taken. - */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(vmx->vmcs); - u64 tsc_this, delta; - - if (vcpu->cpu != cpu) { - vcpu_clear(vmx); - kvm_migrate_apic_timer(vcpu); - } - - if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - u8 error; - - per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmx->vmcs, phys_addr); - } - - if (vcpu->cpu != cpu) { - struct descriptor_table dt; - unsigned long sysenter_esp; - - vcpu->cpu = cpu; - /* - * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. - */ - vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ - get_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ - - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - /* - * Make sure the time stamp counter is monotonous. - */ - rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); - } -} - -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) -{ - vmx_load_host_state(to_vmx(vcpu)); -} - -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); - update_exception_bitmap(vcpu); -} - -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); - update_exception_bitmap(vcpu); -} - -static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) -{ - vcpu_clear(to_vmx(vcpu)); -} - -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -{ - return vmcs_readl(GUEST_RFLAGS); -} - -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - if (vcpu->arch.rmode.active) - rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - vmcs_writel(GUEST_RFLAGS, rflags); -} - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - unsigned long rip; - u32 interruptibility; - - rip = vmcs_readl(GUEST_RIP); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs_writel(GUEST_RIP, rip); - - /* - * We emulated an instruction, so temporary interrupt blocking - * should be removed, if set. - */ - interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - if (interruptibility & 3) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - interruptibility & ~3); - vcpu->arch.interrupt_window_open = 1; -} - -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) -{ - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); -} - -static bool vmx_exception_injected(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); -} - -/* - * Swap MSR entry in host/guest MSR entry array. - */ -#ifdef CONFIG_X86_64 -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) -{ - struct kvm_msr_entry tmp; - - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; - tmp = vmx->host_msrs[to]; - vmx->host_msrs[to] = vmx->host_msrs[from]; - vmx->host_msrs[from] = tmp; -} -#endif - -/* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. - */ -static void setup_msrs(struct vcpu_vmx *vmx) -{ - int save_nmsrs; - - save_nmsrs = 0; -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - int index; - - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - /* - * MSR_K6_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) - move_msr_up(vmx, index, save_nmsrs++); - } -#endif - vmx->save_nmsrs = save_nmsrs; - -#ifdef CONFIG_X86_64 - vmx->msr_offset_kernel_gs_base = - __find_msr_index(vmx, MSR_KERNEL_GS_BASE); -#endif - vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); -} - -/* - * reads and returns guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset -- 21.3 - */ -static u64 guest_read_tsc(void) -{ - u64 host_tsc, tsc_offset; - - rdtscll(host_tsc); - tsc_offset = vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; -} - -/* - * writes 'guest_tsc' into guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc - */ -static void guest_write_tsc(u64 guest_tsc) -{ - u64 host_tsc; - - rdtscll(host_tsc); - vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); -} - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - u64 data; - struct kvm_msr_entry *msr; - - if (!pdata) { - printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); - return -EINVAL; - } - - switch (msr_index) { -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - data = vmcs_readl(GUEST_FS_BASE); - break; - case MSR_GS_BASE: - data = vmcs_readl(GUEST_GS_BASE); - break; - case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_index, pdata); -#endif - case MSR_IA32_TIME_STAMP_COUNTER: - data = guest_read_tsc(); - break; - case MSR_IA32_SYSENTER_CS: - data = vmcs_read32(GUEST_SYSENTER_CS); - break; - case MSR_IA32_SYSENTER_EIP: - data = vmcs_readl(GUEST_SYSENTER_EIP); - break; - case MSR_IA32_SYSENTER_ESP: - data = vmcs_readl(GUEST_SYSENTER_ESP); - break; - default: - msr = find_msr_entry(to_vmx(vcpu), msr_index); - if (msr) { - data = msr->data; - break; - } - return kvm_get_msr_common(vcpu, msr_index, pdata); - } - - *pdata = data; - return 0; -} - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr; - int ret = 0; - - switch (msr_index) { -#ifdef CONFIG_X86_64 - case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); - load_transition_efer(vmx); - } - break; - case MSR_FS_BASE: - vmcs_writel(GUEST_FS_BASE, data); - break; - case MSR_GS_BASE: - vmcs_writel(GUEST_GS_BASE, data); - break; -#endif - case MSR_IA32_SYSENTER_CS: - vmcs_write32(GUEST_SYSENTER_CS, data); - break; - case MSR_IA32_SYSENTER_EIP: - vmcs_writel(GUEST_SYSENTER_EIP, data); - break; - case MSR_IA32_SYSENTER_ESP: - vmcs_writel(GUEST_SYSENTER_ESP, data); - break; - case MSR_IA32_TIME_STAMP_COUNTER: - guest_write_tsc(data); - break; - default: - msr = find_msr_entry(vmx, msr_index); - if (msr) { - msr->data = data; - if (vmx->host_state.loaded) - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - break; - } - ret = kvm_set_msr_common(vcpu, msr_index, data); - } - - return ret; -} - -/* - * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. - */ -static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); -} - -/* - * Syncs rsp and rip back into the vmcs. Should be called after possible - * modification. - */ -static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) -{ - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); -} - -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) -{ - unsigned long dr7 = 0x400; - int old_singlestep; - - old_singlestep = vcpu->guest_debug.singlestep; - - vcpu->guest_debug.enabled = dbg->enabled; - if (vcpu->guest_debug.enabled) { - int i; - - dr7 |= 0x200; /* exact */ - for (i = 0; i < 4; ++i) { - if (!dbg->breakpoints[i].enabled) - continue; - vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; - dr7 |= 2 << (i*2); /* global enable */ - dr7 |= 0 << (i*4+16); /* execution breakpoint */ - } - - vcpu->guest_debug.singlestep = dbg->singlestep; - } else - vcpu->guest_debug.singlestep = 0; - - if (old_singlestep && !vcpu->guest_debug.singlestep) { - unsigned long flags; - - flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - } - - update_exception_bitmap(vcpu); - vmcs_writel(GUEST_DR7, dr7); - - return 0; -} - -static int vmx_get_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field; - - idtv_info_field = vmx->idt_vectoring_info; - if (idtv_info_field & INTR_INFO_VALID_MASK) { - if (is_external_interrupt(idtv_info_field)) - return idtv_info_field & VECTORING_INFO_VECTOR_MASK; - else - printk(KERN_DEBUG "pending exception: not handled yet\n"); - } - return -1; -} - -static __init int cpu_has_kvm_support(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - -static __init int vmx_disabled_by_bios(void) -{ - u64 msr; - - rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - == MSR_IA32_FEATURE_CONTROL_LOCKED; - /* locked but not enabled */ -} - -static void hardware_enable(void *garbage) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old; - - rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - != (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); - write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) - : "memory", "cc"); -} - -static void hardware_disable(void *garbage) -{ - asm volatile (ASM_VMX_VMXOFF : : : "cc"); -} - -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 ctl = ctl_min | ctl_opt; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - - ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ - ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ - - /* Ensure minimum (required) set of control bits are supported. */ - if (ctl_min & ~ctl) - return -EIO; - - *result = ctl; - return 0; -} - -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 min, opt; - u32 _pin_based_exec_control = 0; - u32 _cpu_based_exec_control = 0; - u32 _cpu_based_2nd_exec_control = 0; - u32 _vmexit_control = 0; - u32 _vmentry_control = 0; - - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = 0; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - - min = CPU_BASED_HLT_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_USE_IO_BITMAPS | - CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING; - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; -#ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & - ~CPU_BASED_CR8_STORE_EXITING; -#endif - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; - } -#ifndef CONFIG_X86_64 - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; -#endif - - min = 0; -#ifdef CONFIG_X86_64 - min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; -#endif - opt = 0; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) - return -EIO; - - min = opt = 0; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) - return -EIO; - - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); - - /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) - return -EIO; - -#ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) - return -EIO; -#endif - - /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) - return -EIO; - - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_config.size); - vmcs_conf->revision_id = vmx_msr_low; - - vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; - vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; - vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; - vmcs_conf->vmexit_ctrl = _vmexit_control; - vmcs_conf->vmentry_ctrl = _vmentry_control; - - return 0; -} - -static struct vmcs *alloc_vmcs_cpu(int cpu) -{ - int node = cpu_to_node(cpu); - struct page *pages; - struct vmcs *vmcs; - - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); - if (!pages) - return NULL; - vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); - vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ - return vmcs; -} - -static struct vmcs *alloc_vmcs(void) -{ - return alloc_vmcs_cpu(raw_smp_processor_id()); -} - -static void free_vmcs(struct vmcs *vmcs) -{ - free_pages((unsigned long)vmcs, vmcs_config.order); -} - -static void free_kvm_area(void) -{ - int cpu; - - for_each_online_cpu(cpu) - free_vmcs(per_cpu(vmxarea, cpu)); -} - -static __init int alloc_kvm_area(void) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct vmcs *vmcs; - - vmcs = alloc_vmcs_cpu(cpu); - if (!vmcs) { - free_kvm_area(); - return -ENOMEM; - } - - per_cpu(vmxarea, cpu) = vmcs; - } - return 0; -} - -static __init int hardware_setup(void) -{ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - return alloc_kvm_area(); -} - -static __exit void hardware_unsetup(void) -{ - free_kvm_area(); -} - -static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { - vmcs_write16(sf->selector, save->selector); - vmcs_writel(sf->base, save->base); - vmcs_write32(sf->limit, save->limit); - vmcs_write32(sf->ar_bytes, save->ar); - } else { - u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) - << AR_DPL_SHIFT; - vmcs_write32(sf->ar_bytes, 0x93 | dpl); - } -} - -static void enter_pmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - - vcpu->arch.rmode.active = 0; - - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); - - flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); - vmcs_writel(GUEST_RFLAGS, flags); - - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | - (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); - - update_exception_bitmap(vcpu); - - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); - - vmcs_write16(GUEST_SS_SELECTOR, 0); - vmcs_write32(GUEST_SS_AR_BYTES, 0x93); - - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); -} - -static gva_t rmode_tss_base(struct kvm *kvm) -{ - if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; - return base_gfn << PAGE_SHIFT; - } - return kvm->arch.tss_addr; -} - -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - save->selector = vmcs_read16(sf->selector); - save->base = vmcs_readl(sf->base); - save->limit = vmcs_read32(sf->limit); - save->ar = vmcs_read32(sf->ar_bytes); - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xfffff); - vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); -} - -static void enter_rmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - - vcpu->arch.rmode.active = 1; - - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); - vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - - flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - - vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); - update_exception_bitmap(vcpu); - - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); - - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); - - kvm_mmu_reset_context(vcpu); - init_rmode_tss(vcpu->kvm); -} - -#ifdef CONFIG_X86_64 - -static void enter_lmode(struct kvm_vcpu *vcpu) -{ - u32 guest_tr_ar; - - guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { - printk(KERN_DEBUG "%s: tss fixup for long mode. \n", - __FUNCTION__); - vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~AR_TYPE_MASK) - | AR_TYPE_BUSY_64_TSS); - } - - vcpu->arch.shadow_efer |= EFER_LMA; - - find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - | VM_ENTRY_IA32E_MODE); -} - -static void exit_lmode(struct kvm_vcpu *vcpu) -{ - vcpu->arch.shadow_efer &= ~EFER_LMA; - - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_IA32E_MODE); -} - -#endif - -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; -} - -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - vmx_fpu_deactivate(vcpu); - - if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) - enter_pmode(vcpu); - - if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); - -#ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) - enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) - exit_lmode(vcpu); - } -#endif - - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); -} - -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - vmcs_writel(GUEST_CR3, cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); -} - -static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); - vcpu->arch.cr4 = cr4; -} - -#ifdef CONFIG_X86_64 - -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - vcpu->arch.shadow_efer = efer; - if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); - msr->data = efer; - - } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - -#endif - -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - return vmcs_readl(sf->base); -} - -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - u32 ar; - - var->base = vmcs_readl(sf->base); - var->limit = vmcs_read32(sf->limit); - var->selector = vmcs_read16(sf->selector); - ar = vmcs_read32(sf->ar_bytes); - if (ar & AR_UNUSABLE_MASK) - ar = 0; - var->type = ar & 15; - var->s = (ar >> 4) & 1; - var->dpl = (ar >> 5) & 3; - var->present = (ar >> 7) & 1; - var->avl = (ar >> 12) & 1; - var->l = (ar >> 13) & 1; - var->db = (ar >> 14) & 1; - var->g = (ar >> 15) & 1; - var->unusable = (ar >> 16) & 1; -} - -static u32 vmx_segment_access_rights(struct kvm_segment *var) -{ - u32 ar; - - if (var->unusable) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } - if (ar == 0) /* a 0 value means unusable */ - ar = AR_UNUSABLE_MASK; - - return ar; -} - -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - u32 ar; - - if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); - return; - } - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.active && var->s) { - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else - ar = vmx_segment_access_rights(var); - vmcs_write32(sf->ar_bytes, ar); -} - -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); - - *db = (ar >> 14) & 1; - *l = (ar >> 13) & 1; -} - -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); - dt->base = vmcs_readl(GUEST_IDTR_BASE); -} - -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); - vmcs_writel(GUEST_IDTR_BASE, dt->base); -} - -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); - dt->base = vmcs_readl(GUEST_GDTR_BASE); -} - -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) -{ - vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); - vmcs_writel(GUEST_GDTR_BASE, dt->base); -} - -static int init_rmode_tss(struct kvm *kvm) -{ - gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; - u16 data = 0; - int r; - - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - return 0; - data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); - if (r < 0) - return 0; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - return 0; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - return 0; - data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); - if (r < 0) - return 0; - return 1; -} - -static void seg_setup(int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - vmcs_write16(sf->selector, 0); - vmcs_writel(sf->base, 0); - vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0x93); -} - -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct kvm_userspace_memory_region kvm_userspace_mem; - int r = 0; - - mutex_lock(&kvm->lock); - if (kvm->arch.apic_access_page) - goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * Sets up the vmcs for emulated real mode. - */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ - u32 host_sysenter_cs; - u32 junk; - unsigned long a; - struct descriptor_table dt; - int i; - unsigned long kvm_vmx_return; - u32 exec_control; - - /* I/O */ - vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - - /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); - - exec_control = vmcs_config.cpu_based_exec_ctrl; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - - vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif - - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - get_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); - vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - - rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); - vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); - rdmsrl(MSR_IA32_SYSENTER_ESP, a); - vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ - rdmsrl(MSR_IA32_SYSENTER_EIP, a); - vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ - - for (i = 0; i < NR_VMX_MSR; ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - u64 data; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - data = data_low | ((u64)data_high << 32); - vmx->host_msrs[j].index = index; - vmx->host_msrs[j].reserved = 0; - vmx->host_msrs[j].data = data; - vmx->guest_msrs[j] = vmx->host_msrs[j]; - ++vmx->nmsrs; - } - - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); - - /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); - - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) - return -ENOMEM; - - return 0; -} - -static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; - int ret; - - if (!init_rmode_tss(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } - - vmx->vcpu.arch.rmode.active = 0; - - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); - - fx_init(&vmx->vcpu); - - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. - */ - if (vmx->vcpu.vcpu_id == 0) { - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); - } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - - vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); - else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); - - /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ - vmcs_writel(GUEST_DR7, 0x400); - - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, 0); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - - guest_write_tsc(0); - - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - - setup_msrs(vmx); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - - if (cpu_has_vmx_tpr_shadow()) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); - vmcs_write32(TPR_THRESHOLD, 0); - } - - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); - - vmx->vcpu.arch.cr0 = 0x60000010; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 - vmx_set_efer(&vmx->vcpu, 0); -#endif - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); - - return 0; - -out: - return ret; -} - -static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vcpu->arch.rmode.active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); - return; - } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); -} - -static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - vmx_inject_irq(vcpu, irq); -} - - -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - u32 cpu_based_vm_exec_control; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ - kvm_do_inject_irq(vcpu); - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - /* - * Interrupts blocked. Wait for unblock. - */ - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - else - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = 8, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - - ret = kvm_set_memory_region(kvm, &tss_mem, 0); - if (ret) - return ret; - kvm->arch.tss_addr = addr; - return 0; -} - -static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) -{ - struct kvm_guest_debug *dbg = &vcpu->guest_debug; - - set_debugreg(dbg->bp[0], 0); - set_debugreg(dbg->bp[1], 1); - set_debugreg(dbg->bp[2], 2); - set_debugreg(dbg->bp[3], 3); - - if (dbg->singlestep) { - unsigned long flags; - - flags = vmcs_readl(GUEST_RFLAGS); - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - vmcs_writel(GUEST_RFLAGS, flags); - } -} - -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) -{ - if (!vcpu->arch.rmode.active) - return 0; - - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. - */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) - return 1; - return 0; -} - -static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info, error_code; - unsigned long cr2, rip; - u32 vect_info; - enum emulation_result er; - - vect_info = vmx->idt_vectoring_info; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) - printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); - - if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { - int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); - } - - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ - return 1; /* already handled by vmx_vcpu_run() */ - - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - - if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - error_code = 0; - rip = vmcs_readl(GUEST_RIP); - if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) - error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); - return kvm_mmu_page_fault(vcpu, cr2, error_code); - } - - if (vcpu->arch.rmode.active && - handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); - } - return 1; - } - - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_EXCEPTION | 1)) { - kvm_run->exit_reason = KVM_EXIT_DEBUG; - return 0; - } - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; - kvm_run->ex.error_code = error_code; - return 0; -} - -static int handle_external_interrupt(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - ++vcpu->stat.irq_exits; - return 1; -} - -static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; - return 0; -} - -static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - unsigned long exit_qualification; - int size, down, in, string, rep; - unsigned port; - - ++vcpu->stat.io_exits; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - string = (exit_qualification & 16) != 0; - - if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) - return 0; - return 1; - } - - size = (exit_qualification & 7) + 1; - in = (exit_qualification & 8) != 0; - down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - rep = (exit_qualification & 32) != 0; - port = exit_qualification >> 16; - - return kvm_emulate_pio(vcpu, kvm_run, in, size, port); -} - -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xc1; -} - -static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - unsigned long exit_qualification; - int cr; - int reg; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - cr = exit_qualification & 15; - reg = (exit_qualification >> 8) & 15; - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - switch (cr) { - case 0: - vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->arch.regs[reg]); - skip_emulated_instruction(vcpu); - return 1; - case 3: - vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->arch.regs[reg]); - skip_emulated_instruction(vcpu); - return 1; - case 4: - vcpu_load_rsp_rip(vcpu); - set_cr4(vcpu, vcpu->arch.regs[reg]); - skip_emulated_instruction(vcpu); - return 1; - case 8: - vcpu_load_rsp_rip(vcpu); - set_cr8(vcpu, vcpu->arch.regs[reg]); - skip_emulated_instruction(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; - }; - break; - case 2: /* clts */ - vcpu_load_rsp_rip(vcpu); - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); - skip_emulated_instruction(vcpu); - return 1; - case 1: /*mov from cr*/ - switch (cr) { - case 3: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; - vcpu_put_rsp_rip(vcpu); - skip_emulated_instruction(vcpu); - return 1; - case 8: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = get_cr8(vcpu); - vcpu_put_rsp_rip(vcpu); - skip_emulated_instruction(vcpu); - return 1; - } - break; - case 3: /* lmsw */ - lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); - - skip_emulated_instruction(vcpu); - return 1; - default: - break; - } - kvm_run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", - (int)(exit_qualification >> 4) & 3, cr); - return 0; -} - -static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - unsigned long exit_qualification; - unsigned long val; - int dr, reg; - - /* - * FIXME: this code assumes the host is debugging the guest. - * need to deal with guest debugging itself too. - */ - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & 7; - reg = (exit_qualification >> 8) & 15; - vcpu_load_rsp_rip(vcpu); - if (exit_qualification & 16) { - /* mov from dr */ - switch (dr) { - case 6: - val = 0xffff0ff0; - break; - case 7: - val = 0x400; - break; - default: - val = 0; - } - vcpu->arch.regs[reg] = val; - } else { - /* mov to dr */ - } - vcpu_put_rsp_rip(vcpu); - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - kvm_emulate_cpuid(vcpu); - return 1; -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data; - - if (vmx_get_msr(vcpu, ecx, &data)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - - if (vmx_set_msr(vcpu, ecx, data) != 0) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - return 1; -} - -static int handle_interrupt_window(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - u32 cpu_based_vm_exec_control; - - /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (kvm_run->request_interrupt_window && - !vcpu->arch.irq_summary) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++vcpu->stat.irq_window_exits; - return 0; - } - return 1; -} - -static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - skip_emulated_instruction(vcpu); - return kvm_emulate_halt(vcpu); -} - -static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - skip_emulated_instruction(vcpu); - kvm_emulate_hypercall(vcpu); - return 1; -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ - return 1; -} - -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - u64 exit_qualification; - enum emulation_result er; - unsigned long offset; - - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOTSUPP; - } - return 1; -} - -/* - * The exit handlers return 1 if the exit was handled fully and guest execution - * may resume. Otherwise they set the kvm_run parameter to indicate what needs - * to be done to userspace and return 0. - */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) = { - [EXIT_REASON_EXCEPTION_NMI] = handle_exception, - [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, - [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, - [EXIT_REASON_IO_INSTRUCTION] = handle_io, - [EXIT_REASON_CR_ACCESS] = handle_cr, - [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, - [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, - [EXIT_REASON_APIC_ACCESS] = handle_apic_access, - [EXIT_REASON_WBINVD] = handle_wbinvd, -}; - -static const int kvm_vmx_max_exit_handlers = - ARRAY_SIZE(kvm_vmx_exit_handlers); - -/* - * The guest has exited. See if we can fix it or if we need userspace - * assistance. - */ -static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) -{ - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vectoring_info = vmx->idt_vectoring_info; - - if (unlikely(vmx->fail)) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); - return 0; - } - - if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI) - printk(KERN_WARNING "%s: unexpected, valid vectoring info and " - "exit reason is 0x%x\n", __FUNCTION__, exit_reason); - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); - else { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_reason; - } - return 0; -} - -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ -} - -static void update_tpr_threshold(struct kvm_vcpu *vcpu) -{ - int max_irr, tpr; - - if (!vm_need_tpr_shadow(vcpu->kvm)) - return; - - if (!kvm_lapic_enabled(vcpu) || - ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; - vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); -} - -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static void vmx_intr_assist(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field; - int has_ext_irq, interrupt_window_open; - int vector; - - update_tpr_threshold(vcpu); - - has_ext_irq = kvm_cpu_has_interrupt(vcpu); - intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; - if (intr_info_field & INTR_INFO_VALID_MASK) { - if (idtv_info_field & INTR_INFO_VALID_MASK) { - /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); - } - if (has_ext_irq) - enable_irq_window(vcpu); - return; - } - if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - - vmx_inject_irq(vcpu, vect); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); - return; - } - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - - if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); - return; - } - if (!has_ext_irq) - return; - interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (interrupt_window_open) { - vector = kvm_cpu_get_interrupt(vcpu); - vmx_inject_irq(vcpu, vector); - kvm_timer_intr_post(vcpu, vector); - } else - enable_irq_window(vcpu); -} - -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. - */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) -{ - vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) - return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - vmx->idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - -static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info; - - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - - asm( - /* Store host registers */ -#ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" - "push %%rcx \n\t" -#else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif - ASM_VMX_VMWRITE_RSP_RDX "\n\t" - /* Check if vmlaunch of vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" - /* Load guest registers. Don't clobber flags. */ -#ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" - "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ -#else - "mov %c[cr2](%0), %%eax \n\t" - "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ -#endif - /* Enter guest mode */ - "jne .Llaunched \n\t" - ASM_VMX_VMLAUNCH "\n\t" - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " ASM_VMX_VMRESUME "\n\t" - ".Lkvm_vmx_return: " - /* Save guest registers, load host registers, keep flags */ -#ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" - "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" - - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" -#else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" - "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" - - "pop %%ebp; pop %%ebp; pop %%edx \n\t" -#endif - "setbe %c[fail](%0) \n\t" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), -#endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" -#endif - ); - - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); - - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; - - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - vmx->launched = 1; - - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ - asm("int $2"); -} - -static void vmx_free_vmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 0, 1); - free_vmcs(vmx->vmcs); - vmx->vmcs = NULL; - } -} - -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vmx_free_vmcs(vcpu); - kfree(vmx->host_msrs); - kfree(vmx->guest_msrs); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vmx); -} - -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) -{ - int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - int cpu; - - if (!vmx) - return ERR_PTR(-ENOMEM); - - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); - if (err) - goto free_vcpu; - - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vmx->guest_msrs) { - err = -ENOMEM; - goto uninit_vcpu; - } - - vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vmx->host_msrs) - goto free_guest_msrs; - - vmx->vmcs = alloc_vmcs(); - if (!vmx->vmcs) - goto free_msrs; - - vmcs_clear(vmx->vmcs); - - cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - err = vmx_vcpu_setup(vmx); - vmx_vcpu_put(&vmx->vcpu); - put_cpu(); - if (err) - goto free_vmcs; - - return &vmx->vcpu; - -free_vmcs: - free_vmcs(vmx->vmcs); -free_msrs: - kfree(vmx->host_msrs); -free_guest_msrs: - kfree(vmx->guest_msrs); -uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); -} - -static void __init vmx_check_processor_compat(void *rtn) -{ - struct vmcs_config vmcs_conf; - - *(int *)rtn = 0; - if (setup_vmcs_config(&vmcs_conf) < 0) - *(int *)rtn = -EIO; - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - *(int *)rtn = -EIO; - } -} - -static struct kvm_x86_ops vmx_x86_ops = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .hardware_setup = hardware_setup, - .hardware_unsetup = hardware_unsetup, - .check_processor_compatibility = vmx_check_processor_compat, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_guest_switch = vmx_save_host_state, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - .vcpu_decache = vmx_vcpu_decache, - - .set_guest_debug = set_guest_debug, - .guest_debug_pre = kvm_guest_debug_pre, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, - .set_cr0 = vmx_set_cr0, - .set_cr3 = vmx_set_cr3, - .set_cr4 = vmx_set_cr4, -#ifdef CONFIG_X86_64 - .set_efer = vmx_set_efer, -#endif - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .cache_regs = vcpu_load_rsp_rip, - .decache_regs = vcpu_put_rsp_rip, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - - .tlb_flush = vmx_flush_tlb, - - .run = vmx_vcpu_run, - .handle_exit = kvm_handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, - .patch_hypercall = vmx_patch_hypercall, - .get_irq = vmx_get_irq, - .set_irq = vmx_inject_irq, - .queue_exception = vmx_queue_exception, - .exception_injected = vmx_exception_injected, - .inject_pending_irq = vmx_intr_assist, - .inject_pending_vectors = do_interrupt_requests, - - .set_tss_addr = vmx_set_tss_addr, -}; - -static int __init vmx_init(void) -{ - void *iova; - int r; - - vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!vmx_io_bitmap_a) - return -ENOMEM; - - vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!vmx_io_bitmap_b) { - r = -ENOMEM; - goto out; - } - - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ - iova = kmap(vmx_io_bitmap_a); - memset(iova, 0xff, PAGE_SIZE); - clear_bit(0x80, iova); - kunmap(vmx_io_bitmap_a); - - iova = kmap(vmx_io_bitmap_b); - memset(iova, 0xff, PAGE_SIZE); - kunmap(vmx_io_bitmap_b); - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); - if (r) - goto out1; - - if (bypass_guest_pf) - kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - - return 0; - -out1: - __free_page(vmx_io_bitmap_b); -out: - __free_page(vmx_io_bitmap_a); - return r; -} - -static void __exit vmx_exit(void) -{ - __free_page(vmx_io_bitmap_b); - __free_page(vmx_io_bitmap_a); - - kvm_exit(); -} - -module_init(vmx_init) -module_exit(vmx_exit) diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h deleted file mode 100644 index d52ae8d7303..00000000000 --- a/drivers/kvm/vmx.h +++ /dev/null @@ -1,324 +0,0 @@ -#ifndef VMX_H -#define VMX_H - -/* - * vmx.h: VMX Architecture related definitions - * Copyright (c) 2004, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * A few random additions are: - * Copyright (C) 2006 Qumranet - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - */ - -/* - * Definitions of Primary Processor-Based VM-Execution Controls. - */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 -#define CPU_BASED_HLT_EXITING 0x00000080 -#define CPU_BASED_INVLPG_EXITING 0x00000200 -#define CPU_BASED_MWAIT_EXITING 0x00000400 -#define CPU_BASED_RDPMC_EXITING 0x00000800 -#define CPU_BASED_RDTSC_EXITING 0x00001000 -#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 -#define CPU_BASED_CR8_STORE_EXITING 0x00100000 -#define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_MOV_DR_EXITING 0x00800000 -#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 -#define CPU_BASED_USE_IO_BITMAPS 0x02000000 -#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 -#define CPU_BASED_MONITOR_EXITING 0x20000000 -#define CPU_BASED_PAUSE_EXITING 0x40000000 -#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 -/* - * Definitions of Secondary Processor-Based VM-Execution Controls. - */ -#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 -#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 - - -#define PIN_BASED_EXT_INTR_MASK 0x00000001 -#define PIN_BASED_NMI_EXITING 0x00000008 -#define PIN_BASED_VIRTUAL_NMIS 0x00000020 - -#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 -#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 - -#define VM_ENTRY_IA32E_MODE 0x00000200 -#define VM_ENTRY_SMM 0x00000400 -#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 - -/* VMCS Encodings */ -enum vmcs_field { - GUEST_ES_SELECTOR = 0x00000800, - GUEST_CS_SELECTOR = 0x00000802, - GUEST_SS_SELECTOR = 0x00000804, - GUEST_DS_SELECTOR = 0x00000806, - GUEST_FS_SELECTOR = 0x00000808, - GUEST_GS_SELECTOR = 0x0000080a, - GUEST_LDTR_SELECTOR = 0x0000080c, - GUEST_TR_SELECTOR = 0x0000080e, - HOST_ES_SELECTOR = 0x00000c00, - HOST_CS_SELECTOR = 0x00000c02, - HOST_SS_SELECTOR = 0x00000c04, - HOST_DS_SELECTOR = 0x00000c06, - HOST_FS_SELECTOR = 0x00000c08, - HOST_GS_SELECTOR = 0x00000c0a, - HOST_TR_SELECTOR = 0x00000c0c, - IO_BITMAP_A = 0x00002000, - IO_BITMAP_A_HIGH = 0x00002001, - IO_BITMAP_B = 0x00002002, - IO_BITMAP_B_HIGH = 0x00002003, - MSR_BITMAP = 0x00002004, - MSR_BITMAP_HIGH = 0x00002005, - VM_EXIT_MSR_STORE_ADDR = 0x00002006, - VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, - VM_EXIT_MSR_LOAD_ADDR = 0x00002008, - VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, - VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, - VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, - TSC_OFFSET = 0x00002010, - TSC_OFFSET_HIGH = 0x00002011, - VIRTUAL_APIC_PAGE_ADDR = 0x00002012, - VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, - APIC_ACCESS_ADDR = 0x00002014, - APIC_ACCESS_ADDR_HIGH = 0x00002015, - VMCS_LINK_POINTER = 0x00002800, - VMCS_LINK_POINTER_HIGH = 0x00002801, - GUEST_IA32_DEBUGCTL = 0x00002802, - GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, - PIN_BASED_VM_EXEC_CONTROL = 0x00004000, - CPU_BASED_VM_EXEC_CONTROL = 0x00004002, - EXCEPTION_BITMAP = 0x00004004, - PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, - PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, - CR3_TARGET_COUNT = 0x0000400a, - VM_EXIT_CONTROLS = 0x0000400c, - VM_EXIT_MSR_STORE_COUNT = 0x0000400e, - VM_EXIT_MSR_LOAD_COUNT = 0x00004010, - VM_ENTRY_CONTROLS = 0x00004012, - VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, - VM_ENTRY_INTR_INFO_FIELD = 0x00004016, - VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, - VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, - TPR_THRESHOLD = 0x0000401c, - SECONDARY_VM_EXEC_CONTROL = 0x0000401e, - VM_INSTRUCTION_ERROR = 0x00004400, - VM_EXIT_REASON = 0x00004402, - VM_EXIT_INTR_INFO = 0x00004404, - VM_EXIT_INTR_ERROR_CODE = 0x00004406, - IDT_VECTORING_INFO_FIELD = 0x00004408, - IDT_VECTORING_ERROR_CODE = 0x0000440a, - VM_EXIT_INSTRUCTION_LEN = 0x0000440c, - VMX_INSTRUCTION_INFO = 0x0000440e, - GUEST_ES_LIMIT = 0x00004800, - GUEST_CS_LIMIT = 0x00004802, - GUEST_SS_LIMIT = 0x00004804, - GUEST_DS_LIMIT = 0x00004806, - GUEST_FS_LIMIT = 0x00004808, - GUEST_GS_LIMIT = 0x0000480a, - GUEST_LDTR_LIMIT = 0x0000480c, - GUEST_TR_LIMIT = 0x0000480e, - GUEST_GDTR_LIMIT = 0x00004810, - GUEST_IDTR_LIMIT = 0x00004812, - GUEST_ES_AR_BYTES = 0x00004814, - GUEST_CS_AR_BYTES = 0x00004816, - GUEST_SS_AR_BYTES = 0x00004818, - GUEST_DS_AR_BYTES = 0x0000481a, - GUEST_FS_AR_BYTES = 0x0000481c, - GUEST_GS_AR_BYTES = 0x0000481e, - GUEST_LDTR_AR_BYTES = 0x00004820, - GUEST_TR_AR_BYTES = 0x00004822, - GUEST_INTERRUPTIBILITY_INFO = 0x00004824, - GUEST_ACTIVITY_STATE = 0X00004826, - GUEST_SYSENTER_CS = 0x0000482A, - HOST_IA32_SYSENTER_CS = 0x00004c00, - CR0_GUEST_HOST_MASK = 0x00006000, - CR4_GUEST_HOST_MASK = 0x00006002, - CR0_READ_SHADOW = 0x00006004, - CR4_READ_SHADOW = 0x00006006, - CR3_TARGET_VALUE0 = 0x00006008, - CR3_TARGET_VALUE1 = 0x0000600a, - CR3_TARGET_VALUE2 = 0x0000600c, - CR3_TARGET_VALUE3 = 0x0000600e, - EXIT_QUALIFICATION = 0x00006400, - GUEST_LINEAR_ADDRESS = 0x0000640a, - GUEST_CR0 = 0x00006800, - GUEST_CR3 = 0x00006802, - GUEST_CR4 = 0x00006804, - GUEST_ES_BASE = 0x00006806, - GUEST_CS_BASE = 0x00006808, - GUEST_SS_BASE = 0x0000680a, - GUEST_DS_BASE = 0x0000680c, - GUEST_FS_BASE = 0x0000680e, - GUEST_GS_BASE = 0x00006810, - GUEST_LDTR_BASE = 0x00006812, - GUEST_TR_BASE = 0x00006814, - GUEST_GDTR_BASE = 0x00006816, - GUEST_IDTR_BASE = 0x00006818, - GUEST_DR7 = 0x0000681a, - GUEST_RSP = 0x0000681c, - GUEST_RIP = 0x0000681e, - GUEST_RFLAGS = 0x00006820, - GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, - GUEST_SYSENTER_ESP = 0x00006824, - GUEST_SYSENTER_EIP = 0x00006826, - HOST_CR0 = 0x00006c00, - HOST_CR3 = 0x00006c02, - HOST_CR4 = 0x00006c04, - HOST_FS_BASE = 0x00006c06, - HOST_GS_BASE = 0x00006c08, - HOST_TR_BASE = 0x00006c0a, - HOST_GDTR_BASE = 0x00006c0c, - HOST_IDTR_BASE = 0x00006c0e, - HOST_IA32_SYSENTER_ESP = 0x00006c10, - HOST_IA32_SYSENTER_EIP = 0x00006c12, - HOST_RSP = 0x00006c14, - HOST_RIP = 0x00006c16, -}; - -#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 - -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 - -#define EXIT_REASON_PENDING_INTERRUPT 7 - -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_WBINVD 54 - -/* - * Interruption-information format - */ -#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ -#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ -#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ -#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ - -#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK -#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK -#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK -#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK - -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ - -/* - * Exit Qualifications for MOV for Control Register Access - */ -#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ -#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ -#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ -#define LMSW_SOURCE_DATA_SHIFT 16 -#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ -#define REG_EAX (0 << 8) -#define REG_ECX (1 << 8) -#define REG_EDX (2 << 8) -#define REG_EBX (3 << 8) -#define REG_ESP (4 << 8) -#define REG_EBP (5 << 8) -#define REG_ESI (6 << 8) -#define REG_EDI (7 << 8) -#define REG_R8 (8 << 8) -#define REG_R9 (9 << 8) -#define REG_R10 (10 << 8) -#define REG_R11 (11 << 8) -#define REG_R12 (12 << 8) -#define REG_R13 (13 << 8) -#define REG_R14 (14 << 8) -#define REG_R15 (15 << 8) - -/* - * Exit Qualifications for MOV for Debug Register Access - */ -#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ -#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ -#define TYPE_MOV_TO_DR (0 << 4) -#define TYPE_MOV_FROM_DR (1 << 4) -#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ - - -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 - -#define MSR_IA32_VMX_BASIC 0x480 -#define MSR_IA32_VMX_PINBASED_CTLS 0x481 -#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 -#define MSR_IA32_VMX_EXIT_CTLS 0x483 -#define MSR_IA32_VMX_ENTRY_CTLS 0x484 -#define MSR_IA32_VMX_MISC 0x485 -#define MSR_IA32_VMX_CR0_FIXED0 0x486 -#define MSR_IA32_VMX_CR0_FIXED1 0x487 -#define MSR_IA32_VMX_CR4_FIXED0 0x488 -#define MSR_IA32_VMX_CR4_FIXED1 0x489 -#define MSR_IA32_VMX_VMCS_ENUM 0x48a -#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b - -#define MSR_IA32_FEATURE_CONTROL 0x3a -#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 -#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 - -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 - -#endif diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c deleted file mode 100644 index b37c0093d72..00000000000 --- a/drivers/kvm/x86.c +++ /dev/null @@ -1,3148 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * derived from drivers/kvm/kvm_main.c - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "kvm.h" -#include "x86.h" -#include "x86_emulate.h" -#include "segment_descriptor.h" -#include "irq.h" -#include "mmu.h" - -#include <linux/kvm.h> -#include <linux/fs.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/mman.h> -#include <linux/highmem.h> - -#include <asm/uaccess.h> -#include <asm/msr.h> - -#define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -#define EFER_RESERVED_BITS 0xfffffffffffff2fe - -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU - -struct kvm_x86_ops *kvm_x86_ops; - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { NULL } -}; - - -unsigned long segment_base(u16 selector) -{ - struct descriptor_table gdt; - struct segment_descriptor *d; - unsigned long table_base; - unsigned long v; - - if (selector == 0) - return 0; - - asm("sgdt %0" : "=m"(gdt)); - table_base = gdt.base; - - if (selector & 4) { /* from ldt */ - u16 ldt_selector; - - asm("sldt %0" : "=g"(ldt_selector)); - table_base = segment_base(ldt_selector); - } - d = (struct segment_descriptor *)(table_base + (selector & ~7)); - v = d->base_low | ((unsigned long)d->base_mid << 16) | - ((unsigned long)d->base_high << 24); -#ifdef CONFIG_X86_64 - if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long) \ - ((struct segment_descriptor_64 *)d)->base_higher) << 32; -#endif - return v; -} -EXPORT_SYMBOL_GPL(segment_base); - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) -{ - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_get_apic_base); - -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) -{ - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; -} -EXPORT_SYMBOL_GPL(kvm_set_apic_base); - -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; -} -EXPORT_SYMBOL_GPL(kvm_queue_exception); - -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) -{ - ++vcpu->stat.pf_guest; - if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - } - vcpu->arch.cr2 = addr; - kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); -} - -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; -} -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); - -static void __queue_exception(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); -} - -/* - * Load the pae pdptrs. Return true is they are all valid. - */ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; - int i; - int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - - mutex_lock(&vcpu->kvm->lock); - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); - if (ret < 0) { - ret = 0; - goto out; - } - for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { - ret = 0; - goto out; - } - } - ret = 1; - - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); -out: - mutex_unlock(&vcpu->kvm->lock); - - return ret; -} - -static bool pdptrs_changed(struct kvm_vcpu *vcpu) -{ - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - bool changed = true; - int r; - - if (is_long_mode(vcpu) || !is_pae(vcpu)) - return false; - - mutex_lock(&vcpu->kvm->lock); - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); - if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; -out: - mutex_unlock(&vcpu->kvm->lock); - - return changed; -} - -void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (cr0 & CR0_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); - kvm_inject_gp(vcpu, 0); - return; - } - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); - kvm_inject_gp(vcpu, 0); - return; - } - - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); - kvm_inject_gp(vcpu, 0); - return; - } - - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { -#ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { - int cs_db, cs_l; - - if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); - kvm_inject_gp(vcpu, 0); - return; - } - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); - kvm_inject_gp(vcpu, 0); - return; - - } - } else -#endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - - } - - kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; - - mutex_lock(&vcpu->kvm->lock); - kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); - return; -} -EXPORT_SYMBOL_GPL(set_cr0); - -void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) -{ - set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); -} -EXPORT_SYMBOL_GPL(lmsw); - -void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - - if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); - kvm_inject_gp(vcpu, 0); - return; - } - } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - - if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); - kvm_inject_gp(vcpu, 0); - return; - } - kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; - mutex_lock(&vcpu->kvm->lock); - kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); -} -EXPORT_SYMBOL_GPL(set_cr4); - -void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { - kvm_mmu_flush_tlb(vcpu); - return; - } - - if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } - - mutex_lock(&vcpu->kvm->lock); - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } - mutex_unlock(&vcpu->kvm->lock); -} -EXPORT_SYMBOL_GPL(set_cr3); - -void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) -{ - if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); - kvm_inject_gp(vcpu, 0); - return; - } - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_tpr(vcpu, cr8); - else - vcpu->arch.cr8 = cr8; -} -EXPORT_SYMBOL_GPL(set_cr8); - -unsigned long get_cr8(struct kvm_vcpu *vcpu) -{ - if (irqchip_in_kernel(vcpu->kvm)) - return kvm_lapic_get_cr8(vcpu); - else - return vcpu->arch.cr8; -} -EXPORT_SYMBOL_GPL(get_cr8); - -/* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. - */ -static u32 msrs_to_save[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TIME_STAMP_COUNTER, -}; - -static unsigned num_msrs_to_save; - -static u32 emulated_msrs[] = { - MSR_IA32_MISC_ENABLE, -}; - -#ifdef CONFIG_X86_64 - -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - if (efer & EFER_RESERVED_BITS) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - kvm_inject_gp(vcpu, 0); - return; - } - - if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - kvm_inject_gp(vcpu, 0); - return; - } - - kvm_x86_ops->set_efer(vcpu, efer); - - efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; - - vcpu->arch.shadow_efer = efer; -} - -#endif - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - return kvm_x86_ops->set_msr(vcpu, msr_index, data); -} - -/* - * Adapt set_msr() to msr_io()'s calling convention - */ -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_set_msr(vcpu, index, *data); -} - - -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { -#ifdef CONFIG_X86_64 - case MSR_EFER: - set_efer(vcpu, data); - break; -#endif - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case 0x200 ... 0x2ff: /* MTRRs */ - break; - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; - break; - default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_msr_common); - - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); -} - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; - - switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_UCODE_REV: - case MSR_IA32_PERF_STATUS: - case MSR_IA32_EBL_CR_POWERON: - /* MTRR registers */ - case 0xfe: - case 0x200 ... 0x2ff: - data = 0; - break; - case 0xcd: /* fsb frequency */ - data = 3; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; - break; -#ifdef CONFIG_X86_64 - case MSR_EFER: - data = vcpu->arch.shadow_efer; - break; -#endif - default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_msr_common); - -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. - */ -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, - struct kvm_msr_entry *entries, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data)) -{ - int i; - - vcpu_load(vcpu); - - for (i = 0; i < msrs->nmsrs; ++i) - if (do_msr(vcpu, entries[i].index, &entries[i].data)) - break; - - vcpu_put(vcpu); - - return i; -} - -/* - * Read or write a bunch of msrs. Parameters are user addresses. - * - * @return number of msrs set successfully. - */ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data), - int writeback) -{ - struct kvm_msrs msrs; - struct kvm_msr_entry *entries; - int r, n; - unsigned size; - - r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) - goto out; - - r = -E2BIG; - if (msrs.nmsrs >= MAX_IO_MSRS) - goto out; - - r = -ENOMEM; - size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = vmalloc(size); - if (!entries) - goto out; - - r = -EFAULT; - if (copy_from_user(entries, user_msrs->entries, size)) - goto out_free; - - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; - - r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; - -out_free: - vfree(entries); -out: - return r; -} - -/* - * Make sure that a cpu that is being hot-unplugged does not have any vcpus - * cached on it. - */ -void decache_vcpus_on_cpu(int cpu) -{ - struct kvm *vm; - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(vm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = vm->vcpus[i]; - if (!vcpu) - continue; - /* - * If the vcpu is locked, then it is running on some - * other cpu and therefore it is not cached on the - * cpu in question. - * - * If it's not locked, check the last cpu it executed - * on. - */ - if (mutex_trylock(&vcpu->mutex)) { - if (vcpu->cpu == cpu) { - kvm_x86_ops->vcpu_decache(vcpu); - vcpu->cpu = -1; - } - mutex_unlock(&vcpu->mutex); - } - } - spin_unlock(&kvm_lock); -} - -int kvm_dev_ioctl_check_extension(long ext) -{ - int r; - - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: - case KVM_CAP_SET_TSS_ADDR: - case KVM_CAP_EXT_CPUID: - r = 1; - break; - default: - r = 0; - break; - } - return r; - -} - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - long r; - - switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; - unsigned n; - - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < num_msrs_to_save) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) - goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), - &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) - goto out; - r = 0; - break; - } - default: - r = -EINVAL; - } -out: - return r; -} - -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->vcpu_load(vcpu, cpu); -} - -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); -} - -static int is_efer_nx(void) -{ - u64 efer; - - rdmsrl(MSR_EFER, efer); - return efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - -/* when an old userspace process fills a new kernel module */ -static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r, i; - struct kvm_cpuid_entry *cpuid_entries; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; -} - -static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - return 0; - -out: - return r; -} - -static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; - -out: - cpuid->nent = vcpu->arch.cpuid_nent; - return r; -} - -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) -{ - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; -} - -static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) -{ - const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | - bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | - bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); - const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | - bit(X86_FEATURE_SYSCALL) | - (bit(X86_FEATURE_NX) && is_efer_nx()) | -#ifdef CONFIG_X86_64 - bit(X86_FEATURE_LM) | -#endif - bit(X86_FEATURE_MMXEXT) | - bit(X86_FEATURE_3DNOWEXT) | - bit(X86_FEATURE_3DNOW); - const u32 kvm_supported_word3_x86_features = - bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); - const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); - - /* all func 2 cpuid_count() should be called on the same cpu */ - get_cpu(); - do_cpuid_1_ent(entry, function, index); - ++*nent; - - switch (function) { - case 0: - entry->eax = min(entry->eax, (u32)0xb); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word3_x86_features; - break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 and 0xb have additional index. */ - case 4: { - int index, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - cache_type = entry[index - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xb: { - int index, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - level_type = entry[index - 1].ecx & 0xff; - if (!level_type) - break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - entry->ecx &= kvm_supported_word6_x86_features; - break; - } - put_cpu(); -} - -static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG; - u32 func; - - if (cpuid->nent < 1) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); - if (!cpuid_entries) - goto out; - - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; -} - -static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); - - return 0; -} - -static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); - vcpu_put(vcpu); - - return 0; -} - -static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) -{ - if (irq->irq < 0 || irq->irq >= 256) - return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) - return -ENXIO; - vcpu_load(vcpu); - - set_bit(irq->irq, vcpu->arch.irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); - - vcpu_put(vcpu); - - return 0; -} - -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; - - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) - goto out; - r = 0; - break; - } - case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; - - r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) - goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; - if (r) - goto out; - r = 0; - break; - } - case KVM_INTERRUPT: { - struct kvm_interrupt irq; - - r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; - break; - } - case KVM_SET_CPUID: { - struct kvm_cpuid __user *cpuid_arg = argp; - struct kvm_cpuid cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_SET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_GET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); - break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); - break; - default: - r = -EINVAL; - } -out: - return r; -} - -static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) -{ - int ret; - - if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); - return ret; -} - -static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) -{ - if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) - return -EINVAL; - - mutex_lock(&kvm->lock); - - kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); - kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - - mutex_unlock(&kvm->lock); - return 0; -} - -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_alloc_mmu_pages; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. - */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) -{ - int r, n; - struct kvm_mem_alias *p; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - mutex_lock(&kvm->lock); - - p = &kvm->arch.aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; - - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) - break; - kvm->arch.naliases = n; - - kvm_mmu_zap_all(kvm); - - mutex_unlock(&kvm->lock); - - return 0; - -out: - return r; -} - -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, - ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); - break; - default: - r = -EINVAL; - break; - } - return r; -} - -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic_irqchip(kvm)); - return r; -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) -{ - int r; - int n; - struct kvm_memory_slot *memslot; - int is_dirty = 0; - - mutex_lock(&kvm->lock); - - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; - - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - kvm_mmu_slot_remove_write_access(kvm, log->slot); - kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); - } - r = 0; -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - int r = -EINVAL; - - switch (ioctl) { - case KVM_SET_TSS_ADDR: - r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; - break; - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } - case KVM_SET_NR_MMU_PAGES: - r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; - break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; - - r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); - if (r) - goto out; - break; - } - case KVM_CREATE_IRQCHIP: - r = -ENOMEM; - kvm->arch.vpic = kvm_create_pic(kvm); - if (kvm->arch.vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - kfree(kvm->arch.vpic); - kvm->arch.vpic = NULL; - goto out; - } - } else - goto out; - break; - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - if (irqchip_in_kernel(kvm)) { - mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); - mutex_unlock(&kvm->lock); - r = 0; - } - break; - } - case KVM_GET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; - - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; - r = 0; - break; - } - case KVM_SET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; - - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); - if (r) - goto out; - r = 0; - break; - } - case KVM_GET_SUPPORTED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - default: - ; - } -out: - return r; -} - -static void kvm_init_msr_list(void) -{ - u32 dummy[2]; - unsigned i, j; - - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; -} - -/* - * Only apic need an MMIO device hook, so shortcut now.. - */ -static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr) -{ - struct kvm_io_device *dev; - - if (vcpu->arch.apic) { - dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr)) - return dev; - } - return NULL; -} - - -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) -{ - struct kvm_io_device *dev; - - dev = vcpu_find_pervcpu_dev(vcpu, addr); - if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); - return dev; -} - -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) -{ - void *data = val; - - while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); - if (ret < 0) - return X86EMUL_UNHANDLEABLE; - - bytes -= tocopy; - data += tocopy; - addr += tocopy; - } - - return X86EMUL_CONTINUE; -} -EXPORT_SYMBOL_GPL(emulator_read_std); - -static int emulator_read_emulated(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) -{ - struct kvm_io_device *mmio_dev; - gpa_t gpa; - - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; - } - - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - -mmio: - /* - * Is this MMIO handled locally? - */ - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); - if (mmio_dev) { - kvm_iodevice_read(mmio_dev, gpa, bytes, val); - return X86EMUL_CONTINUE; - } - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; - - return X86EMUL_UNHANDLEABLE; -} - -static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) -{ - int ret; - - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) - return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); - return 1; -} - -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) -{ - struct kvm_io_device *mmio_dev; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); - return X86EMUL_PROPAGATE_FAULT; - } - - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (emulator_write_phys(vcpu, gpa, val, bytes)) - return X86EMUL_CONTINUE; - -mmio: - /* - * Is this MMIO handled locally? - */ - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); - if (mmio_dev) { - kvm_iodevice_write(mmio_dev, gpa, bytes, val); - return X86EMUL_CONTINUE; - } - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); - - return X86EMUL_CONTINUE; -} - -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) -{ - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); -} -EXPORT_SYMBOL_GPL(emulator_write_emulated); - -static int emulator_cmpxchg_emulated(unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct kvm_vcpu *vcpu) -{ - static int reported; - - if (!reported) { - reported = 1; - printk(KERN_WARNING "kvm: emulating exchange as write\n"); - } -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - struct page *page; - char *addr; - u64 val; - - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; - - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; - - val = *(u64 *)new; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - addr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(addr + offset_in_page(gpa)), val); - kunmap_atomic(addr, KM_USER0); - kvm_release_page_dirty(page); - } -emul_write: -#endif - - return emulator_write_emulated(addr, new, bytes, vcpu); -} - -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - return kvm_x86_ops->get_segment_base(vcpu, seg); -} - -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) -{ - return X86EMUL_CONTINUE; -} - -int emulate_clts(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); - return X86EMUL_CONTINUE; -} - -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) -{ - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); - return X86EMUL_UNHANDLEABLE; - } -} - -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) -{ - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; -} - -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - static int reported; - u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; - unsigned long rip_linear; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - if (reported) - return; - - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); - reported = 1; -} -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); - -struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, - .read_emulated = emulator_read_emulated, - .write_emulated = emulator_write_emulated, - .cmpxchg_emulated = emulator_cmpxchg_emulated, -}; - -int emulate_instruction(struct kvm_vcpu *vcpu, - struct kvm_run *run, - unsigned long cr2, - u16 error_code, - int no_decode) -{ - int r; - - vcpu->arch.mmio_fault_cr2 = cr2; - kvm_x86_ops->cache_regs(vcpu); - - vcpu->mmio_is_write = 0; - vcpu->arch.pio.string = 0; - - if (!no_decode) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.mode = - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { - vcpu->arch.emulate_ctxt.cs_base = 0; - vcpu->arch.emulate_ctxt.ds_base = 0; - vcpu->arch.emulate_ctxt.es_base = 0; - vcpu->arch.emulate_ctxt.ss_base = 0; - } else { - vcpu->arch.emulate_ctxt.cs_base = - get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.emulate_ctxt.ds_base = - get_segment_base(vcpu, VCPU_SREG_DS); - vcpu->arch.emulate_ctxt.es_base = - get_segment_base(vcpu, VCPU_SREG_ES); - vcpu->arch.emulate_ctxt.ss_base = - get_segment_base(vcpu, VCPU_SREG_SS); - } - - vcpu->arch.emulate_ctxt.gs_base = - get_segment_base(vcpu, VCPU_SREG_GS); - vcpu->arch.emulate_ctxt.fs_base = - get_segment_base(vcpu, VCPU_SREG_FS); - - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - ++vcpu->stat.insn_emulation; - if (r) { - ++vcpu->stat.insn_emulation_fail; - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; - return EMULATE_FAIL; - } - } - - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - - if (vcpu->arch.pio.string) - return EMULATE_DO_MMIO; - - if ((r || vcpu->mmio_is_write) && run) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; - if (!vcpu->mmio_needed) { - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } - return EMULATE_DO_MMIO; - } - - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(emulate_instruction); - -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } -} - -static int pio_copy_data(struct kvm_vcpu *vcpu) -{ - void *p = vcpu->arch.pio_data; - void *q; - unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; - - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, - PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu->arch.pio.guest_page_offset; - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - if (vcpu->arch.pio.in) - memcpy(q, p, bytes); - else - memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; -} - -int complete_pio(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - long delta; - int r; - - kvm_x86_ops->cache_regs(vcpu); - - if (!io->string) { - if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, - io->size); - } else { - if (io->in) { - r = pio_copy_data(vcpu); - if (r) { - kvm_x86_ops->cache_regs(vcpu); - return r; - } - } - - delta = 1; - if (io->rep) { - delta *= io->cur_count; - /* - * The size of the register should really depend on - * current address size. - */ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; - } - if (io->down) - delta = -delta; - delta *= io->size; - if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; - else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; - } - - kvm_x86_ops->decache_regs(vcpu); - - io->count -= io->cur_count; - io->cur_count = 0; - - return 0; -} - -static void kernel_pio(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu, - void *pd) -{ - /* TODO: String I/O for in kernel device */ - - mutex_lock(&vcpu->kvm->lock); - if (vcpu->arch.pio.in) - kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); - else - kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); - mutex_unlock(&vcpu->kvm->lock); -} - -static void pio_string_write(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - void *pd = vcpu->arch.pio_data; - int i; - - mutex_lock(&vcpu->kvm->lock); - for (i = 0; i < io->cur_count; i++) { - kvm_iodevice_write(pio_dev, io->port, - io->size, - pd); - pd += io->size; - } - mutex_unlock(&vcpu->kvm->lock); -} - -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) -{ - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); -} - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned port) -{ - struct kvm_io_device *pio_dev; - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; - vcpu->arch.pio.rep = 0; - - kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); - kvm_x86_ops->decache_regs(vcpu); - - kvm_x86_ops->skip_emulated_instruction(vcpu); - - pio_dev = vcpu_find_pio_dev(vcpu, port); - if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); - complete_pio(vcpu); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_emulate_pio); - -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port) -{ - unsigned now, in_page; - int i, ret = 0; - int nr_pages = 1; - struct page *page; - struct kvm_io_device *pio_dev; - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 1; - vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); - vcpu->arch.pio.rep = rep; - - if (!count) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return 1; - } - - if (!down) - in_page = PAGE_SIZE - offset_in_page(address); - else - in_page = offset_in_page(address) + size; - now = min(count, (unsigned long)in_page / size); - if (!now) { - /* - * String I/O straddles page boundary. Pin two guest pages - * so that we satisfy atomicity constraints. Do just one - * transaction to avoid complexity. - */ - nr_pages = 2; - now = 1; - } - if (down) { - /* - * String I/O in reverse. Yuck. Kill the guest, fix later. - */ - pr_unimpl(vcpu, "guest string pio down\n"); - kvm_inject_gp(vcpu, 0); - return 1; - } - vcpu->run->io.count = now; - vcpu->arch.pio.cur_count = now; - - if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) - kvm_x86_ops->skip_emulated_instruction(vcpu); - - for (i = 0; i < nr_pages; ++i) { - mutex_lock(&vcpu->kvm->lock); - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - mutex_unlock(&vcpu->kvm->lock); - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } - - pio_dev = vcpu_find_pio_dev(vcpu, port); - if (!vcpu->arch.pio.in) { - /* string PIO write */ - ret = pio_copy_data(vcpu); - if (ret >= 0 && pio_dev) { - pio_string_write(pio_dev, vcpu); - complete_pio(vcpu); - if (vcpu->arch.pio.count == 0) - ret = 1; - } - } else if (pio_dev) - pr_unimpl(vcpu, "no string pio read support yet, " - "port %x size %d count %ld\n", - port, size, count); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); - -int kvm_arch_init(void *opaque) -{ - int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; - - r = kvm_mmu_module_init(); - if (r) - goto out_fail; - - kvm_init_msr_list(); - - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - r = -EEXIST; - goto out; - } - - if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); - r = -EOPNOTSUPP; - goto out; - } - if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); - r = -EOPNOTSUPP; - goto out; - } - - kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); - return 0; - -out: - kvm_mmu_module_exit(); -out_fail: - return r; -} - -void kvm_arch_exit(void) -{ - kvm_x86_ops = NULL; - kvm_mmu_module_exit(); -} - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; - kvm_vcpu_block(vcpu); - if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) - return -EINTR; - return 1; - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return 0; - } -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - - kvm_x86_ops->cache_regs(vcpu); - - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; - - if (!is_long_mode(vcpu)) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } - - switch (nr) { - default: - ret = -KVM_ENOSYS; - break; - } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); - -int kvm_fix_hypercall(struct kvm_vcpu *vcpu) -{ - char instruction[3]; - int ret = 0; - - mutex_lock(&vcpu->kvm->lock); - - /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. - */ - kvm_mmu_zap_all(vcpu->kvm); - - kvm_x86_ops->cache_regs(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - - mutex_unlock(&vcpu->kvm->lock); - - return ret; -} - -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} - -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; - - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); -} - -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags) -{ - lmsw(vcpu, msw); - *rflags = kvm_x86_ops->get_rflags(vcpu); -} - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) -{ - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - switch (cr) { - case 0: - return vcpu->arch.cr0; - case 2: - return vcpu->arch.cr2; - case 3: - return vcpu->arch.cr3; - case 4: - return vcpu->arch.cr4; - case 8: - return get_cr8(vcpu); - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - return 0; - } -} - -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, - unsigned long *rflags) -{ - switch (cr) { - case 0: - set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); - *rflags = kvm_x86_ops->get_rflags(vcpu); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - set_cr3(vcpu, val); - break; - case 4: - set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); - break; - case 8: - set_cr8(vcpu, val & 0xfUL); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - } -} - -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; - - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; - best = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; - } - if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; - } - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->skip_emulated_instruction(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); - -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - return (!vcpu->arch.irq_summary && - kvm_run->request_interrupt_window && - vcpu->arch.interrupt_window_open && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); -} - -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary == 0); -} - -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - int r; - - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - } - -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - -again: - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - kvm_inject_pending_timer_irqs(vcpu); - - preempt_disable(); - - kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); - - local_irq_disable(); - - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (vcpu->arch.exception.pending) - __queue_exception(vcpu); - else if (irqchip_in_kernel(vcpu->kvm)) - kvm_x86_ops->inject_pending_irq(vcpu); - else - kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); - - vcpu->guest_mode = 1; - kvm_guest_enter(); - - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); - - kvm_x86_ops->run(vcpu, kvm_run); - - vcpu->guest_mode = 0; - local_irq_enable(); - - ++vcpu->stat.exits; - - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); - - preempt_enable(); - - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); - } - - if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) - vcpu->arch.exception.pending = false; - - r = kvm_x86_ops->handle_exit(kvm_run, vcpu); - - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) - goto again; - } - -out: - if (r > 0) { - kvm_resched(vcpu); - goto preempted; - } - - post_kvm_run_save(vcpu, kvm_run); - - return r; -} - -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - int r; - sigset_t sigsaved; - - vcpu_load(vcpu); - - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { - kvm_vcpu_block(vcpu); - vcpu_put(vcpu); - return -EAGAIN; - } - - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - - /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) - set_cr8(vcpu, kvm_run->cr8); - - if (vcpu->arch.pio.cur_count) { - r = complete_pio(vcpu); - if (r) - goto out; - } -#if CONFIG_HAS_IOMEM - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - r = emulate_instruction(vcpu, kvm_run, - vcpu->arch.mmio_fault_cr2, 0, 1); - if (r == EMULATE_DO_MMIO) { - /* - * Read-modify-write. Back to userspace. - */ - r = 0; - goto out; - } - } -#endif - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { - kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; - kvm_x86_ops->decache_regs(vcpu); - } - - r = __vcpu_run(vcpu, kvm_run); - -out: - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - vcpu_put(vcpu); - return r; -} - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_load(vcpu); - - kvm_x86_ops->cache_regs(vcpu); - - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; -#ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; -#endif - - regs->rip = vcpu->arch.rip; - regs->rflags = kvm_x86_ops->get_rflags(vcpu); - - /* - * Don't leak debug flags in case they were set for guest debugging - */ - if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) - regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - - vcpu_put(vcpu); - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_load(vcpu); - - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; -#ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; -#endif - - vcpu->arch.rip = regs->rip; - kvm_x86_ops->set_rflags(vcpu, regs->rflags); - - kvm_x86_ops->decache_regs(vcpu); - - vcpu_put(vcpu); - - return 0; -} - -static void get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - return kvm_x86_ops->get_segment(vcpu, var, seg); -} - -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - struct descriptor_table dt; - int pending_vec; - - vcpu_load(vcpu); - - get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.limit; - sregs->idt.base = dt.base; - kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.limit; - sregs->gdt.base = dt.base; - - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; - sregs->cr8 = get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; - sregs->apic_base = kvm_get_apic_base(vcpu); - - if (irqchip_in_kernel(vcpu->kvm)) { - memset(sregs->interrupt_bitmap, 0, - sizeof sregs->interrupt_bitmap); - pending_vec = kvm_x86_ops->get_irq(vcpu); - if (pending_vec >= 0) - set_bit(pending_vec, - (unsigned long *)sregs->interrupt_bitmap); - } else - memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, - sizeof sregs->interrupt_bitmap); - - vcpu_put(vcpu); - - return 0; -} - -static void set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - return kvm_x86_ops->set_segment(vcpu, var, seg); -} - -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - int mmu_reset_needed = 0; - int i, pending_vec, max_bits; - struct descriptor_table dt; - - vcpu_load(vcpu); - - dt.limit = sregs->idt.limit; - dt.base = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); - dt.limit = sregs->gdt.limit; - dt.base = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); - - vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; - - set_cr8(vcpu, sregs->cr8); - - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; -#ifdef CONFIG_X86_64 - kvm_x86_ops->set_efer(vcpu, sregs->efer); -#endif - kvm_set_apic_base(vcpu, sregs->apic_base); - - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; - vcpu->arch.cr0 = sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (!is_long_mode(vcpu) && is_pae(vcpu)) - load_pdptrs(vcpu, vcpu->arch.cr3); - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); - - if (!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->arch.irq_pending); - vcpu->arch.irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) - if (vcpu->arch.irq_pending[i]) - __set_bit(i, &vcpu->arch.irq_summary); - } else { - max_bits = (sizeof sregs->interrupt_bitmap) << 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, - max_bits); - /* Only pending external irq is handled here */ - if (pending_vec < max_bits) { - kvm_x86_ops->set_irq(vcpu, pending_vec); - pr_debug("Set back pending irq %d\n", - pending_vec); - } - } - - set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - vcpu_put(vcpu); - - return 0; -} - -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) -{ - int r; - - vcpu_load(vcpu); - - r = kvm_x86_ops->set_guest_debug(vcpu, dbg); - - vcpu_put(vcpu); - - return r; -} - -/* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - -/* - * Translate a guest virtual address to a guest physical address. - */ -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) -{ - unsigned long vaddr = tr->linear_address; - gpa_t gpa; - - vcpu_load(vcpu); - mutex_lock(&vcpu->kvm->lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; - tr->writeable = 1; - tr->usermode = 0; - mutex_unlock(&vcpu->kvm->lock); - vcpu_put(vcpu); - - return 0; -} - -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fpu->fpr, fxsave->st_space, 128); - fpu->fcw = fxsave->cwd; - fpu->fsw = fxsave->swd; - fpu->ftwx = fxsave->twd; - fpu->last_opcode = fxsave->fop; - fpu->last_ip = fxsave->rip; - fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - - vcpu_put(vcpu); - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fxsave->st_space, fpu->fpr, 128); - fxsave->cwd = fpu->fcw; - fxsave->swd = fpu->fsw; - fxsave->twd = fpu->ftwx; - fxsave->fop = fpu->last_opcode; - fxsave->rip = fpu->last_ip; - fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - - vcpu_put(vcpu); - - return 0; -} - -void fx_init(struct kvm_vcpu *vcpu) -{ - unsigned after_mxcsr_mask; - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - fx_save(&vcpu->arch.host_fx_image); - fpu_init(); - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); - - vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); -} -EXPORT_SYMBOL_GPL(fx_init); - -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 1; - fx_save(&vcpu->arch.host_fx_image); - fx_restore(&vcpu->arch.guest_fx_image); -} -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); - -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); - ++vcpu->stat.fpu_reload; -} -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); - -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->vcpu_free(vcpu); -} - -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) -{ - return kvm_x86_ops->vcpu_create(kvm, id); -} - -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - int r; - - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - - vcpu_load(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r == 0) - r = kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - - return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); - return r; -} - -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); - - kvm_x86_ops->vcpu_free(vcpu); -} - -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->vcpu_reset(vcpu); -} - -void kvm_arch_hardware_enable(void *garbage) -{ - kvm_x86_ops->hardware_enable(garbage); -} - -void kvm_arch_hardware_disable(void *garbage) -{ - kvm_x86_ops->hardware_disable(garbage); -} - -int kvm_arch_hardware_setup(void) -{ - return kvm_x86_ops->hardware_setup(); -} - -void kvm_arch_hardware_unsetup(void) -{ - kvm_x86_ops->hardware_unsetup(); -} - -void kvm_arch_check_processor_compat(void *rtn) -{ - kvm_x86_ops->check_processor_compatibility(rtn); -} - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct page *page; - struct kvm *kvm; - int r; - - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); - - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail_free_pio_data; - - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); - if (r < 0) - goto fail_mmu_destroy; - } - - return 0; - -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: - return r; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvm_free_lapic(vcpu); - kvm_mmu_destroy(vcpu); - free_page((unsigned long)vcpu->arch.pio_data); -} - -struct kvm *kvm_arch_create_vm(void) -{ - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - - if (!kvm) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - - return kvm; -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - - /* - * Unpin any mmu pages first. - */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } - -} - -void kvm_arch_destroy_vm(struct kvm *kvm) -{ - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); - kfree(kvm); -} - -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) -{ - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; - - /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. - */ - if (!user_alloc) { - if (npages && !old.rmap) { - down_write(¤t->mm->mmap_sem); - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); - up_write(¤t->mm->mmap_sem); - - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - } - } - - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - kvm_flush_remote_tlbs(kvm); - - return 0; -} - -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; -} diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h deleted file mode 100644 index dfb8091971a..00000000000 --- a/drivers/kvm/x86.h +++ /dev/null @@ -1,602 +0,0 @@ -#/* - * Kernel-based Virtual Machine driver for Linux - * - * This header defines architecture specific interfaces, x86 version - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#ifndef KVM_X86_H -#define KVM_X86_H - -#include <linux/types.h> -#include <linux/mm.h> - -#include <linux/kvm.h> -#include <linux/kvm_para.h> - -#include <asm/desc.h> - -#include "types.h" - -#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) -#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) -#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL) - -#define KVM_GUEST_CR0_MASK \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ - | X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ - | X86_CR0_MP) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define INVALID_PAGE (~(hpa_t)0) -#define UNMAPPED_GVA (~(gpa_t)0) - -#define DE_VECTOR 0 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 - -#define SELECTOR_TI_MASK (1 << 2) -#define SELECTOR_RPL_MASK 0x03 - -#define IOPL_SHIFT 12 - -#define KVM_ALIAS_SLOTS 4 - -#define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_NUM_MMU_PAGES 1024 -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 - -extern spinlock_t kvm_lock; -extern struct list_head vm_list; - -struct kvm_vcpu; -struct kvm; - -enum { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, -#ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, -#endif - NR_VCPU_REGS -}; - -enum { - VCPU_SREG_CS, - VCPU_SREG_DS, - VCPU_SREG_ES, - VCPU_SREG_FS, - VCPU_SREG_GS, - VCPU_SREG_SS, - VCPU_SREG_TR, - VCPU_SREG_LDTR, -}; - -#include "x86_emulate.h" - -#define KVM_NR_MEM_OBJS 40 - -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; - -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) - * bits 17:19 - common access permissions for all ptes in this shadow page - */ -union kvm_mmu_page_role { - unsigned word; - struct { - unsigned glevels : 4; - unsigned level : 4; - unsigned quadrant : 2; - unsigned pad_for_nice_hex_output : 6; - unsigned metaphysical : 1; - unsigned access : 3; - }; -}; - -struct kvm_mmu_page { - struct list_head link; - struct hlist_node hash_link; - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - gfn_t gfn; - union kvm_mmu_page_role role; - - u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - unsigned long slot_bitmap; /* One bit set per slot which has memory - * in this shadow page. - */ - int multimapped; /* More than one parent_pte? */ - int root_count; /* Currently serving as active root */ - union { - u64 *parent_pte; /* !multimapped */ - struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ - }; -}; - -/* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. - */ -struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); - void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); - hpa_t root_hpa; - int root_level; - int shadow_root_level; - - u64 *pae_root; -}; - -struct kvm_vcpu_arch { - u64 host_tsc; - int interrupt_window_open; - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ - DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); - unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ - unsigned long rip; /* needs vcpu_load_rsp_rip() */ - - unsigned long cr0; - unsigned long cr2; - unsigned long cr3; - unsigned long cr4; - unsigned long cr8; - u64 pdptrs[4]; /* pae */ - u64 shadow_efer; - u64 apic_base; - struct kvm_lapic *apic; /* kernel irqchip context */ -#define VCPU_MP_STATE_RUNNABLE 0 -#define VCPU_MP_STATE_UNINITIALIZED 1 -#define VCPU_MP_STATE_INIT_RECEIVED 2 -#define VCPU_MP_STATE_SIPI_RECEIVED 3 -#define VCPU_MP_STATE_HALTED 4 - int mp_state; - int sipi_vector; - u64 ia32_misc_enable_msr; - - struct kvm_mmu mmu; - - struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; - struct kvm_mmu_memory_cache mmu_page_header_cache; - - gfn_t last_pt_write_gfn; - int last_pt_write_count; - u64 *last_pte_updated; - - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; - - gva_t mmio_fault_cr2; - struct kvm_pio_request pio; - void *pio_data; - - struct kvm_queued_exception { - bool pending; - bool has_error_code; - u8 nr; - u32 error_code; - } exception; - - struct { - int active; - u8 save_iopl; - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } tr, es, ds, fs, gs; - } rmode; - int halt_request; /* real mode on Intel only */ - - int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; -}; - -struct kvm_mem_alias { - gfn_t base_gfn; - unsigned long npages; - gfn_t target_gfn; -}; - -struct kvm_arch{ - int naliases; - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; - /* - * Hash table of struct kvm_mmu_page. - */ - struct list_head active_mmu_pages; - struct kvm_pic *vpic; - struct kvm_ioapic *vioapic; - - int round_robin_prev_vcpu; - unsigned int tss_addr; - struct page *apic_access_page; -}; - -struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 remote_tlb_flush; -}; - -struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 halt_exits; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; -}; - -struct descriptor_table { - u16 limit; - unsigned long base; -} __attribute__((packed)); - -struct kvm_x86_ops { - int (*cpu_has_kvm_support)(void); /* __init */ - int (*disabled_by_bios)(void); /* __init */ - void (*hardware_enable)(void *dummy); /* __init */ - void (*hardware_disable)(void *dummy); - void (*check_processor_compatibility)(void *rtn); - int (*hardware_setup)(void); /* __init */ - void (*hardware_unsetup)(void); /* __exit */ - - /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); - void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); - - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); - void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*vcpu_decache)(struct kvm_vcpu *vcpu); - - int (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); - void (*guest_debug_pre)(struct kvm_vcpu *vcpu); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); - void (*get_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - void (*set_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); - void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); - void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception); - void (*cache_regs)(struct kvm_vcpu *vcpu); - void (*decache_regs)(struct kvm_vcpu *vcpu); - unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - - void (*tlb_flush)(struct kvm_vcpu *vcpu); - - void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); - int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); - void (*patch_hypercall)(struct kvm_vcpu *vcpu, - unsigned char *hypercall_addr); - int (*get_irq)(struct kvm_vcpu *vcpu); - void (*set_irq)(struct kvm_vcpu *vcpu, int vec); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code); - bool (*exception_injected)(struct kvm_vcpu *vcpu); - void (*inject_pending_irq)(struct kvm_vcpu *vcpu); - void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, - struct kvm_run *run); - - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); -}; - -extern struct kvm_x86_ops *kvm_x86_ops; - -int kvm_mmu_module_init(void); -void kvm_mmu_module_exit(void); - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu); -int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_zap_all(struct kvm *kvm); -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); - -enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ - EMULATE_FAIL, /* can't emulate this instruction */ -}; - -int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long cr2, u16 error_code, int no_decode); -void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags); - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, - unsigned long *rflags); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - -struct x86_emulate_ctxt; - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); -int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest); -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value); - -void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); -unsigned long get_cr8(struct kvm_vcpu *vcpu); -void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); - -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); - -void fx_init(struct kvm_vcpu *vcpu); - -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - -unsigned long segment_base(u16 selector); - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes); -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); -int kvm_mmu_load(struct kvm_vcpu *vcpu); -void kvm_mmu_unload(struct kvm_vcpu *vcpu); - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); - -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); - -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); -int complete_pio(struct kvm_vcpu *vcpu); - -static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - -static inline u16 read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 read_ldt(void) -{ - u16 ldt; - asm("sldt %0" : "=g"(ldt)); - return ldt; -} - -static inline void load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - -#ifndef load_ldt -static inline void load_ldt(u16 sel) -{ - asm("lldt %0" : : "rm"(sel)); -} -#endif - -static inline void get_idt(struct descriptor_table *table) -{ - asm("sidt %0" : "=m"(*table)); -} - -static inline void get_gdt(struct descriptor_table *table) -{ - asm("sgdt %0" : "=m"(*table)); -} - -static inline unsigned long read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} - -#ifdef CONFIG_X86_64 -static inline unsigned long read_msr(unsigned long msr) -{ - u64 value; - - rdmsrl(msr, value); - return value; -} -#endif - -static inline void fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} - -static inline void fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} - -static inline void fpu_init(void) -{ - asm("finit"); -} - -static inline u32 get_rdx_init_val(void) -{ - return 0x600; /* P6 family */ -} - -static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) -{ - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); -} - -#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" -#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" -#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" -#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" -#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" -#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" - -#define MSR_IA32_TIME_STAMP_COUNTER 0x010 - -#define TSS_IOPB_BASE_OFFSET 0x66 -#define TSS_BASE_SIZE 0x68 -#define TSS_IOPB_SIZE (65536 / 8) -#define TSS_REDIRECTION_SIZE (256 / 8) -#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) - -#endif diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c deleted file mode 100644 index 50b133f6874..00000000000 --- a/drivers/kvm/x86_emulate.c +++ /dev/null @@ -1,1913 +0,0 @@ -/****************************************************************************** - * x86_emulate.c - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privileged instructions: - * - * Copyright (C) 2006 Qumranet - * - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef __KERNEL__ -#include <stdio.h> -#include <stdint.h> -#include <public/xen.h> -#define DPRINTF(_f, _a ...) printf(_f , ## _a) -#else -#include "kvm.h" -#include "x86.h" -#define DPRINTF(x...) do {} while (0) -#endif -#include "x86_emulate.h" -#include <linux/module.h> - -/* - * Opcode effective-address decode tables. - * Note that we only emulate instructions that have at least one memory - * operand (excluding implicit stack references). We assume that stack - * references and instruction fetches will never occur in special memory - * areas that require emulation. So, for example, 'mov <imm>,<reg>' need - * not be handled. - */ - -/* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) -/* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) -/* Generic ModRM decode. */ -#define ModRM (1<<6) -/* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ - -static u16 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - 0, 0, ImplicitOps | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, - 0, 0, 0, 0, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps, ImplicitOps, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM -}; - -static u16 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* EFLAGS bit definitions. */ -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"b %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"w %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"l %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ - } while (0) - -/* Emulate an instruction with quadword operands (x86/64 only). */ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK)); \ - } while (0) - -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"q %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -/* Access/update address held in a register, based on addressing mode. */ -#define address_mask(reg) \ - ((c->ad_bytes == sizeof(unsigned long)) ? \ - (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) -#define register_address(base, reg) \ - ((base) + address_mask(reg)) -#define register_address_increment(reg, inc) \ - do { \ - /* signed type ensures sign extension to long */ \ - int _inc = (inc); \ - if (c->ad_bytes == sizeof(unsigned long)) \ - (reg) += _inc; \ - else \ - (reg) = ((reg) & \ - ~((1UL << (c->ad_bytes << 3)) - 1)) | \ - (((reg) + _inc) & \ - ((1UL << (c->ad_bytes << 3)) - 1)); \ - } while (0) - -#define JMP_REL(rel) \ - do { \ - register_address_increment(c->eip, rel); \ - } while (0) - -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) -{ - struct fetch_cache *fc = &ctxt->decode.fetch; - int rc; - int size; - - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) - return rc; - fc->start = linear; - fc->end = linear + size; - } - *dest = fc->data[linear - fc->start]; - return 0; -} - -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long eip, void *dest, unsigned size) -{ - int rc = 0; - - eip += ctxt->cs_base; - while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) - return rc; - } - return 0; -} - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. - */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = ®s[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *ptr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) - return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -static void decode_register_operand(struct operand *op, - struct decode_cache *c, - int inhibit_bytereg) -{ - unsigned reg = c->modrm_reg; - int highbyte_regs = c->rex_prefix == 0; - - if (!(c->d & ModRM)) - reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); - op->type = OP_REG; - if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; - op->bytes = 1; - } else { - op->ptr = decode_register(reg, c->regs, 0); - op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } - } - op->orig_val = op->val; -} - -static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - u8 sib; - int index_reg = 0, base_reg = 0, scale, rip_relative = 0; - int rc = 0; - - if (c->rex_prefix) { - c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ - c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ - } - - c->modrm = insn_fetch(u8, 1, c->eip); - c->modrm_mod |= (c->modrm & 0xc0) >> 6; - c->modrm_reg |= (c->modrm & 0x38) >> 3; - c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; - - if (c->modrm_mod == 3) { - c->modrm_val = *(unsigned long *) - decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - return rc; - } - - if (c->ad_bytes == 2) { - unsigned bx = c->regs[VCPU_REGS_RBX]; - unsigned bp = c->regs[VCPU_REGS_RBP]; - unsigned si = c->regs[VCPU_REGS_RSI]; - unsigned di = c->regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode. */ - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - } - switch (c->modrm_rm) { - case 0: - c->modrm_ea += bx + si; - break; - case 1: - c->modrm_ea += bx + di; - break; - case 2: - c->modrm_ea += bp + si; - break; - case 3: - c->modrm_ea += bp + di; - break; - case 4: - c->modrm_ea += si; - break; - case 5: - c->modrm_ea += di; - break; - case 6: - if (c->modrm_mod != 0) - c->modrm_ea += bp; - break; - case 7: - c->modrm_ea += bx; - break; - } - if (c->modrm_rm == 2 || c->modrm_rm == 3 || - (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->override_base) - c->override_base = &ctxt->ss_base; - c->modrm_ea = (u16)c->modrm_ea; - } else { - /* 32/64-bit ModR/M decode. */ - switch (c->modrm_rm) { - case 4: - case 12: - sib = insn_fetch(u8, 1, c->eip); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - switch (base_reg) { - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[base_reg]; - else - c->modrm_ea += - insn_fetch(s32, 4, c->eip); - break; - default: - c->modrm_ea += c->regs[base_reg]; - } - switch (index_reg) { - case 4: - break; - default: - c->modrm_ea += c->regs[index_reg] << scale; - } - break; - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[c->modrm_rm]; - else if (ctxt->mode == X86EMUL_MODE_PROT64) - rip_relative = 1; - break; - default: - c->modrm_ea += c->regs[c->modrm_rm]; - break; - } - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - } - } - if (rip_relative) { - c->modrm_ea += c->eip; - switch (c->d & SrcMask) { - case SrcImmByte: - c->modrm_ea += 1; - break; - case SrcImm: - if (c->d & ByteOp) - c->modrm_ea += 1; - else - if (c->op_bytes == 8) - c->modrm_ea += 4; - else - c->modrm_ea += c->op_bytes; - } - } -done: - return rc; -} - -static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->ad_bytes) { - case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); - break; - case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); - break; - case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); - break; - } -done: - return rc; -} - -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes; - - /* Shadow copy of register state. Committed on successful emulation. */ - - memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x2e: /* CS override */ - c->override_base = &ctxt->cs_base; - break; - case 0x3e: /* DS override */ - c->override_base = &ctxt->ds_base; - break; - case 0x26: /* ES override */ - c->override_base = &ctxt->es_base; - break; - case 0x64: /* FS override */ - c->override_base = &ctxt->fs_base; - break; - case 0x65: /* GS override */ - c->override_base = &ctxt->gs_base; - break; - case 0x36: /* SS override */ - c->override_base = &ctxt->ss_base; - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc) - goto done; - - if (!c->override_base) - c->override_base = &ctxt->ds_base; - if (mode == X86EMUL_MODE_PROT64 && - c->override_base != &ctxt->fs_base && - c->override_base != &ctxt->gs_base) - c->override_base = NULL; - - if (c->override_base) - c->modrm_ea += *c->override_base; - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - break; - } - c->src.type = OP_MEM; - break; - case SrcImm: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - break; - case SrcImmByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - c->src.val = insn_fetch(s8, 1, c->eip); - break; - } - - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.type = OP_REG; - break; - } - c->dst.type = OP_MEM; - break; - } - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]); -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = ops->read_std(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), - &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - - register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); - - return 0; -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - switch (c->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); - break; - case 1: /* ror */ - emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); - break; - case 2: /* rcl */ - emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); - break; - case 3: /* rcr */ - emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); - break; - case 5: /* shr */ - emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); - break; - case 7: /* sar */ - emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); - break; - } -} - -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->modrm_reg) { - case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. - */ - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 2: /* not */ - c->dst.val = ~c->dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", c->dst, ctxt->eflags); - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; - } -done: - return rc; -} - -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - switch (c->modrm_reg) { - case 0: /* inc */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 1: /* dec */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 4: /* jmp abs */ - if (c->b == 0xff) - c->eip = c->dst.val; - else { - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } - break; - case 6: /* push */ - - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - - if (ctxt->mode == X86EMUL_MODE_PROT64) { - c->dst.bytes = 8; - rc = ops->read_std((unsigned long)c->dst.ptr, - &c->dst.val, 8, ctxt->vcpu); - if (rc != 0) - return rc; - } - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->dst.bytes); - rc = ops->write_emulated(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - c->dst.type = OP_NONE; - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } - return 0; -} - -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) -{ - struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) - return rc; - - if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; - - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; - - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) - return rc; - ctxt->eflags |= EFLG_ZF; - } - return 0; -} - -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != 0) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return 0; -} - -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - unsigned long memop = 0; - u64 msr_data; - unsigned long saved_eip = 0; - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; - - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; - - if (c->rep_prefix && (c->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - } - c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; - } - - if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); - if (rc != 0) - goto done; - c->src.orig_val = c->src.val; - } - - if ((c->d & DstMask) == ImplicitOps) - goto special_insn; - - - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; - } - c->dst.orig_val = c->dst.val; - -special_insn: - - if (c->twobyte) - goto twobyte_insn; - - switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); - break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; - case 0x20 ... 0x23: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 0x50 ... 0x57: /* push reg */ - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->op_bytes); - c->dst.ptr = (void *) register_address( - ctxt->ss_base, c->regs[VCPU_REGS_RSP]); - break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - if ((rc = ops->read_std(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), c->dst.ptr, - c->op_bytes, ctxt->vcpu)) != 0) - goto done; - - register_address_increment(c->regs[VCPU_REGS_RSP], - c->op_bytes); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - c->dst.val = (s32) c->src.val; - break; - case 0x6a: /* push imm8 */ - c->src.val = 0L; - c->src.val = insn_fetch(s8, 1, c->eip); - emulate_push(ctxt); - break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(ctxt->es_base, - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c->override_base ? - *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c->eip); - - if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); - break; - } - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; - case 0x84 ... 0x85: - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 0x86 ... 0x87: /* xchg */ - /* Write back the register source. */ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - c->dst.val = c->src.val; - c->lock_prefix = 1; - break; - case 0x88 ... 0x8b: /* mov */ - goto mov; - case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_val; - break; - case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); - break; - case 0x9d: /* popf */ - c->dst.ptr = (unsigned long *) &ctxt->eflags; - goto pop_instruction; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; - case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) - goto done; - - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; - case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xc0 ... 0xc1: - emulate_grp2(ctxt); - break; - case 0xc3: /* ret */ - c->dst.ptr = &c->eip; - goto pop_instruction; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; - case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; - emulate_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); - break; - case 0xe8: /* call (near) */ { - long int rel; - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } - c->src.val = (unsigned long) c->eip; - JMP_REL(rel); - c->op_bytes = c->ad_bytes; - emulate_push(ctxt); - break; - } - case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ - JMP_REL(c->src.val); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; - goto done; - case 0xf5: /* cmc */ - /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfb: /* sti */ - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ - rc = emulate_grp45(ctxt, ops); - if (rc != 0) - goto done; - break; - } - -writeback: - rc = writeback(ctxt, ops); - if (rc != 0) - goto done; - - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; - -done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; - -twobyte_insn: - switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - - kvm_emulate_hypercall(ctxt->vcpu); - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, c->op_bytes); - if (rc) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3 && c->modrm_rm == 1) { - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - kvm_emulate_hypercall(ctxt->vcpu); - } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, - c->op_bytes); - if (rc) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - break; - case 4: /* smsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - *(u16 *)&c->regs[c->modrm_rm] - = realmode_get_cr(ctxt->vcpu, 0); - break; - case 6: /* lmsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, - &ctxt->eflags); - break; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); - break; - default: - goto cannot_emulate; - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 0x06: - emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 0x08: /* invd */ - case 0x09: /* wbinvd */ - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; - break; - case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)c->regs[VCPU_REGS_RAX] - | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x32: - /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; - } else { - c->regs[VCPU_REGS_RAX] = (u32)msr_data; - c->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x40 ... 0x4f: /* cmov */ - c->dst.val = c->dst.orig_val = c->src.val; - if (!test_cc(c->b, ctxt->eflags)) - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - case 8: - rel = insn_fetch(s64, 8, c->eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } - if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); - c->dst.type = OP_NONE; - break; - } - case 0xa3: - bt: /* bt */ - c->dst.type = OP_NONE; - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); - break; - case 0xab: - bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); - break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - c->src.orig_val = c->src.val; - c->src.val = c->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - c->dst.val = c->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); - break; - case 0xb6 ... 0xb7: /* movzx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (u8) c->src.val - : (u16) c->src.val; - break; - case 0xba: /* Grp8 */ - switch (c->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); - break; - case 0xbe ... 0xbf: /* movsx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : - (s16) c->src.val; - break; - case 0xc3: /* movnti */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : - (u64) c->src.val; - break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) - goto done; - c->dst.type = OP_NONE; - break; - } - goto writeback; - -cannot_emulate: - DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; - return -1; -} diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h deleted file mode 100644 index 7db91b9bdcd..00000000000 --- a/drivers/kvm/x86_emulate.h +++ /dev/null @@ -1,186 +0,0 @@ -/****************************************************************************** - * x86_emulate.h - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef __X86_EMULATE_H__ -#define __X86_EMULATE_H__ - -struct x86_emulate_ctxt; - -/* - * x86_emulate_ops: - * - * These operations represent the instruction emulator's interface to memory. - * There are two categories of operation: those that act on ordinary memory - * regions (*_std), and those that act on memory regions known to require - * special treatment or emulation (*_emulated). - * - * The emulator assumes that an instruction accesses only one 'emulated memory' - * location, that this location is the given linear faulting address (cr2), and - * that this is one of the instruction's data operands. Instruction fetches and - * stack operations are assumed never to access emulated memory. The emulator - * automatically deduces which operand of a string-move operation is accessing - * emulated memory, and assumes that the other operand accesses normal memory. - * - * NOTES: - * 1. The emulator isn't very smart about emulated vs. standard memory. - * 'Emulated memory' access addresses should be checked for sanity. - * 'Normal memory' accesses may fault, and the caller must arrange to - * detect and handle reentrancy into the emulator via recursive faults. - * Accesses may be unaligned and may cross page boundaries. - * 2. If the access fails (cannot emulate, or a standard access faults) then - * it is up to the memop to propagate the fault to the guest VM via - * some out-of-band mechanism, unknown to the emulator. The memop signals - * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will - * then immediately bail. - * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only - * cmpxchg8b_emulated need support 8-byte accesses. - * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. - */ -/* Access completed successfully: continue emulation as normal. */ -#define X86EMUL_CONTINUE 0 -/* Access is unhandleable: bail from emulation and return error to caller. */ -#define X86EMUL_UNHANDLEABLE 1 -/* Terminate emulation but return success to the caller. */ -#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ -struct x86_emulate_ops { - /* - * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. - * @addr: [IN ] Linear address from which to read. - * @val: [OUT] Value read from memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); - - /* - * read_emulated: Read bytes from emulated/special memory area. - * @addr: [IN ] Linear address from which to read. - * @val: [OUT] Value read from memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_emulated) (unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - /* - * write_emulated: Read bytes from emulated/special memory area. - * @addr: [IN ] Linear address to which to write. - * @val: [IN ] Value to write to memory (low-order bytes used as - * required). - * @bytes: [IN ] Number of bytes to write to memory. - */ - int (*write_emulated) (unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - /* - * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an - * emulated/special memory area. - * @addr: [IN ] Linear address to access. - * @old: [IN ] Value expected to be current at @addr. - * @new: [IN ] Value to write to @addr. - * @bytes: [IN ] Number of bytes to access using CMPXCHG. - */ - int (*cmpxchg_emulated) (unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct kvm_vcpu *vcpu); - -}; - -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; -}; - -struct fetch_cache { - u8 data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - u8 twobyte; - u8 b; - u8 lock_prefix; - u8 rep_prefix; - u8 op_bytes; - u8 ad_bytes; - u8 rex_prefix; - struct operand src; - struct operand dst; - unsigned long *override_base; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; - /* modrm */ - u8 modrm; - u8 modrm_mod; - u8 modrm_reg; - u8 modrm_rm; - u8 use_modrm_ea; - unsigned long modrm_ea; - unsigned long modrm_val; - struct fetch_cache fetch; -}; - -struct x86_emulate_ctxt { - /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - - /* Linear faulting address (if emulating a page-faulting instruction). */ - unsigned long eflags; - - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - - unsigned long cs_base; - unsigned long ds_base; - unsigned long es_base; - unsigned long ss_base; - unsigned long gs_base; - unsigned long fs_base; - - /* decode cache */ - - struct decode_cache decode; -}; - -/* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 - -/* Execution mode, passed to the emulator. */ -#define X86EMUL_MODE_REAL 0 /* Real mode. */ -#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ -#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ -#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ - -/* Host execution mode. */ -#if defined(__i386__) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 -#elif defined(CONFIG_X86_64) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 -#endif - -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); - -#endif /* __X86_EMULATE_H__ */ |