From 0ee8dcb87e403397e575674d0e79272b06dea12e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Mar 2011 15:41:59 +0800 Subject: [PATCH 001/131] KVM: cleanup memslot_id function We can get memslot id from memslot->id directly Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 6 +++++- virt/kvm/kvm_main.c | 17 ----------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ab428552af8e..57d7092d7717 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -365,7 +365,6 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); -int memslot_id(struct kvm *kvm, gfn_t gfn); void kvm_release_pfn_dirty(pfn_t); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); @@ -597,6 +596,11 @@ static inline void kvm_guest_exit(void) current->flags &= ~PF_VCPU; } +static inline int memslot_id(struct kvm *kvm, gfn_t gfn) +{ + return gfn_to_memslot(kvm, gfn)->id; +} + static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6330653480e4..58146457bf97 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -996,23 +996,6 @@ out: return size; } -int memslot_id(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *memslot = NULL; - - for (i = 0; i < slots->nmemslots; ++i) { - memslot = &slots->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) - break; - } - - return memslot - slots->memslots; -} - static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { From f6e78475894d6534d7d62714a95e2265f53d2a92 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 2 Aug 2010 15:30:20 +0300 Subject: [PATCH 002/131] KVM: Use kvm_get_rflags() and kvm_set_rflags() instead of the raw versions Some rflags bits are owned by the host, not guest, so we need to use kvm_get_rflags() to strip those bits away or kvm_set_rflags() to add them back. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 14 +++++++------- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6bb15d583e47..2a193222c987 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -975,7 +975,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; - save->rflags = 2; + kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -2127,7 +2127,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); nested_vmcb->save.cr2 = vmcb->save.cr2; nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; nested_vmcb->save.rax = vmcb->save.rax; @@ -2184,7 +2184,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.ds = hsave->save.ds; svm->vmcb->save.gdtr = hsave->save.gdtr; svm->vmcb->save.idtr = hsave->save.idtr; - svm->vmcb->save.rflags = hsave->save.rflags; + kvm_set_rflags(&svm->vcpu, hsave->save.rflags); svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); @@ -2312,7 +2312,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.efer = svm->vcpu.arch.efer; hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = vmcb->save.rflags; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; @@ -2323,7 +2323,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); - if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; @@ -2341,7 +2341,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.ds = nested_vmcb->save.ds; svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; svm->vmcb->save.idtr = nested_vmcb->save.idtr; - svm->vmcb->save.rflags = nested_vmcb->save.rflags; + kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); @@ -3384,7 +3384,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) return 0; - ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); if (is_guest_mode(vcpu)) return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b4cdcbd154c..d09833e45f6f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2113,7 +2113,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) if (!is_protmode(vcpu)) return 0; - if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + if (kvm_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ return 3; return vmcs_read16(GUEST_CS_SELECTOR) & 3; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 934b4c6b0bf9..3a557eefd2fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4310,7 +4310,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : @@ -4340,7 +4340,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4473,7 +4473,7 @@ restart: r = EMULATE_DONE; toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); @@ -5592,7 +5592,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } From 6de12732c42c7070af42e3d6e42ecee2838fc920 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 12:51:22 +0200 Subject: [PATCH 003/131] KVM: VMX: Optimize vmx_get_rflags() If called several times within the same exit, return cached results. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c8af0991fdf0..5af426464954 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -118,6 +118,7 @@ enum kvm_reg { enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, + VCPU_EXREG_RFLAGS, }; enum { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d09833e45f6f..4d117072acf8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -130,6 +130,7 @@ struct vcpu_vmx { u8 fail; u32 exit_intr_info; u32 idt_vectoring_info; + ulong rflags; struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; @@ -970,17 +971,23 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + to_vmx(vcpu)->rflags = rflags; } - return rflags; + return to_vmx(vcpu)->rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -4124,6 +4131,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ); vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; From f4c63e5d5a356b652a3d984edbefca289176c40f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 14:54:28 +0200 Subject: [PATCH 004/131] KVM: VMX: Optimize vmx_get_cpl() In long mode, vm86 mode is disallowed, so we need not check for it. Reading rflags.vm may require a VMREAD, so it is expensive. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4d117072acf8..e8a35ee00735 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2120,7 +2120,8 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) if (!is_protmode(vcpu)) return 0; - if (kvm_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + if (!is_long_mode(vcpu) + && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; return vmcs_read16(GUEST_CS_SELECTOR) & 3; From 69c730289011df706a1c9890d6e6c5ee822623c7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 15:26:44 +0200 Subject: [PATCH 005/131] KVM: VMX: Cache cpl We may read the cpl quite often in the same vmexit (instruction privilege check, memory access checks for instruction and operands), so we gain a bit if we cache the value. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5af426464954..35f81b110260 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -119,6 +119,7 @@ enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, + VCPU_EXREG_CPL, }; enum { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e8a35ee00735..8f9e77edc016 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -128,6 +128,7 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u8 cpl; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -987,6 +988,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; @@ -2005,6 +2007,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static u64 construct_eptp(unsigned long root_hpa) @@ -2115,7 +2118,7 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmcs_readl(sf->base); } -static int vmx_get_cpl(struct kvm_vcpu *vcpu) +static int __vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) return 0; @@ -2127,6 +2130,16 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) return vmcs_read16(GUEST_CS_SELECTOR) & 3; } +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + } + return to_vmx(vcpu)->cpl; +} + + static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -2192,6 +2205,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, ar); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4133,6 +4147,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; From 9d58b93192065f4b2ba6b880e9b0dab0bc11d0ba Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 16:52:07 +0200 Subject: [PATCH 006/131] KVM: VMX: Avoid vmx_recover_nmi_blocking() when unneeded When we haven't injected an interrupt, we don't need to recover the nmi blocking state (since the guest can't set it by itself). This allows us to avoid a VMREAD later on. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8f9e77edc016..53bf6ae493e3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -129,6 +129,7 @@ struct vcpu_vmx { int launched; u8 fail; u8 cpl; + bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -2959,6 +2960,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -2983,6 +2985,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; + if (to_vmx(vcpu)->nmi_known_unmasked) + return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } @@ -2996,6 +3000,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) vmx->vnmi_blocked_time = 0; } } else { + vmx->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -3527,9 +3532,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) + if (cpu_has_virtual_nmis()) { vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + vmx->nmi_known_unmasked = false; + } break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: @@ -3916,6 +3923,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { + if (vmx->nmi_known_unmasked) + return; unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -3932,6 +3941,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); @@ -3970,6 +3983,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, */ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + vmx->nmi_known_unmasked = true; break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = From f9902069c41254ad116e089e64ea21d3a000cc41 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 17:20:29 +0200 Subject: [PATCH 007/131] KVM: VMX: Qualify check for host NMI Check for the exit reason first; this allows us, later, to avoid a VMREAD for VM_EXIT_INTR_INFO_FIELD. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 53bf6ae493e3..89130ba3b698 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3905,7 +3905,8 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI && + (exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (exit_intr_info & INTR_INFO_VALID_MASK)) { kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); From 00eba012d53e63f620455f7013917e4bf59424f2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 17:24:54 +0200 Subject: [PATCH 008/131] KVM: VMX: Refactor vmx_complete_atomic_exit() Move the exit reason checks to the front of the function, for early exit in the common case. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 89130ba3b698..51aa827e3bbb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3896,17 +3896,20 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; + + if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + return; + + exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ - if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) - || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI - && is_machine_check(exit_intr_info))) + if (is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI && - (exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (exit_intr_info & INTR_INFO_VALID_MASK)) { kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); From c5ca8e572c4d8cb8dec1cf5b3fc9c7066f6b2c29 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 17:37:37 +0200 Subject: [PATCH 009/131] KVM: VMX: Don't VMREAD VM_EXIT_INTR_INFO unconditionally Only read it if we're going to use it later. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 51aa827e3bbb..f95f48b26006 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3902,6 +3902,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ @@ -3919,7 +3920,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; bool unblock_nmi; u8 vector; bool idtv_info_valid; @@ -3929,6 +3930,11 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) if (cpu_has_virtual_nmis()) { if (vmx->nmi_known_unmasked) return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -4176,7 +4182,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); From 887864758580c80710947c38a4692032163777df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Mar 2011 17:39:45 +0200 Subject: [PATCH 010/131] KVM: VMX: Use cached VM_EXIT_INTR_INFO in handle_exception vmx_complete_atomic_exit() cached it for us, so we can use it here. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f95f48b26006..1bdb49de6a22 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3118,7 +3118,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) enum emulation_result er; vect_info = vmx->idt_vectoring_info; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + intr_info = vmx->exit_intr_info; if (is_machine_check(intr_info)) return handle_machine_check(vcpu); From 89a9fb78b5bd8bece353449079726556ecab41df Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 24 Mar 2011 09:45:10 +0100 Subject: [PATCH 011/131] KVM: SVM: Remove unused svm_features We use boot_cpu_has now. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2a193222c987..cb43e98eff63 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -376,7 +376,6 @@ struct svm_cpu_data { }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -static uint32_t svm_features; struct svm_init_data { int cpu; @@ -802,8 +801,6 @@ static __init int svm_hardware_setup(void) goto err; } - svm_features = cpuid_edx(SVM_CPUID_FUNC); - if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; From 654f06fc651b01782015185e5b049197255463a3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Mar 2011 15:02:47 +0200 Subject: [PATCH 012/131] KVM: VMX: simplify NMI mask management Use vmx_set_nmi_mask() instead of open-coding management of the hardware bit and the software hint (nmi_known_unmasked). There's a slight change of behaviour when running without hardware virtual NMI support - we now clear the NMI mask if NMI delivery faulted in that case as well. This improves emulation accuracy. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1bdb49de6a22..2b99ae72481f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3532,11 +3532,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) { - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - vmx->nmi_known_unmasked = false; - } + vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: @@ -3991,9 +3987,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - vmx->nmi_known_unmasked = true; + vmx_set_nmi_mask(&vmx->vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = From 3291892450e670c4f170e271cd0c4b63d5a8e41a Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 23 Mar 2011 13:40:42 -0300 Subject: [PATCH 013/131] KVM: expose async pf through our standard mechanism As Avi recently mentioned, the new standard mechanism for exposing features is KVM_GET_SUPPORTED_CPUID, not spamming CAPs. For some reason async pf missed that. So expose async_pf here. Signed-off-by: Glauber Costa CC: Gleb Natapov CC: Avi Kivity Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a557eefd2fb..a38fb9bb342b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2418,6 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); entry->ebx = 0; entry->ecx = 0; From c761e5868e6737abe0464636ebd7fcbb6814c626 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 1 Apr 2011 11:25:03 -0300 Subject: [PATCH 014/131] Revert "KVM: Fix race between nmi injection and enabling nmi window" This reverts commit f86368493ec038218e8663cc1b6e5393cd8e008a. Simpler fix to follow. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 +--- include/linux/kvm_host.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a38fb9bb342b..b9402d5fa0e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -361,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_NMI, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -5208,8 +5208,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } - if (kvm_check_request(KVM_REQ_NMI, vcpu)) - vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 57d7092d7717..7ca831e55186 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -43,7 +43,6 @@ #define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_EVENT 11 #define KVM_REQ_APF_HALT 12 -#define KVM_REQ_NMI 13 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 From 1499e54af03ae51a937c59035bc86002deae0572 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Fri, 1 Apr 2011 11:26:29 -0300 Subject: [PATCH 015/131] KVM: x86: better fix for race between nmi injection and enabling nmi window Fix race between nmi injection and enabling nmi window in a simpler way. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9402d5fa0e9..692c70d6fd02 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5171,6 +5171,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; + bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5214,11 +5215,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + /* + * An NMI can be injected between local nmi_pending read and + * vcpu->arch.nmi_pending read inside inject_pending_event(). + * But in that case, KVM_REQ_EVENT will be set, which makes + * the race described above benign. + */ + nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + if (nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); From 70252a1053636c35776d6bc843dd3b260d9d6de1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 19 Jan 2010 12:51:22 +0200 Subject: [PATCH 016/131] KVM: extend in-kernel mmio to handle >8 byte transactions Needed for coalesced mmio using sse. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 58 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 692c70d6fd02..3b234c18b635 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3596,20 +3596,43 @@ static void kvm_init_msr_list(void) static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; - return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + handled += n; + addr += n; + len -= n; + v += n; + } while (len); + + return handled; } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; - return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + handled += n; + addr += n; + len -= n; + v += n; + } while (len); + + return handled; } static void kvm_set_segment(struct kvm_vcpu *vcpu, @@ -3769,6 +3792,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int handled; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3795,10 +3819,14 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); + handled = vcpu_mmio_read(vcpu, gpa, bytes, val); + + if (handled == bytes) return X86EMUL_CONTINUE; - } + + gpa += handled; + bytes -= handled; + val += handled; trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); @@ -3830,6 +3858,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int handled; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3848,9 +3877,14 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + if (handled == bytes) return X86EMUL_CONTINUE; + gpa += handled; + bytes -= handled; + val += handled; + vcpu->mmio_needed = 1; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; From 5287f194bf0d7062d6d99b725366202556f03e28 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 19 Jan 2010 14:20:10 +0200 Subject: [PATCH 017/131] KVM: Split mmio completion into a function Make room for sse mmio completions. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b234c18b635..bb6b9d3f5e93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5441,6 +5441,27 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int complete_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int r; + + if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) + return 1; + + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + } + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5467,20 +5488,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - if (vcpu->arch.pio.count || vcpu->mmio_needed) { - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) { - r = 0; - goto out; - } - } + r = complete_mmio(vcpu); + if (r <= 0) + goto out; + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); From cef4dea07f6720b36cc93e18a2e68be4bdb71a92 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Jan 2010 12:01:20 +0200 Subject: [PATCH 018/131] KVM: 16-byte mmio support Since sse instructions can issue 16-byte mmios, we need to support them. We can't increase the kvm_run mmio buffer size to 16 bytes without breaking compatibility, so instead we break the large mmios into two smaller 8-byte ones. Since the bus is 64-bit we aren't breaking any atomicity guarantees. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++--------- include/linux/kvm_host.h | 7 ++++++- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35f81b110260..e820c6339b8b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MMIO_SIZE 16 #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb6b9d3f5e93..11d692c7018d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3833,8 +3833,10 @@ mmio: vcpu->mmio_needed = 1; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + vcpu->mmio_index = 0; return X86EMUL_IO_NEEDED; } @@ -3886,11 +3888,14 @@ mmio: val += handled; vcpu->mmio_needed = 1; + memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->mmio_index = 0; return X86EMUL_CONTINUE; } @@ -4498,11 +4503,9 @@ restart: if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; r = EMULATE_DO_MMIO; - } else if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; + } else if (vcpu->mmio_needed) r = EMULATE_DO_MMIO; - } else if (r == EMULATION_RESTART) + else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE; @@ -5450,9 +5453,22 @@ static int complete_mmio(struct kvm_vcpu *vcpu) return 1; if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, run->mmio.data, 8); - vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + memcpy(vcpu->mmio_data, run->mmio.data, 8); + vcpu->mmio_index += 8; + if (vcpu->mmio_index < vcpu->mmio_size) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; + memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); + run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->mmio_needed = 1; + return 0; + } + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; } vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7ca831e55186..d1f507567068 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -27,6 +27,10 @@ #include +#ifndef KVM_MMIO_SIZE +#define KVM_MMIO_SIZE 8 +#endif + /* * vcpu->requests bit members */ @@ -132,7 +136,8 @@ struct kvm_vcpu { int mmio_read_completed; int mmio_is_write; int mmio_size; - unsigned char mmio_data[8]; + int mmio_index; + unsigned char mmio_data[KVM_MMIO_SIZE]; gpa_t mmio_phys_addr; #endif From 1d6b114f20d06ac0749686e4d7b7c7913d9116db Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Jan 2010 16:00:35 +0200 Subject: [PATCH 019/131] KVM: x86 emulator: do not munge rep prefix Currently we store a rep prefix as 1 or 2 depending on whether it is a REPE or REPNE. Since sse instructions depend on the prefix value, store it as the original opcode to simplify things further on. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 ++-- arch/x86/kvm/emulate.c | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0f5213564326..c00aed12755d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -249,8 +249,8 @@ struct x86_emulate_ctxt { }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 0xf3 +#define REPNE_PREFIX 0xf2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0ad47b819a8b..075bb6fc73ae 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2692,10 +2692,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; + c->rep_prefix = c->b; break; default: goto done_prefixes; From 5037f6f324cdcc6c9071dc774aba992f96c7e5ff Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 28 Mar 2011 16:53:59 +0200 Subject: [PATCH 020/131] KVM: x86 emulator: define callbacks for using the guest fpu within the emulator Needed for emulating fpu instructions. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/x86.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c00aed12755d..4c0e68226113 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -158,6 +158,8 @@ struct x86_emulate_ops { int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ + void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ }; /* Type, address-of, and value of an instruction's operand. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 11d692c7018d..5af66515337d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4281,6 +4281,22 @@ static void emulator_set_segment_selector(u16 sel, int seg, kvm_set_segment(vcpu, &kvm_seg, seg); } +static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_disable(); + kvm_load_guest_fpu(ctxt->vcpu); + /* + * CR0.TS may reference the host fpu state, not the guest fpu state, + * so it may be clear at this point. + */ + clts(); +} + +static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_enable(); +} + static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -4304,6 +4320,8 @@ static struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = kvm_set_msr, .get_msr = kvm_get_msr, + .get_fpu = emulator_get_fpu, + .put_fpu = emulator_put_fpu, }; static void cache_all_regs(struct kvm_vcpu *vcpu) From 0d7cdee83ad1582eecbf3b4a220e82dcb5ad561c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Mar 2011 11:34:38 +0200 Subject: [PATCH 021/131] KVM: x86 emulator: Specialize decoding for insns with 66/f2/f3 prefixes Most SIMD instructions use the 66/f2/f3 prefixes to distinguish between different variants of the same instruction. Usually the encoding is quite regular, but in some cases (including non-SIMD instructions) the prefixes generate very different instructions. Examples include XCHG/PAUSE, MOVQ/MOVDQA/MOVDQU, and MOVBE/CRC32. Allow the emulator to handle these special cases by splitting such opcodes into groups, with different decode flags and execution functions for different prefixes. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 075bb6fc73ae..fcce7aeacc84 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -75,6 +75,7 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define Prefix (1<<16) /* Instruction varies with 66/f2/f3 prefix */ /* Misc flags */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ @@ -106,6 +107,7 @@ struct opcode { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; struct group_dual *gdual; + struct gprefix *gprefix; } u; }; @@ -114,6 +116,13 @@ struct group_dual { struct opcode mod3[8]; }; +struct gprefix { + struct opcode pfx_no; + struct opcode pfx_66; + struct opcode pfx_f2; + struct opcode pfx_f3; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -2625,7 +2634,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; + int def_op_bytes, def_ad_bytes, dual, goffset, simd_prefix; + bool op_prefix = false; struct opcode opcode, *g_mod012, *g_mod3; struct operand memop = { .type = OP_NONE }; @@ -2662,6 +2672,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) for (;;) { switch (c->b = insn_fetch(u8, 1, c->eip)) { case 0x66: /* operand-size override */ + op_prefix = true; /* switch between 2/4 bytes */ c->op_bytes = def_op_bytes ^ 6; break; @@ -2742,6 +2753,19 @@ done_prefixes: c->d |= opcode.flags; } + if (c->d & Prefix) { + if (c->rep_prefix && op_prefix) + return X86EMUL_UNHANDLEABLE; + simd_prefix = op_prefix ? 0x66 : c->rep_prefix; + switch (simd_prefix) { + case 0x00: opcode = opcode.u.gprefix->pfx_no; break; + case 0x66: opcode = opcode.u.gprefix->pfx_66; break; + case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; + case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; + } + c->d |= opcode.flags; + } + c->execute = opcode.u.execute; /* Unrecognised? */ From 1253791df91b064c039282feea094e5943294924 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Mar 2011 11:41:27 +0200 Subject: [PATCH 022/131] KVM: x86 emulator: SSE support Add support for marking an instruction as SSE, switching registers used to the SSE register file. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 +- arch/x86/kvm/emulate.c | 102 ++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 4c0e68226113..48693f0d3842 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -162,9 +162,11 @@ struct x86_emulate_ops { void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ }; +typedef u32 __attribute__((vector_size(16))) sse128_t; + /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -176,11 +178,13 @@ struct operand { ulong ea; unsigned seg; } mem; + unsigned xmm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; + sse128_t vec_val; }; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fcce7aeacc84..4e11102f5603 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define Prefix (1<<16) /* Instruction varies with 66/f2/f3 prefix */ +#define Sse (1<<17) /* SSE Vector instruction */ /* Misc flags */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ @@ -505,6 +506,11 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, DE_VECTOR, 0, false); } +static int emulate_nm(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, NM_VECTOR, 0, false); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -632,7 +638,63 @@ static void fetch_register_operand(struct operand *op) } } -static void decode_register_operand(struct operand *op, +static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, + int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op, struct decode_cache *c, int inhibit_bytereg) { @@ -641,6 +703,15 @@ static void decode_register_operand(struct operand *op, if (!(c->d & ModRM)) reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = reg; + read_sse_reg(ctxt, &op->vec_val, reg); + return; + } + op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { op->addr.reg = decode_register(reg, c->regs, highbyte_regs); @@ -680,6 +751,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = c->modrm_rm; + read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); + return rc; + } fetch_register_operand(op); return rc; } @@ -1107,6 +1185,9 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; break; + case OP_XMM: + write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); + break; case OP_NONE: /* no writeback */ break; @@ -2785,6 +2866,9 @@ done_prefixes: c->op_bytes = 4; } + if (c->d & Sse) + c->op_bytes = 16; + /* ModRM and SIB bytes. */ if (c->d & ModRM) { rc = decode_modrm(ctxt, ops, &memop); @@ -2814,7 +2898,7 @@ done_prefixes: case SrcNone: break; case SrcReg: - decode_register_operand(&c->src, c, 0); + decode_register_operand(ctxt, &c->src, c, 0); break; case SrcMem16: memop.bytes = 2; @@ -2905,7 +2989,7 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { case DstReg: - decode_register_operand(&c->dst, c, + decode_register_operand(ctxt, &c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; case DstImmUByte: @@ -3001,6 +3085,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if ((c->d & Sse) + && ((ops->get_cr(0, ctxt->vcpu) & X86_CR0_EM) + || !(ops->get_cr(4, ctxt->vcpu) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); + goto done; + } + + if ((c->d & Sse) && (ops->get_cr(0, ctxt->vcpu) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { rc = emulate_gp(ctxt, 0); From aa97bb4891b1f1b35e7abef8d1e2bbd3dda07159 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Jan 2010 18:09:23 +0200 Subject: [PATCH 023/131] KVM: x86 emulator: implement movdqu instruction (f3 0f 6f, f3 0f 7f) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4e11102f5603..2b6c24e572d4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2415,11 +2415,19 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movdqu(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) @@ -2484,6 +2492,10 @@ static struct opcode group11[] = { I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), }; +static struct gprefix pfx_0f_6f_0f_7f = { + N, N, N, I(Sse, em_movdqu), +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D6ALU(Lock), @@ -2608,9 +2620,15 @@ static struct opcode twobyte_table[256] = { /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ X16(D(SrcImm)), /* 0x90 - 0x9F */ @@ -2654,6 +2672,7 @@ static struct opcode twobyte_table[256] = { #undef G #undef GD #undef I +#undef GP #undef D2bv #undef I2bv From c4f035c60dad45ff8813550dc82540dbbc263df2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 4 Apr 2011 12:39:22 +0200 Subject: [PATCH 024/131] KVM: x86 emulator: add framework for instruction intercepts When running in guest mode, certain instructions can be intercepted by hardware. This also holds for nested guests running on emulated virtualization hardware, in particular instructions emulated by kvm itself. This patch adds a framework for intercepting instructions. If an instruction is marked for interception, and if we're running in guest mode, a callback is called to check whether an intercept is needed or not. The callback is called at three points in time: immediately after beginning execution, after checking privilge exceptions, and after checking memory exception. This suits the different interception points defined for different instructions and for the various virtualization instruction sets. In addition, a new X86EMUL_INTERCEPT is defined, which any callback or memory access may define, allowing the more complicated intercepts to be implemented in existing callbacks. Signed-off-by: Avi Kivity Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 20 ++++++++++++++++++++ arch/x86/kvm/emulate.c | 26 ++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 3 files changed, 55 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 48693f0d3842..2cfea49d4706 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -14,6 +14,8 @@ #include struct x86_emulate_ctxt; +enum x86_intercept; +enum x86_intercept_stage; struct x86_exception { u8 vector; @@ -62,6 +64,7 @@ struct x86_exception { #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ +#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { /* @@ -160,6 +163,9 @@ struct x86_emulate_ops { int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ + int (*intercept)(struct x86_emulate_ctxt *ctxt, + enum x86_intercept intercept, + enum x86_intercept_stage stage); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -203,6 +209,7 @@ struct read_cache { struct decode_cache { u8 twobyte; u8 b; + u8 intercept; u8 lock_prefix; u8 rep_prefix; u8 op_bytes; @@ -244,6 +251,7 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool only_vendor_specific_insn; @@ -265,6 +273,18 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ +enum x86_intercept_stage { + X86_ICPT_PRE_EXCEPT, + X86_ICPT_POST_EXCEPT, + X86_ICPT_POST_MEMACCESS, +}; + +enum x86_intercept { + x86_intercept_none, + + nr_x86_intercepts +}; + /* Host execution mode. */ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2b6c24e572d4..a81486790ba8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -104,6 +104,7 @@ struct opcode { u32 flags; + u8 intercept; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; @@ -2423,10 +2424,13 @@ static int em_movdqu(struct x86_emulate_ctxt *ctxt) } #define D(_y) { .flags = (_y) } +#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define II(_f, _e, _i) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) @@ -2867,6 +2871,7 @@ done_prefixes: } c->execute = opcode.u.execute; + c->intercept = opcode.intercept; /* Unrecognised? */ if (c->d == 0 || (c->d & Undefined)) @@ -3116,12 +3121,26 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = ops->intercept(ctxt, c->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { rc = emulate_gp(ctxt, 0); goto done; } + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = ops->intercept(ctxt, c->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { @@ -3160,6 +3179,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = ops->intercept(ctxt, c->intercept, + X86_ICPT_POST_MEMACCESS); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->execute) { rc = c->execute(ctxt); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5af66515337d..36786bbb4c09 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4297,6 +4297,13 @@ static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) preempt_enable(); } +static int emulator_intercept(struct x86_emulate_ctxt *ctxt, + enum x86_intercept intercept, + enum x86_intercept_stage stage) +{ + return X86EMUL_CONTINUE; +} + static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -4322,6 +4329,7 @@ static struct x86_emulate_ops emulate_ops = { .get_msr = kvm_get_msr, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, + .intercept = emulator_intercept, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -4376,6 +4384,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); } From 3c6e276f22cf29188035535127c4c35aeeafcabc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 4 Apr 2011 12:39:23 +0200 Subject: [PATCH 025/131] KVM: x86 emulator: add SVM intercepts Add intercept codes for instructions defined by SVM as interceptable. Signed-off-by: Avi Kivity Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 35 ++++++++++++++++++++++++++++++ arch/x86/kvm/emulate.c | 24 ++++++++++---------- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 2cfea49d4706..470ac54ca38d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -281,6 +281,41 @@ enum x86_intercept_stage { enum x86_intercept { x86_intercept_none, + x86_intercept_lmsw, + x86_intercept_smsw, + x86_intercept_lidt, + x86_intercept_sidt, + x86_intercept_lgdt, + x86_intercept_sgdt, + x86_intercept_lldt, + x86_intercept_sldt, + x86_intercept_ltr, + x86_intercept_str, + x86_intercept_rdtsc, + x86_intercept_rdpmc, + x86_intercept_pushf, + x86_intercept_popf, + x86_intercept_cpuid, + x86_intercept_rsm, + x86_intercept_iret, + x86_intercept_intn, + x86_intercept_invd, + x86_intercept_pause, + x86_intercept_hlt, + x86_intercept_invlpg, + x86_intercept_invlpga, + x86_intercept_vmrun, + x86_intercept_vmload, + x86_intercept_vmsave, + x86_intercept_vmmcall, + x86_intercept_stgi, + x86_intercept_clgi, + x86_intercept_skinit, + x86_intercept_rdtscp, + x86_intercept_icebp, + x86_intercept_wbinvd, + x86_intercept_monitor, + x86_intercept_mwait, nr_x86_intercepts }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a81486790ba8..c2260e57450a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2469,15 +2469,15 @@ static struct opcode group5[] = { }; static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), - D(SrcMem | ModRM | ByteOp | Priv | NoAccess), + N, N, DI(ModRM | SrcMem | Priv, lgdt), DI(ModRM | SrcMem | Priv, lidt), + DI(SrcNone | ModRM | DstMem | Mov, smsw), N, + DI(SrcMem16 | ModRM | Mov | Priv, lmsw), + DI(SrcMem | ModRM | ByteOp | Priv | NoAccess, invlpg), }, { D(SrcNone | ModRM | Priv | VendorSpecific), N, N, D(SrcNone | ModRM | Priv | VendorSpecific), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, + DI(SrcNone | ModRM | DstMem | Mov, smsw), N, + DI(SrcMem16 | ModRM | Mov | Priv, lmsw), N, } }; static struct opcode group8[] = { @@ -2556,7 +2556,7 @@ static struct opcode opcode_table[256] = { /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + DI(ImplicitOps | Stack, pushf), DI(ImplicitOps | Stack, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), @@ -2579,7 +2579,8 @@ static struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + D(ImplicitOps), DI(SrcImmByte, intn), + D(ImplicitOps | No64), DI(ImplicitOps, iret), /* 0xD0 - 0xD7 */ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, @@ -2594,7 +2595,8 @@ static struct opcode opcode_table[256] = { D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), /* 0xF0 - 0xF7 */ N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + DI(ImplicitOps | Priv, hlt), D(ImplicitOps), + G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), @@ -2604,7 +2606,7 @@ static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ N, GD(0, &group7), N, N, N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, @@ -2614,7 +2616,7 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), + D(ImplicitOps | Priv), II(ImplicitOps, em_rdtsc, rdtsc), D(ImplicitOps | Priv), N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), N, N, From 775fde8648ebc588d07de39457aadc7c2131df2e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:24 +0200 Subject: [PATCH 026/131] KVM: x86 emulator: Don't write-back cpu-state on X86EMUL_INTERCEPTED This patch prevents the changed CPU state to be written back when the emulator detected that the instruction was intercepted by the guest. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 3 +++ arch/x86/kvm/x86.c | 3 +++ 3 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 470ac54ca38d..1dbd0c736cd1 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -331,6 +331,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 +#define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c2260e57450a..a2c31e527a99 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3592,6 +3592,9 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) ctxt->have_exception = true; + if (rc == X86EMUL_INTERCEPTED) + return EMULATION_INTERCEPTED; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 36786bbb4c09..99bed74779d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4516,6 +4516,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + if (r == EMULATION_INTERCEPTED) + return EMULATE_DONE; + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; From d09beabd7cd4cf70d982ff54656dc6431df80fa4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:25 +0200 Subject: [PATCH 027/131] KVM: x86 emulator: Add check_perm callback This patch adds a check_perm callback for each opcode into the instruction emulator. This will be used to do all necessary permission checks on instructions before checking whether they are intercepted or not. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1dbd0c736cd1..460c2d8964b7 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -222,6 +222,7 @@ struct decode_cache { u8 seg_override; unsigned int d; int (*execute)(struct x86_emulate_ctxt *ctxt); + int (*check_perm)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a2c31e527a99..4822824b608b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -111,6 +111,7 @@ struct opcode { struct group_dual *gdual; struct gprefix *gprefix; } u; + int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; struct group_dual { @@ -2425,12 +2426,17 @@ static int em_movdqu(struct x86_emulate_ctxt *ctxt) #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } +#define IIP(_f, _e, _i, _p) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) @@ -2873,6 +2879,7 @@ done_prefixes: } c->execute = opcode.u.execute; + c->check_perm = opcode.check_perm; c->intercept = opcode.intercept; /* Unrecognised? */ @@ -3136,6 +3143,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + /* Do instruction specific permission checks */ + if (c->check_perm) { + rc = c->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (unlikely(ctxt->guest_mode) && c->intercept) { rc = ops->intercept(ctxt, c->intercept, X86_ICPT_POST_EXCEPT); From 8ea7d6aef84e278fcb121acff1bd4c3edaa95b8b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:26 +0200 Subject: [PATCH 028/131] KVM: x86 emulator: Add flag to check for protected mode instructions This patch adds a flag for the opcoded to tag instruction which are only recognized in protected mode. The necessary check is added too. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 ++++ arch/x86/kvm/emulate.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 460c2d8964b7..cab841a034f9 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -274,6 +274,10 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ +/* any protected mode */ +#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ + X86EMUL_MODE_PROT64) + enum x86_intercept_stage { X86_ICPT_PRE_EXCEPT, X86_ICPT_POST_EXCEPT, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4822824b608b..3f32a6699fbd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -78,6 +78,7 @@ #define Prefix (1<<16) /* Instruction varies with 66/f2/f3 prefix */ #define Sse (1<<17) /* SSE Vector instruction */ /* Misc flags */ +#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ @@ -3143,6 +3144,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + /* Instruction can only be executed in protected mode */ + if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + rc = emulate_ud(ctxt); + goto done; + } + /* Do instruction specific permission checks */ if (c->check_perm) { rc = c->check_perm(ctxt); From 8a76d7f25f8f24fc5a328c8e15e4a7313cf141b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:27 +0200 Subject: [PATCH 029/131] KVM: x86: Add x86 callback for intercept check This patch adds a callback into kvm_x86_ops so that svm and vmx code can do intercept checks on emulated instructions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 22 ++++++++++++++++++-- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/emulate.c | 32 ++++++++++++++++++++++++------ arch/x86/kvm/svm.c | 9 +++++++++ arch/x86/kvm/vmx.c | 9 +++++++++ arch/x86/kvm/x86.c | 6 +++--- 6 files changed, 74 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index cab841a034f9..eb7033cefe8e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -25,6 +25,24 @@ struct x86_exception { u64 address; /* cr2 or nested page fault gpa */ }; +/* + * This struct is used to carry enough information from the instruction + * decoder to main KVM so that a decision can be made whether the + * instruction needs to be intercepted or not. + */ +struct x86_instruction_info { + u8 intercept; /* which intercept */ + u8 rep_prefix; /* rep prefix? */ + u8 modrm_mod; /* mod part of modrm */ + u8 modrm_reg; /* index of register used */ + u8 modrm_rm; /* rm part of modrm */ + u64 src_val; /* value of source operand */ + u8 src_bytes; /* size of source operand */ + u8 dst_bytes; /* size of destination operand */ + u8 ad_bytes; /* size of src/dst address */ + u64 next_rip; /* rip following the instruction */ +}; + /* * x86_emulate_ops: * @@ -163,8 +181,8 @@ struct x86_emulate_ops { int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ - int (*intercept)(struct x86_emulate_ctxt *ctxt, - enum x86_intercept intercept, + int (*intercept)(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, enum x86_intercept_stage stage); }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e820c6339b8b..038562c222e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -505,6 +505,8 @@ struct kvm_vcpu_stat { u32 nmi_injections; }; +struct x86_instruction_info; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -592,6 +594,11 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + + int (*check_intercept)(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); + const struct trace_print_flags *exit_reasons_str; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3f32a6699fbd..e3e96eada6f3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -408,6 +408,26 @@ struct gprefix { (_eip) += (_size); \ }) +static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, + enum x86_intercept intercept, + enum x86_intercept_stage stage) +{ + struct x86_instruction_info info = { + .intercept = intercept, + .rep_prefix = ctxt->decode.rep_prefix, + .modrm_mod = ctxt->decode.modrm_mod, + .modrm_reg = ctxt->decode.modrm_reg, + .modrm_rm = ctxt->decode.modrm_rm, + .src_val = ctxt->decode.src.val64, + .src_bytes = ctxt->decode.src.bytes, + .dst_bytes = ctxt->decode.dst.bytes, + .ad_bytes = ctxt->decode.ad_bytes, + .next_rip = ctxt->eip, + }; + + return ctxt->ops->intercept(ctxt->vcpu, &info, stage); +} + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -3132,8 +3152,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (unlikely(ctxt->guest_mode) && c->intercept) { - rc = ops->intercept(ctxt, c->intercept, - X86_ICPT_PRE_EXCEPT); + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) goto done; } @@ -3158,8 +3178,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (unlikely(ctxt->guest_mode) && c->intercept) { - rc = ops->intercept(ctxt, c->intercept, - X86_ICPT_POST_EXCEPT); + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) goto done; } @@ -3203,8 +3223,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: if (unlikely(ctxt->guest_mode) && c->intercept) { - rc = ops->intercept(ctxt, c->intercept, - X86_ICPT_POST_MEMACCESS); + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) goto done; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cb43e98eff63..798ebe695f1d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3868,6 +3868,13 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) update_cr0_intercept(svm); } +static int svm_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return X86EMUL_CONTINUE; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -3953,6 +3960,8 @@ static struct kvm_x86_ops svm_x86_ops = { .adjust_tsc_offset = svm_adjust_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, + + .check_intercept = svm_check_intercept, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2b99ae72481f..3dfefe3bcd05 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4409,6 +4409,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { } +static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return X86EMUL_CONTINUE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4494,6 +4501,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .adjust_tsc_offset = vmx_adjust_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, + + .check_intercept = vmx_check_intercept, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99bed74779d2..eebe5465c8ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4297,11 +4297,11 @@ static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) preempt_enable(); } -static int emulator_intercept(struct x86_emulate_ctxt *ctxt, - enum x86_intercept intercept, +static int emulator_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return X86EMUL_CONTINUE; + return kvm_x86_ops->check_intercept(vcpu, info, stage); } static struct x86_emulate_ops emulate_ops = { From cfec82cb7d313ae5b2c2dbb974401d7c214c7b09 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:28 +0200 Subject: [PATCH 030/131] KVM: SVM: Add intercept check for emulated cr accesses This patch adds all necessary intercept checks for instructions that access the crX registers. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 3 + arch/x86/include/asm/kvm_host.h | 15 +++++ arch/x86/kvm/emulate.c | 105 ++++++++++++++++++++++++++--- arch/x86/kvm/svm.c | 81 +++++++++++++++++++++- arch/x86/kvm/x86.c | 13 ---- 5 files changed, 192 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index eb7033cefe8e..2c0b5b47464f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -304,6 +304,9 @@ enum x86_intercept_stage { enum x86_intercept { x86_intercept_none, + x86_intercept_cr_read, + x86_intercept_cr_write, + x86_intercept_clts, x86_intercept_lmsw, x86_intercept_smsw, x86_intercept_lidt, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 038562c222e8..f7dfd6479d02 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,10 +35,25 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) + #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + + #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e3e96eada6f3..d2e77755efe8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2445,6 +2445,95 @@ static int em_movdqu(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static bool valid_cr(int nr) +{ + switch (nr) { + case 0: + case 2 ... 4: + case 8: + return true; + default: + return false; + } +} + +static int check_cr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + if (!valid_cr(c->modrm_reg)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_cr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int cr = c->modrm_reg; + + static u64 cr_reserved_bits[] = { + 0xffffffff00000000ULL, + 0, 0, 0, /* CR3 checked later */ + CR4_RESERVED_BITS, + 0, 0, 0, + CR8_RESERVED_BITS, + }; + + if (!valid_cr(cr)) + return emulate_ud(ctxt); + + if (new_val & cr_reserved_bits[cr]) + return emulate_gp(ctxt, 0); + + switch (cr) { + case 0: { + u64 cr4, efer; + if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || + ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) + return emulate_gp(ctxt, 0); + + cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + + if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && + !(cr4 & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + case 3: { + u64 rsvd = 0; + + if (is_long_mode(ctxt->vcpu)) + rsvd = CR3_L_MODE_RESERVED_BITS; + else if (is_pae(ctxt->vcpu)) + rsvd = CR3_PAE_RESERVED_BITS; + else if (is_paging(ctxt->vcpu)) + rsvd = CR3_NONPAE_RESERVED_BITS; + + if (new_val & rsvd) + return emulate_gp(ctxt, 0); + + break; + } + case 4: { + u64 cr4, efer; + + cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + + if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + } + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ @@ -2632,14 +2721,16 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ N, GD(0, &group7), N, N, - N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, + N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), + D(ModRM | DstMem | Priv | Op3264), + DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), + D(ModRM | SrcMem | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -3724,14 +3815,6 @@ twobyte_insn: case 0x18: /* Grp16 (prefetch/nop) */ break; case 0x20: /* mov cr, reg */ - switch (c->modrm_reg) { - case 1: - case 5 ... 7: - case 9 ... 15: - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); break; case 0x21: /* mov from dr to reg */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 798ebe695f1d..ff4ed3619d00 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3868,11 +3868,90 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) update_cr0_intercept(svm); } +#define POST_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_EXCEPT, \ + .valid = true } + +static struct __x86_intercept { + u32 exit_code; + enum x86_intercept_stage stage; + bool valid; +} x86_intercept_map[] = { + [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), +}; + +#undef POST_EX + static int svm_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return X86EMUL_CONTINUE; + struct vcpu_svm *svm = to_svm(vcpu); + int vmexit, ret = X86EMUL_CONTINUE; + struct __x86_intercept icpt_info; + struct vmcb *vmcb = svm->vmcb; + + if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) + goto out; + + icpt_info = x86_intercept_map[info->intercept]; + + if (!icpt_info.valid || stage != icpt_info.stage) + goto out; + + switch (icpt_info.exit_code) { + case SVM_EXIT_READ_CR0: + if (info->intercept == x86_intercept_cr_read) + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_WRITE_CR0: { + unsigned long cr0, val; + u64 intercept; + + if (info->intercept == x86_intercept_cr_write) + icpt_info.exit_code += info->modrm_reg; + + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + break; + + intercept = svm->nested.intercept; + + if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + break; + + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; + + if (info->intercept == x86_intercept_lmsw) { + cr0 &= 0xfUL; + val &= 0xfUL; + /* lmsw can't clear PE - catch this here */ + if (cr0 & X86_CR0_PE) + val |= X86_CR0_PE; + } + + if (cr0 ^ val) + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + + break; + } + default: + break; + } + + vmcb->control.next_rip = info->next_rip; + vmcb->control.exit_code = icpt_info.exit_code; + vmexit = nested_svm_exit_handled(svm); + + ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED + : X86EMUL_CONTINUE; + +out: + return ret; } static struct kvm_x86_ops svm_x86_ops = { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eebe5465c8ce..0d6524fa2aff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,19 +60,6 @@ #include #define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) - #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) From 3b88e41a41344846ee28007ebfe1bb0defa7f86a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:29 +0200 Subject: [PATCH 031/131] KVM: SVM: Add intercept check for accessing dr registers This patch adds the intercept checks for instruction accessing the debug registers. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 + arch/x86/kvm/emulate.c | 63 +++++++++++++++++++++++------- arch/x86/kvm/svm.c | 6 +++ 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 2c0b5b47464f..fdaf59ac1c07 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -309,6 +309,8 @@ enum x86_intercept { x86_intercept_clts, x86_intercept_lmsw, x86_intercept_smsw, + x86_intercept_dr_read, + x86_intercept_dr_write, x86_intercept_lidt, x86_intercept_sidt, x86_intercept_lgdt, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d2e77755efe8..cd9ed9f45275 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -509,6 +509,11 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, return X86EMUL_PROPAGATE_FAULT; } +static int emulate_db(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, DB_VECTOR, 0, false); +} + static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, GP_VECTOR, err, true); @@ -2534,6 +2539,47 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) +{ + unsigned long dr7; + + ctxt->ops->get_dr(7, &dr7, ctxt->vcpu); + + /* Check if DR7.Global_Enable is set */ + return dr7 & (1 << 13); +} + +static int check_dr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int dr = c->modrm_reg; + u64 cr4; + + if (dr > 7) + return emulate_ud(ctxt); + + cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) + return emulate_ud(ctxt); + + if (check_dr7_gd(ctxt)) + return emulate_db(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_dr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int dr = c->modrm_reg; + + if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) + return emulate_gp(ctxt, 0); + + return check_dr_read(ctxt); +} + #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ @@ -2728,9 +2774,9 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), - D(ModRM | DstMem | Priv | Op3264), + DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), - D(ModRM | SrcMem | Priv | Op3264), + DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -3818,12 +3864,6 @@ twobyte_insn: c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); break; case 0x21: /* mov from dr to reg */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); break; case 0x22: /* mov reg, cr */ @@ -3835,13 +3875,6 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - if (ops->set_dr(c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff4ed3619d00..381b038c0d01 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3882,6 +3882,8 @@ static struct __x86_intercept { [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), + [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), }; #undef POST_EX @@ -3939,6 +3941,10 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; } + case SVM_EXIT_READ_DR0: + case SVM_EXIT_WRITE_DR0: + icpt_info.exit_code += info->modrm_reg; + break; default: break; } From dee6bb70e4ac0588c98cc4e661664f0653117f89 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:30 +0200 Subject: [PATCH 032/131] KVM: SVM: Add intercept checks for descriptor table accesses This patch add intercept checks into the KVM instruction emulator to check for the 8 instructions that access the descriptor table addresses. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 14 ++++++++++++-- arch/x86/kvm/svm.c | 8 ++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cd9ed9f45275..3ac830a135fd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2630,8 +2630,18 @@ static struct opcode group5[] = { D(SrcMem | ModRM | Stack), N, }; +static struct opcode group6[] = { + DI(ModRM | Prot, sldt), + DI(ModRM | Prot, str), + DI(ModRM | Prot | Priv, lldt), + DI(ModRM | Prot | Priv, ltr), + N, N, N, N, +}; + static struct group_dual group7 = { { - N, N, DI(ModRM | SrcMem | Priv, lgdt), DI(ModRM | SrcMem | Priv, lidt), + DI(ModRM | Mov | DstMem | Priv, sgdt), + DI(ModRM | Mov | DstMem | Priv, sidt), + DI(ModRM | SrcMem | Priv, lgdt), DI(ModRM | SrcMem | Priv, lidt), DI(SrcNone | ModRM | DstMem | Mov, smsw), N, DI(SrcMem16 | ModRM | Mov | Priv, lmsw), DI(SrcMem | ModRM | ByteOp | Priv | NoAccess, invlpg), @@ -2766,7 +2776,7 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, + G(0, group6), GD(0, &group7), N, N, N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 381b038c0d01..ce251c907810 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3884,6 +3884,14 @@ static struct __x86_intercept { [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), + [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), + [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), + [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), + [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), + [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), + [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), + [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), + [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), }; #undef POST_EX From 01de8b09e6068936f7f5e386cb85637cf926468c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:31 +0200 Subject: [PATCH 033/131] KVM: SVM: Add intercept checks for SVM instructions This patch adds the necessary code changes in the instruction emulator and the extensions to svm.c to implement intercept checks for the svm instructions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 44 +++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/svm.c | 8 ++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3ac830a135fd..a3aba9552b39 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -77,6 +77,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define Prefix (1<<16) /* Instruction varies with 66/f2/f3 prefix */ #define Sse (1<<17) /* SSE Vector instruction */ +#define RMExt (1<<18) /* Opcode extension in ModRM r/m if mod == 3 */ /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ @@ -2580,11 +2581,35 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt) return check_dr_read(ctxt); } +static int check_svme(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + + ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + + if (!(efer & EFER_SVME)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_svme_pa(struct x86_emulate_ctxt *ctxt) +{ + u64 rax = kvm_register_read(ctxt->vcpu, VCPU_REGS_RAX); + + /* Valid physical address? */ + if (rax & 0xffff000000000000) + return emulate_gp(ctxt, 0); + + return check_svme(ctxt); +} + #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ .check_perm = (_p) } #define N D(0) +#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } @@ -2602,6 +2627,16 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt) D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ D2bv(((_f) & ~Lock) | DstAcc | SrcImm) +static struct opcode group7_rm3[] = { + DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), + DIP(SrcNone | ModRM | Prot , vmmcall, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), +}; static struct opcode group1[] = { X7(D(Lock)), N @@ -2647,7 +2682,7 @@ static struct group_dual group7 = { { DI(SrcMem | ModRM | ByteOp | Priv | NoAccess, invlpg), }, { D(SrcNone | ModRM | Priv | VendorSpecific), N, - N, D(SrcNone | ModRM | Priv | VendorSpecific), + N, EXT(0, group7_rm3), DI(SrcNone | ModRM | DstMem | Mov, smsw), N, DI(SrcMem16 | ModRM | Mov | Priv, lmsw), N, } }; @@ -2853,6 +2888,7 @@ static struct opcode twobyte_table[256] = { #undef GD #undef I #undef GP +#undef EXT #undef D2bv #undef I2bv @@ -3030,6 +3066,12 @@ done_prefixes: opcode = g_mod3[goffset]; else opcode = g_mod012[goffset]; + + if (opcode.flags & RMExt) { + goffset = c->modrm & 7; + opcode = opcode.u.group[goffset]; + } + c->d |= opcode.flags; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce251c907810..b98d00bfaf8a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3892,6 +3892,14 @@ static struct __x86_intercept { [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), + [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), + [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), + [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), + [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), + [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), + [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), + [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), + [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), }; #undef POST_EX From d7eb82030699e6151f1356e90d495bf292564fb7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:32 +0200 Subject: [PATCH 034/131] KVM: SVM: Add intercept checks for remaining group7 instructions This patch implements the emulator intercept checks for the RDTSCP, MONITOR, and MWAIT instructions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 25 +++++++++++++++++++++++-- arch/x86/kvm/svm.c | 7 +++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3aba9552b39..b4adb4cbb5f3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2604,6 +2604,16 @@ static int check_svme_pa(struct x86_emulate_ctxt *ctxt) return check_svme(ctxt); } +static int check_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + + if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt->vcpu)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ @@ -2627,6 +2637,12 @@ static int check_svme_pa(struct x86_emulate_ctxt *ctxt) D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ D2bv(((_f) & ~Lock) | DstAcc | SrcImm) +static struct opcode group7_rm1[] = { + DI(SrcNone | ModRM | Priv, monitor), + DI(SrcNone | ModRM | Priv, mwait), + N, N, N, N, N, N, +}; + static struct opcode group7_rm3[] = { DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), DIP(SrcNone | ModRM | Prot , vmmcall, check_svme), @@ -2638,6 +2654,11 @@ static struct opcode group7_rm3[] = { DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), }; +static struct opcode group7_rm7[] = { + N, + DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + N, N, N, N, N, N, +}; static struct opcode group1[] = { X7(D(Lock)), N }; @@ -2681,10 +2702,10 @@ static struct group_dual group7 = { { DI(SrcMem16 | ModRM | Mov | Priv, lmsw), DI(SrcMem | ModRM | ByteOp | Priv | NoAccess, invlpg), }, { - D(SrcNone | ModRM | Priv | VendorSpecific), N, + D(SrcNone | ModRM | Priv | VendorSpecific), EXT(0, group7_rm1), N, EXT(0, group7_rm3), DI(SrcNone | ModRM | DstMem | Mov, smsw), N, - DI(SrcMem16 | ModRM | Mov | Priv, lmsw), N, + DI(SrcMem16 | ModRM | Mov | Priv, lmsw), EXT(0, group7_rm7), } }; static struct opcode group8[] = { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b98d00bfaf8a..1eb5504ca6f5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3871,6 +3871,9 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) #define POST_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_POST_EXCEPT, \ .valid = true } +#define POST_MEM(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_MEMACCESS, \ + .valid = true } static struct __x86_intercept { u32 exit_code; @@ -3900,9 +3903,13 @@ static struct __x86_intercept { [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), + [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), + [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), + [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), }; #undef POST_EX +#undef POST_MEM static int svm_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, From 8061252ee0d21e1289235a4b7fe61f53010c46ff Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:33 +0200 Subject: [PATCH 035/131] KVM: SVM: Add intercept checks for remaining twobyte instructions This patch adds intercepts checks for the remaining twobyte instructions to the KVM instruction emulator. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/emulate.c | 25 ++++++++++++++++++------- arch/x86/kvm/svm.c | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index fdaf59ac1c07..f30650f00907 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -344,6 +344,8 @@ enum x86_intercept { x86_intercept_wbinvd, x86_intercept_monitor, x86_intercept_mwait, + x86_intercept_rdmsr, + x86_intercept_wrmsr, nr_x86_intercepts }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b4adb4cbb5f3..0bf1f68a71c2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2425,12 +2425,9 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { - unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); struct decode_cache *c = &ctxt->decode; u64 tsc = 0; - if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) - return emulate_gp(ctxt, 0); ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); c->regs[VCPU_REGS_RAX] = (u32)tsc; c->regs[VCPU_REGS_RDX] = tsc >> 32; @@ -2614,6 +2611,18 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_rdpmc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + u64 rcx = kvm_register_read(ctxt->vcpu, VCPU_REGS_RCX); + + if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt->vcpu)) || + (rcx > 3)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ @@ -2846,8 +2855,10 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), II(ImplicitOps, em_rdtsc, rdtsc), - D(ImplicitOps | Priv), N, + DI(ImplicitOps | Priv, wrmsr), + IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), + DI(ImplicitOps | Priv, rdmsr), + DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), N, N, N, N, N, N, N, N, N, N, @@ -2871,12 +2882,12 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), + DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1eb5504ca6f5..903628922440 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3868,6 +3868,9 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) update_cr0_intercept(svm); } +#define PRE_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_PRE_EXCEPT, \ + .valid = true } #define POST_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_POST_EXCEPT, \ .valid = true } @@ -3906,8 +3909,18 @@ static struct __x86_intercept { [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), + [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), + [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), + [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), + [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), + [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), + [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), + [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), }; +#undef PRE_EX #undef POST_EX #undef POST_MEM @@ -3968,6 +3981,12 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, case SVM_EXIT_WRITE_DR0: icpt_info.exit_code += info->modrm_reg; break; + case SVM_EXIT_MSR: + if (info->intercept == x86_intercept_wrmsr) + vmcb->control.exit_info_1 = 1; + else + vmcb->control.exit_info_1 = 0; + break; default: break; } From bf608f88faef1245ff87e731512517fc676ffe02 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:34 +0200 Subject: [PATCH 036/131] KVM: SVM: Add intercept checks for one-byte instructions This patch add intercept checks for emulated one-byte instructions to the KVM instruction emulation path. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/svm.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0bf1f68a71c2..cc32e72fe175 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2789,7 +2789,7 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - X8(D(SrcAcc | DstReg)), + DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, @@ -2831,7 +2831,7 @@ static struct opcode opcode_table[256] = { D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), /* 0xF0 - 0xF7 */ - N, N, N, N, + N, DI(ImplicitOps, icebp), N, N, DI(ImplicitOps | Priv, hlt), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 903628922440..9eb27100e2ea 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3918,6 +3918,13 @@ static struct __x86_intercept { [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), + [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), + [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), + [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), + [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), + [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), + [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), + [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), }; #undef PRE_EX @@ -3987,6 +3994,13 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, else vmcb->control.exit_info_1 = 0; break; + case SVM_EXIT_PAUSE: + /* + * We get this for NOP only, but pause + * is rep not, check this here + */ + if (info->rep_prefix != REPE_PREFIX) + goto out; default: break; } From f6511935f424b9a25059ae18e91ad11dd24980e6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:35 +0200 Subject: [PATCH 037/131] KVM: SVM: Add checks for IO instructions This patch adds code to check for IOIO intercepts on instructions decoded by the KVM instruction emulator. [avi: fix build error due to missing #define D2bvIP] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 +++ arch/x86/kvm/emulate.c | 45 ++++++++++++++++++++---------- arch/x86/kvm/svm.c | 36 ++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index f30650f00907..081844860a3d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -346,6 +346,10 @@ enum x86_intercept { x86_intercept_mwait, x86_intercept_rdmsr, x86_intercept_wrmsr, + x86_intercept_in, + x86_intercept_ins, + x86_intercept_out, + x86_intercept_outs, nr_x86_intercepts }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cc32e72fe175..d88dcfd66a8f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2623,6 +2623,28 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_perm_in(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_out(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ @@ -2640,6 +2662,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) +#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) #define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ @@ -2773,8 +2796,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bv(DstDI | Mov | String), /* insb, insw/insd */ - D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */ + D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -2825,11 +2848,13 @@ static struct opcode opcode_table[256] = { N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ X4(D(SrcImmByte)), - D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + D2bvIP(SrcNone | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out), /* 0xF0 - 0xF7 */ N, DI(ImplicitOps, icebp), N, N, DI(ImplicitOps | Priv, hlt), D(ImplicitOps), @@ -2923,6 +2948,7 @@ static struct opcode twobyte_table[256] = { #undef EXT #undef D2bv +#undef D2bvIP #undef I2bv #undef D6ALU @@ -3731,11 +3757,6 @@ special_insn: case 0xed: /* in (e/r)ax,dx */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_in: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, &c->dst.val)) goto done; /* IO is needed */ @@ -3744,12 +3765,6 @@ special_insn: case 0xef: /* out dx,(e/r)ax */ c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->dst.val, - c->src.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } ops->pio_out_emulated(c->src.bytes, c->dst.val, &c->src.val, 1, ctxt->vcpu); c->dst.type = OP_NONE; /* Disable writeback. */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9eb27100e2ea..5c6512dbac7c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3925,6 +3925,10 @@ static struct __x86_intercept { [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), + [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), }; #undef PRE_EX @@ -4001,6 +4005,38 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, */ if (info->rep_prefix != REPE_PREFIX) goto out; + case SVM_EXIT_IOIO: { + u64 exit_info; + u32 bytes; + + exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + exit_info |= SVM_IOIO_TYPE_MASK; + bytes = info->src_bytes; + } else { + bytes = info->dst_bytes; + } + + if (info->intercept == x86_intercept_outs || + info->intercept == x86_intercept_ins) + exit_info |= SVM_IOIO_STR_MASK; + + if (info->rep_prefix) + exit_info |= SVM_IOIO_REP_MASK; + + bytes = min(bytes, 4u); + + exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; + + exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); + + vmcb->control.exit_info_1 = exit_info; + vmcb->control.exit_info_2 = info->next_rip; + + break; + } default: break; } From 628afd2aeb286415f6e7d0ee7c87aae249e7f999 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 12:39:36 +0200 Subject: [PATCH 038/131] KVM: SVM: Remove nested sel_cr0_write handling code This patch removes all the old code which handled the nested selective cr0 write intercepts. This code was only in place as a work-around until the instruction emulator is capable of doing the same. This is the case with this patch-set and so the code can be removed. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 78 ++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 52 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5c6512dbac7c..779b09194f03 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -93,14 +93,6 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; - /* - * If we vmexit during an instruction emulation we need this to restore - * the l1 guest rip after the emulation - */ - unsigned long vmexit_rip; - unsigned long vmexit_rsp; - unsigned long vmexit_rax; - /* cache for intercepts of the guest */ u32 intercept_cr; u32 intercept_dr; @@ -1362,31 +1354,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu)) { - /* - * We are here because we run in nested mode, the host kvm - * intercepts cr0 writes but the l1 hypervisor does not. - * But the L1 hypervisor may intercept selective cr0 writes. - * This needs to be checked here. - */ - unsigned long old, new; - - /* Remove bits that would trigger a real cr0 write intercept */ - old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; - new = cr0 & SVM_CR0_SELECTIVE_MASK; - - if (old == new) { - /* cr0 write with ts and mp unchanged */ - svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { - svm->nested.vmexit_rip = kvm_rip_read(vcpu); - svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - return; - } - } - } - #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -2673,6 +2640,29 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) +{ + unsigned long cr0 = svm->vcpu.arch.cr0; + bool ret = false; + u64 intercept; + + intercept = svm->nested.intercept; + + if (!is_guest_mode(&svm->vcpu) || + (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + return false; + + cr0 &= ~SVM_CR0_SELECTIVE_MASK; + val &= ~SVM_CR0_SELECTIVE_MASK; + + if (cr0 ^ val) { + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); + } + + return ret; +} + #define CR_VALID (1ULL << 63) static int cr_interception(struct vcpu_svm *svm) @@ -2696,7 +2686,8 @@ static int cr_interception(struct vcpu_svm *svm) val = kvm_register_read(&svm->vcpu, reg); switch (cr) { case 0: - err = kvm_set_cr0(&svm->vcpu, val); + if (!check_selective_cr0_intercepted(svm, val)) + err = kvm_set_cr0(&svm->vcpu, val); break; case 3: err = kvm_set_cr3(&svm->vcpu, val); @@ -2741,23 +2732,6 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } -static int cr0_write_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - int r; - - r = cr_interception(svm); - - if (svm->nested.vmexit_rip) { - kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); - kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); - kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); - svm->nested.vmexit_rip = 0; - } - - return r; -} - static int dr_interception(struct vcpu_svm *svm) { int reg, dr; @@ -3045,7 +3019,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = cr_interception, [SVM_EXIT_READ_CR8] = cr_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = cr0_write_interception, + [SVM_EXIT_WRITE_CR0] = cr_interception, [SVM_EXIT_WRITE_CR3] = cr_interception, [SVM_EXIT_WRITE_CR4] = cr_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, From 0be839bfb4b12f17fba2d982353b0f674e327f9e Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Wed, 30 Mar 2011 09:54:47 -0700 Subject: [PATCH 039/131] KVM: Remove base_addresss in kvm_pit since it is unused The patch below removes unsigned long base_addresss; in i8254.h since it is unused. Signed-off-by: Justin P. Mattock Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 46d08ca0b48f..b681a9f78c5e 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -33,7 +33,6 @@ struct kvm_kpit_state { }; struct kvm_pit { - unsigned long base_addresss; struct kvm_io_device dev; struct kvm_io_device speaker_dev; struct kvm *kvm; From a0c0ab2feb9d696978a7475dce4253ec62e98a16 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 28 Mar 2011 16:57:49 +0200 Subject: [PATCH 040/131] KVM: x86 emulator: do not open code return values from the emulator Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d88dcfd66a8f..ae5f49105c3b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2287,7 +2287,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->eip = c->eip; } - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, @@ -3333,7 +3333,7 @@ done_prefixes: } done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) @@ -4163,5 +4163,5 @@ twobyte_insn: goto writeback; cannot_emulate: - return -1; + return EMULATION_FAILED; } From 7c5625227ff8c81953e953d8e25c3eba2ab0aeb3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 Mar 2011 10:29:27 +0800 Subject: [PATCH 041/131] KVM: MMU: remove mmu_seq verification on pte update path The mmu_seq verification can be removed since we get the pfn in the protection of mmu_lock. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 16 +++++----------- arch/x86/kvm/paging_tmpl.h | 4 +--- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f7dfd6479d02..ecdc562ea3e2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -274,7 +274,7 @@ struct kvm_mmu { struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte, unsigned long mmu_seq); + u64 *spte, const void *pte); hpa_t root_hpa; int root_level; int shadow_root_level; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 22fae7593ee7..28418054b880 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) static void nonpaging_update_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *pte, unsigned long mmu_seq) + const void *pte) { WARN_ON(1); } @@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, - const void *new, unsigned long mmu_seq) + struct kvm_mmu_page *sp, u64 *spte, + const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - unsigned long mmu_seq; u64 entry, gentry, *spte; unsigned pte_size, page_offset, misaligned, quadrant, offset; int level, npte, invlpg_counter, r, flooded = 0; @@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; @@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, - mmu_seq); + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; ++spte; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c6397795d865..74f8567d57ac 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -325,7 +325,7 @@ no_present: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte, unsigned long mmu_seq) + u64 *spte, const void *pte) { pt_element_t gpte; unsigned pte_access; @@ -342,8 +342,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, kvm_release_pfn_clean(pfn); return; } - if (mmu_notifier_retry(vcpu, mmu_seq)) - return; /* * we call mmu_set_spte() with host_writable = true because that From 09000adb86550d2895b64faa52e64eaec3cae7b2 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Fri, 25 Mar 2011 10:32:13 +0530 Subject: [PATCH 042/131] KVM: PPC: Fix issue clearing exit timing counters Following dump is observed on host when clearing the exit timing counters [root@p1021mds kvm]# echo -n 'c' > vm1200_vcpu0_timing INFO: task echo:1276 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. echo D 0ff5bf94 0 1276 1190 0x00000000 Call Trace: [c2157e40] [c0007908] __switch_to+0x9c/0xc4 [c2157e50] [c040293c] schedule+0x1b4/0x3bc [c2157e90] [c04032dc] __mutex_lock_slowpath+0x74/0xc0 [c2157ec0] [c00369e4] kvmppc_init_timing_stats+0x20/0xb8 [c2157ed0] [c0036b00] kvmppc_exit_timing_write+0x84/0x98 [c2157ef0] [c00b9f90] vfs_write+0xc0/0x16c [c2157f10] [c00ba284] sys_write+0x4c/0x90 [c2157f40] [c000e320] ret_from_syscall+0x0/0x3c The vcpu->mutex is used by kvm_ioctl_* (KVM_RUN etc) and same was used when clearing the stats (in kvmppc_init_timing_stats()). What happens is that when the guest is idle then it held the vcpu->mutx. While the exiting timing process waits for guest to release the vcpu->mutex and a hang state is reached. Now using seprate lock for exit timing stats. Signed-off-by: Bharat Bhushan Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/powerpc.c | 4 ++++ arch/powerpc/kvm/timing.c | 10 +++++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bba3b9b72a39..890897cee050 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -255,6 +255,7 @@ struct kvm_vcpu_arch { u32 dbsr; #ifdef CONFIG_KVM_EXIT_TIMING + struct mutex exit_timing_lock; struct kvmppc_exit_timing timing_exit; struct kvmppc_exit_timing timing_last_enter; u32 last_exit_type; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 99758460efde..ec3d2e75c0a8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -284,6 +284,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; +#ifdef CONFIG_KVM_EXIT_TIMING + mutex_init(&vcpu->arch.exit_timing_lock); +#endif + return 0; } diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index a021f5827a33..18f40fd3e98f 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -34,8 +34,8 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) { int i; - /* pause guest execution to avoid concurrent updates */ - mutex_lock(&vcpu->mutex); + /* Take a lock to avoid concurrent updates */ + mutex_lock(&vcpu->arch.exit_timing_lock); vcpu->arch.last_exit_type = 0xDEAD; for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { @@ -49,7 +49,7 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) vcpu->arch.timing_exit.tv64 = 0; vcpu->arch.timing_last_enter.tv64 = 0; - mutex_unlock(&vcpu->mutex); + mutex_unlock(&vcpu->arch.exit_timing_lock); } static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) @@ -65,6 +65,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) return; } + mutex_lock(&vcpu->arch.exit_timing_lock); + vcpu->arch.timing_count_type[type]++; /* sum */ @@ -93,6 +95,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) vcpu->arch.timing_min_duration[type] = duration; if (unlikely(duration > vcpu->arch.timing_max_duration[type])) vcpu->arch.timing_max_duration[type] = duration; + + mutex_unlock(&vcpu->arch.exit_timing_lock); } void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) From 8b18bc378224b4f195145b407b95768a289497e3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Apr 2011 16:21:58 +0300 Subject: [PATCH 043/131] KVM: x86 emulator: Re-add VendorSpecific tag to VMMCALL insn VMMCALL needs the VendorSpecific tag so that #UD emulation (called if a guest running on AMD was migrated to an Intel host) is allowed to process the instruction. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ae5f49105c3b..0e31b0c249e2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2677,7 +2677,7 @@ static struct opcode group7_rm1[] = { static struct opcode group7_rm3[] = { DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - DIP(SrcNone | ModRM | Prot , vmmcall, check_svme), + DIP(SrcNone | ModRM | Prot | VendorSpecific, vmmcall, check_svme), DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), From bfeed29d6d3ebd5f31253d2c067e4e6c4aeb376b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Apr 2011 16:25:20 +0300 Subject: [PATCH 044/131] KVM: x86 emulator: Drop EFER.SVME requirement from VMMCALL VMMCALL requires EFER.SVME to be enabled in the host, not in the guest, which is what check_svme() checks. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0e31b0c249e2..50bffb98ca82 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2677,7 +2677,7 @@ static struct opcode group7_rm1[] = { static struct opcode group7_rm3[] = { DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - DIP(SrcNone | ModRM | Prot | VendorSpecific, vmmcall, check_svme), + DI(SrcNone | ModRM | Prot | VendorSpecific, vmmcall), DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), From fbc0db76b77125e0a5131fb886cbaafa1ec5c525 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Mar 2011 09:44:46 +0100 Subject: [PATCH 045/131] KVM: SVM: Implement infrastructure for TSC_RATE_MSR This patch enhances the kvm_amd module with functions to support the TSC_RATE_MSR which can be used to set a given tsc frequency for the guest vcpu. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/svm.c | 54 +++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3cce71413d0b..485b4f1f079b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -118,6 +118,7 @@ complete list. */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 779b09194f03..830150099958 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -63,6 +63,8 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define TSC_RATIO_RSVD 0xffffff0000000000ULL + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -136,8 +138,13 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; u32 apf_reason; + + u64 tsc_ratio; }; +static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define TSC_RATIO_DEFAULT 0x0100000000ULL + #define MSR_INVALID 0xffffffffU static struct svm_direct_access_msrs { @@ -560,6 +567,10 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { + /* Make sure we clean up behind us */ + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + cpu_svm_disable(); } @@ -601,6 +612,11 @@ static int svm_hardware_enable(void *garbage) wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; + } + svm_init_erratum_383(); return 0; @@ -843,6 +859,32 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 __scale_tsc(u64 ratio, u64 tsc) +{ + u64 mult, frac, _tsc; + + mult = ratio >> 32; + frac = ratio & ((1ULL << 32) - 1); + + _tsc = tsc; + _tsc *= mult; + _tsc += (tsc >> 32) * frac; + _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; + + return _tsc; +} + +static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 _tsc = tsc; + + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + _tsc = __scale_tsc(svm->tsc_ratio, tsc); + + return _tsc; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1037,6 +1079,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->tsc_ratio = TSC_RATIO_DEFAULT; + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1130,6 +1174,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && + svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { + __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; + wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -2784,7 +2834,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_TSC: { struct vmcb *vmcb = get_host_vmcb(svm); - *data = vmcb->control.tsc_offset + native_read_tsc(); + *data = vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); + break; } case MSR_STAR: From 1e993611d0dc879fde25515dc9867d1cfd4c5137 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Mar 2011 09:44:47 +0100 Subject: [PATCH 046/131] KVM: X86: Let kvm-clock report the right tsc frequency This patch changes the kvm_guest_time_update function to use TSC frequency the guest actually has for updating its clock. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 +++--- arch/x86/kvm/x86.c | 25 +++++++++++++++---------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ecdc562ea3e2..e3aaa02ca032 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,7 +396,10 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u32 virtual_tsc_khz; bool tsc_catchup; + u32 tsc_catchup_mult; + s8 tsc_catchup_shift; bool nmi_pending; bool nmi_injected; @@ -466,9 +469,6 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; - u32 virtual_tsc_khz; - u32 virtual_tsc_mult; - s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d6524fa2aff..78d729174d99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -969,6 +969,14 @@ static inline int kvm_tsc_changes_freq(void) return ret; } +static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.virtual_tsc_khz) + return vcpu->arch.virtual_tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + static inline u64 nsec_to_cycles(u64 nsec) { u64 ret; @@ -982,20 +990,19 @@ static inline u64 nsec_to_cycles(u64 nsec) return ret; } -static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &kvm->arch.virtual_tsc_shift, - &kvm->arch.virtual_tsc_mult); - kvm->arch.virtual_tsc_khz = this_tsc_khz; + &vcpu->arch.tsc_catchup_shift, + &vcpu->arch.tsc_catchup_mult); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->kvm->arch.virtual_tsc_mult, - vcpu->kvm->arch.virtual_tsc_shift); + vcpu->arch.tsc_catchup_mult, + vcpu->arch.tsc_catchup_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1062,8 +1069,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - + this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -6060,8 +6066,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - if (!kvm->arch.virtual_tsc_khz) - kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + kvm_init_tsc_catchup(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) From 8f6055cbaf68cbd9ff2692a2cfa691b43629ccd4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Mar 2011 09:44:48 +0100 Subject: [PATCH 047/131] KVM: X86: Make tsc_delta calculation a function of guest tsc The calculation of the tsc_delta value to ensure a forward-going tsc for the guest is a function of the host-tsc. This works as long as the guests tsc_khz is equal to the hosts tsc_khz. With tsc-scaling hardware support this is not longer true and the tsc_delta needs to be calculated using guest_tsc values. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78d729174d99..fcce29b7b6fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2113,8 +2113,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { /* Make sure TSC doesn't go backwards */ - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + s64 tsc_delta; + u64 tsc; + + kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : + tsc - vcpu->arch.last_guest_tsc; + if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { From 4051b18801f5b47bb0369feefdc80e57819d0ddf Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Mar 2011 09:44:49 +0100 Subject: [PATCH 048/131] KVM: X86: Implement call-back to propagate virtual_tsc_khz This patch implements a call-back into the architecture code to allow the propagation of changes to the virtual tsc_khz of the vcpu. On SVM it updates the tsc_ratio variable, on VMX it does nothing. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx.c | 11 +++++++++++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e3aaa02ca032..f3a7116f802f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,6 +606,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 830150099958..a39fde4f5fe8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -885,6 +885,38 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 ratio; + u64 khz; + + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + return; + + /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ + if (user_tsc_khz == 0) { + vcpu->arch.virtual_tsc_khz = 0; + svm->tsc_ratio = TSC_RATIO_DEFAULT; + return; + } + + khz = user_tsc_khz; + + /* TSC scaling required - calculate ratio */ + ratio = khz << 32; + do_div(ratio, tsc_khz); + + if (ratio == 0 || ratio & TSC_RATIO_RSVD) { + WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return; + } + vcpu->arch.virtual_tsc_khz = user_tsc_khz; + svm->tsc_ratio = ratio; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4159,6 +4191,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .set_tsc_khz = svm_set_tsc_khz, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3dfefe3bcd05..e19c7a5473d5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1160,6 +1160,16 @@ static u64 guest_read_tsc(void) return host_tsc + tsc_offset; } +/* + * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ + * ioctl. In this case the call-back should update internal vmx state to make + * the changes effective. + */ +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + /* Nothing to do here */ +} + /* * writes 'offset' into guest's timestamp counter offset register */ @@ -4497,6 +4507,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .set_tsc_khz = vmx_set_tsc_khz, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, From 857e40999e35906baa367a79137019912cfb5434 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Mar 2011 09:44:50 +0100 Subject: [PATCH 049/131] KVM: X86: Delegate tsc-offset calculation to architecture code With TSC scaling in SVM the tsc-offset needs to be calculated differently. This patch propagates this calculation into the architecture specific modules so that this complexity can be handled there. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 10 ++++++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 10 +++++----- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f3a7116f802f..da0a8ce3a139 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -609,6 +609,8 @@ struct kvm_x86_ops { void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); int (*check_intercept)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a39fde4f5fe8..8c4549bef4ed 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -943,6 +943,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = svm_scale_tsc(vcpu, native_read_tsc()); + + return target_tsc - tsc; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -4194,6 +4203,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tsc_khz = svm_set_tsc_khz, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, + .compute_tsc_offset = svm_compute_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e19c7a5473d5..aabe3334d064 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1184,6 +1184,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) vmcs_write64(TSC_OFFSET, offset + adjustment); } +static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + return target_tsc - native_read_tsc(); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -4510,6 +4515,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tsc_khz = vmx_set_tsc_khz, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, + .compute_tsc_offset = vmx_compute_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fcce29b7b6fa..579ce34e7904 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -977,7 +977,7 @@ static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) return __this_cpu_read(cpu_tsc_khz); } -static inline u64 nsec_to_cycles(u64 nsec) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { u64 ret; @@ -985,7 +985,7 @@ static inline u64 nsec_to_cycles(u64 nsec) if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * __this_cpu_read(cpu_tsc_khz); + ret = nsec * vcpu_tsc_khz(vcpu); do_div(ret, USEC_PER_SEC); return ret; } @@ -1015,7 +1015,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) s64 sdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = data - native_read_tsc(); + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; sdiff = data - kvm->arch.last_tsc_write; @@ -1031,13 +1031,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * In that case, for a reliable TSC, we can match TSC offsets, * or make a best guest using elapsed value. */ - if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && elapsed < 5ULL * NSEC_PER_SEC) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { - u64 delta = nsec_to_cycles(elapsed); + u64 delta = nsec_to_cycles(vcpu, elapsed); offset += delta; pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } From 92a1f12d2598f429bd8639e21d89305e787115c5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Mar 2011 09:44:51 +0100 Subject: [PATCH 050/131] KVM: X86: Implement userspace interface to set virtual_tsc_khz This patch implements two new vm-ioctls to get and set the virtual_tsc_khz if the machine supports tsc-scaling. Setting the tsc-frequency is only possible before userspace creates any vcpu. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- Documentation/kvm/api.txt | 23 ++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/svm.c | 20 +++++++++++++++++++ arch/x86/kvm/x86.c | 35 +++++++++++++++++++++++++++++++++ include/linux/kvm.h | 5 +++++ 5 files changed, 90 insertions(+) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 9bef4e4cec50..1b9eaa7e8856 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -1263,6 +1263,29 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +4.54 KVM_SET_TSC_KHZ + +Capability: KVM_CAP_TSC_CONTROL +Architectures: x86 +Type: vcpu ioctl +Parameters: virtual tsc_khz +Returns: 0 on success, -1 on error + +Specifies the tsc frequency for the virtual machine. The unit of the +frequency is KHz. + +4.55 KVM_GET_TSC_KHZ + +Capability: KVM_CAP_GET_TSC_KHZ +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: virtual tsc-khz on success, negative value on error + +Returns the tsc frequency of the guest. The unit of the return value is +KHz. If the host has unstable tsc this ioctl returns -EIO instead as an +error. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da0a8ce3a139..bd57639fd5db 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -655,6 +655,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +/* control of guest tsc rate supported? */ +extern bool kvm_has_tsc_control; +/* minimum supported tsc_khz for guests */ +extern u32 kvm_min_guest_tsc_khz; +/* maximum supported tsc_khz for guests */ +extern u32 kvm_max_guest_tsc_khz; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8c4549bef4ed..a98873762433 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -64,6 +64,8 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) #define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL static bool erratum_383_found __read_mostly; @@ -189,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); +static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -798,6 +801,23 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); + if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 max; + + kvm_has_tsc_control = true; + + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines and a value of 0 is used to disable + * tsc-scaling for the vcpu. + */ + max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); + + kvm_max_guest_tsc_khz = max; + } + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 579ce34e7904..1d5a7f418795 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,6 +87,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +bool kvm_has_tsc_control; +EXPORT_SYMBOL_GPL(kvm_has_tsc_control); +u32 kvm_max_guest_tsc_khz; +EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1986,6 +1991,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_GET_TSC_KHZ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2012,6 +2018,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XCRS: r = cpu_has_xsave; break; + case KVM_CAP_TSC_CONTROL: + r = kvm_has_tsc_control; + break; default: r = 0; break; @@ -3045,6 +3054,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + if (!kvm_has_tsc_control) + break; + + user_tsc_khz = (u32)arg; + + if (user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + + r = 0; + goto out; + } + case KVM_GET_TSC_KHZ: { + r = -EIO; + if (check_tsc_unstable()) + goto out; + + r = vcpu_tsc_khz(vcpu); + + goto out; + } default: r = -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a2e13d..2f63ebeac639 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -541,6 +541,8 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_TSC_CONTROL 60 +#define KVM_CAP_GET_TSC_KHZ 61 #ifdef KVM_CAP_IRQ_ROUTING @@ -677,6 +679,9 @@ struct kvm_clock_data { #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) /* Available with KVM_CAP_PPC_GET_PVINFO */ #define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) +/* Available with KVM_CAP_TSC_CONTROL */ +#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) +#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) /* * ioctls for vcpu fds From e3e9ed3d2c443fd90a04fb7ff231ad53ef087417 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 12:30:03 +0200 Subject: [PATCH 051/131] KVM: SVM: Fix fault-rip on vmsave/vmload emulation When the emulation of vmload or vmsave fails because the guest passed an unsupported physical address it gets an #GP with rip pointing to the instruction after vmsave/vmload. This is a bug and fixed by this patch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a98873762433..a6bf2ad7429c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2518,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); @@ -2539,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); From 3ca3ac4dae5da5af375a9e80d2316ccfa7f0c6ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 Mar 2011 16:52:26 +0200 Subject: [PATCH 052/131] KVM: x86 emulator: Add helpers for memory access using segmented addresses Will help later adding proper segment checks. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 75 +++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 50bffb98ca82..8c38f6ca935b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -540,6 +540,15 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static int segmented_read_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + return ctxt->ops->read_std(linear(ctxt, addr), data, size, ctxt->vcpu, + &ctxt->exception); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -604,13 +613,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, size, 2); if (rc != X86EMUL_CONTINUE) return rc; addr.ea += 2; - rc = ops->read_std(linear(ctxt, addr), address, op_bytes, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, address, op_bytes); return rc; } @@ -950,6 +957,32 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } +static int segmented_read(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + return read_emulated(ctxt, ctxt->ops, linear(ctxt, addr), data, size); +} + +static int segmented_write(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *data, + unsigned size) +{ + return ctxt->ops->write_emulated(linear(ctxt, addr), data, size, + &ctxt->exception, ctxt->vcpu); +} + +static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *orig_data, const void *data, + unsigned size) +{ + return ctxt->ops->cmpxchg_emulated(linear(ctxt, addr), orig_data, data, + size, &ctxt->exception, ctxt->vcpu); +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -1197,20 +1230,16 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, break; case OP_MEM: if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_cmpxchg(ctxt, + c->dst.addr.mem, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes); else - rc = ops->write_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_write(ctxt, + c->dst.addr.mem, + &c->dst.val, + c->dst.bytes); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1249,7 +1278,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; - rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); + rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -3440,16 +3469,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), - c->src.valptr, c->src.bytes); + rc = segmented_read(ctxt, c->src.addr.mem, + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), - &c->src2.val, c->src2.bytes); + rc = segmented_read(ctxt, c->src2.addr.mem, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -3460,7 +3489,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), + rc = segmented_read(ctxt, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; From 38503911b32186240301bbe81601cfabb37e752e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 Mar 2011 18:48:09 +0200 Subject: [PATCH 053/131] KVM: x86 emulator: move invlpg emulation into a function It's going to get more complicated soon. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8c38f6ca935b..c522b4e3dbb0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2477,6 +2477,15 @@ static int em_movdqu(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_invlpg(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + emulate_invlpg(ctxt->vcpu, linear(ctxt, c->src.addr.mem)); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3966,10 +3975,7 @@ twobyte_insn: rc = X86EMUL_PROPAGATE_FAULT; goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, - linear(ctxt, c->src.addr.mem)); - /* Disable writeback. */ - c->dst.type = OP_NONE; + rc = em_invlpg(ctxt); break; default: goto cannot_emulate; From 9fa088f4d24f045d91c37a5e55f0d2be2ef387ad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 Mar 2011 18:54:30 +0200 Subject: [PATCH 054/131] KVM: x86 emulator: change address linearization to return an error code Preparing to add segment checks. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 47 +++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c522b4e3dbb0..b46fa374d0f1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -489,8 +489,9 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt, return c->seg_override; } -static ulong linear(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr) +static int linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + ulong *linear) { struct decode_cache *c = &ctxt->decode; ulong la; @@ -498,7 +499,8 @@ static ulong linear(struct x86_emulate_ctxt *ctxt, la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; if (c->ad_bytes != 8) la &= (u32)-1; - return la; + *linear = la; + return X86EMUL_CONTINUE; } static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, @@ -545,7 +547,13 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, void *data, unsigned size) { - return ctxt->ops->read_std(linear(ctxt, addr), data, size, ctxt->vcpu, + int rc; + ulong linear; + + rc = linearize(ctxt, addr, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->read_std(linear, data, size, ctxt->vcpu, &ctxt->exception); } @@ -962,7 +970,13 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt, void *data, unsigned size) { - return read_emulated(ctxt, ctxt->ops, linear(ctxt, addr), data, size); + int rc; + ulong linear; + + rc = linearize(ctxt, addr, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return read_emulated(ctxt, ctxt->ops, linear, data, size); } static int segmented_write(struct x86_emulate_ctxt *ctxt, @@ -970,7 +984,13 @@ static int segmented_write(struct x86_emulate_ctxt *ctxt, const void *data, unsigned size) { - return ctxt->ops->write_emulated(linear(ctxt, addr), data, size, + int rc; + ulong linear; + + rc = linearize(ctxt, addr, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_emulated(linear, data, size, &ctxt->exception, ctxt->vcpu); } @@ -979,7 +999,13 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, const void *orig_data, const void *data, unsigned size) { - return ctxt->ops->cmpxchg_emulated(linear(ctxt, addr), orig_data, data, + int rc; + ulong linear; + + rc = linearize(ctxt, addr, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->cmpxchg_emulated(linear, orig_data, data, size, &ctxt->exception, ctxt->vcpu); } @@ -2480,7 +2506,12 @@ static int em_movdqu(struct x86_emulate_ctxt *ctxt) static int em_invlpg(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - emulate_invlpg(ctxt->vcpu, linear(ctxt, c->src.addr.mem)); + int rc; + ulong linear; + + rc = linearize(ctxt, c->src.addr.mem, &linear); + if (rc == X86EMUL_CONTINUE) + emulate_invlpg(ctxt->vcpu, linear); /* Disable writeback. */ c->dst.type = OP_NONE; return X86EMUL_CONTINUE; From 83b8795a29c53a5f9f202933818128aa54c3e8d2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 3 Apr 2011 11:31:19 +0300 Subject: [PATCH 055/131] KVM: x86 emulator: pass access size and read/write intent to linearize() Needed for segment read/write checks. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b46fa374d0f1..a2d343c4c0c1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -491,6 +491,7 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt, static int linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, + unsigned size, bool write, ulong *linear) { struct decode_cache *c = &ctxt->decode; @@ -550,7 +551,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, int rc; ulong linear; - rc = linearize(ctxt, addr, &linear); + rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->read_std(linear, data, size, ctxt->vcpu, @@ -973,7 +974,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt, int rc; ulong linear; - rc = linearize(ctxt, addr, &linear); + rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; return read_emulated(ctxt, ctxt->ops, linear, data, size); @@ -987,7 +988,7 @@ static int segmented_write(struct x86_emulate_ctxt *ctxt, int rc; ulong linear; - rc = linearize(ctxt, addr, &linear); + rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->write_emulated(linear, data, size, @@ -1002,7 +1003,7 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, int rc; ulong linear; - rc = linearize(ctxt, addr, &linear); + rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->cmpxchg_emulated(linear, orig_data, data, @@ -2509,7 +2510,7 @@ static int em_invlpg(struct x86_emulate_ctxt *ctxt) int rc; ulong linear; - rc = linearize(ctxt, c->src.addr.mem, &linear); + rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); if (rc == X86EMUL_CONTINUE) emulate_invlpg(ctxt->vcpu, linear); /* Disable writeback. */ From 52fd8b445f5e8572526e3f84c753079470152414 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 3 Apr 2011 12:33:12 +0300 Subject: [PATCH 056/131] KVM: x86 emulator: move linearize() downwards So it can call emulate_gp() without forward declarations. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a2d343c4c0c1..601a9bca4b72 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -489,21 +489,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt, return c->seg_override; } -static int linearize(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - unsigned size, bool write, - ulong *linear) -{ - struct decode_cache *c = &ctxt->decode; - ulong la; - - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; - if (c->ad_bytes != 8) - la &= (u32)-1; - *linear = la; - return X86EMUL_CONTINUE; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -543,6 +528,21 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static int linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, + ulong *linear) +{ + struct decode_cache *c = &ctxt->decode; + ulong la; + + la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + if (c->ad_bytes != 8) + la &= (u32)-1; + *linear = la; + return X86EMUL_CONTINUE; +} + static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, From 56697687da592d0429c0c3ab80ee7e9d20a3b6e5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 3 Apr 2011 14:08:51 +0300 Subject: [PATCH 057/131] KVM: x86 emulator: move desc_limit_scaled() For reuse later. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 601a9bca4b72..793aff52a4b8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -464,6 +464,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel) register_address_increment(c, &c->eip, rel); } +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); + + return desc->g ? (limit << 12) | 0xfff : limit; +} + static void set_seg_override(struct decode_cache *c, int seg) { c->has_seg_override = true; @@ -1040,13 +1047,6 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } -static u32 desc_limit_scaled(struct desc_struct *desc) -{ - u32 limit = get_desc_limit(desc); - - return desc->g ? (limit << 12) | 0xfff : limit; -} - static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, struct desc_ptr *dt) From 618ff15de19109af126b33d90d7eaec27e61c691 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 3 Apr 2011 12:32:09 +0300 Subject: [PATCH 058/131] KVM: x86 emulator: implement segment permission checks Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 63 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 793aff52a4b8..2ec69bc85846 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -515,6 +515,11 @@ static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) return emulate_exception(ctxt, GP_VECTOR, err, true); } +static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) +{ + return emulate_exception(ctxt, SS_VECTOR, err, true); +} + static int emulate_ud(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, UD_VECTOR, 0, false); @@ -541,13 +546,71 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ulong *linear) { struct decode_cache *c = &ctxt->decode; + struct desc_struct desc; + bool usable; ulong la; + u32 lim; + unsigned cpl, rpl; la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + switch (ctxt->mode) { + case X86EMUL_MODE_REAL: + break; + case X86EMUL_MODE_PROT64: + if (((signed long)la << 16) >> 16 != la) + return emulate_gp(ctxt, 0); + break; + default: + usable = ctxt->ops->get_cached_descriptor(&desc, NULL, addr.seg, + ctxt->vcpu); + if (!usable) + goto bad; + /* code segment or read-only data segment */ + if (((desc.type & 8) || !(desc.type & 2)) && write) + goto bad; + /* unreadable code segment */ + if ((desc.type & 8) && !(desc.type & 2)) + goto bad; + lim = desc_limit_scaled(&desc); + if ((desc.type & 8) || !(desc.type & 4)) { + /* expand-up segment */ + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } else { + /* exapand-down segment */ + if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) + goto bad; + lim = desc.d ? 0xffffffff : 0xffff; + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } + cpl = ctxt->ops->cpl(ctxt->vcpu); + rpl = ctxt->ops->get_segment_selector(addr.seg, ctxt->vcpu) & 3; + cpl = max(cpl, rpl); + if (!(desc.type & 8)) { + /* data segment */ + if (cpl > desc.dpl) + goto bad; + } else if ((desc.type & 8) && !(desc.type & 4)) { + /* nonconforming code segment */ + if (cpl != desc.dpl) + goto bad; + } else if ((desc.type & 8) && (desc.type & 4)) { + /* conforming code segment */ + if (cpl < desc.dpl) + goto bad; + } + break; + } if (c->ad_bytes != 8) la &= (u32)-1; *linear = la; return X86EMUL_CONTINUE; +bad: + if (addr.seg == VCPU_SREG_SS) + return emulate_ss(ctxt, addr.seg); + else + return emulate_gp(ctxt, addr.seg); } static int segmented_read_std(struct x86_emulate_ctxt *ctxt, From 7ae441eac521b2006c9f03c4f2a23582c07fd76d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 31 Mar 2011 12:06:41 +0200 Subject: [PATCH 059/131] KVM: emulator: do not needlesly sync registers from emulator ctxt to vcpu Currently we sync registers back and forth before/after exiting to userspace for IO, but during IO device model shouldn't need to read/write the registers, so we can as well skip those sync points. The only exaception is broken vmware backdor interface. The new code sync registers content during IO only if registers are read from/written to by userspace in the middle of the IO operation and this almost never happens in practise. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++++++------- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd57639fd5db..3e03f37f43ea 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -385,6 +385,8 @@ struct kvm_vcpu_arch { /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; + bool emulate_regs_need_sync_to_vcpu; + bool emulate_regs_need_sync_from_vcpu; gpa_t time; struct pvclock_vcpu_time_info hv_clock; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d5a7f418795..3416a3473849 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4420,6 +4420,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) @@ -4502,6 +4503,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + bool writeback = true; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -4542,9 +4544,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - /* this is needed for vmware backdor interface to work since it + /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + } restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); @@ -4565,19 +4570,28 @@ restart: } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; + else + writeback = false; r = EMULATE_DO_MMIO; - } else if (vcpu->mmio_needed) + } else if (vcpu->mmio_needed) { + if (!vcpu->mmio_is_write) + writeback = false; r = EMULATE_DO_MMIO; - else if (r == EMULATION_RESTART) + } else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE; - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (writeback) { + toggle_interruptibility(vcpu, + vcpu->arch.emulate_ctxt.interruptibility); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + } else + vcpu->arch.emulate_regs_need_sync_to_vcpu = true; return r; } @@ -5587,6 +5601,18 @@ out: int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { + /* + * We are here if userspace calls get_regs() in the middle of + * instruction emulation. Registers state needs to be copied + * back from emulation context to vcpu. Usrapace shouldn't do + * that usually, but some bad designed PV devices (vmware + * backdoor interface) need this to work + */ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -5614,6 +5640,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); From be6d05cfdf1f2ddbdc367a6433d8eac49d6bfe6f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 13 Apr 2011 01:27:55 +0200 Subject: [PATCH 060/131] KVM: VMX: Ensure that vmx_create_vcpu always returns proper error In case certain allocations fail, vmx_create_vcpu may return 0 as error instead of a negative value encoded via ERR_PTR. This causes a NULL pointer dereferencing later on in kvm_vm_ioctl_vcpu_create. Reported-by: Sasha Levin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index aabe3334d064..af5206983154 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4251,8 +4251,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; if (!vmx->guest_msrs) { - err = -ENOMEM; goto uninit_vcpu; } @@ -4271,7 +4271,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vmcs; if (vm_need_virtualize_apic_accesses(kvm)) - if (alloc_apic_access_page(kvm) != 0) + err = alloc_apic_access_page(kvm); + if (err) goto free_vmcs; if (enable_ept) { From 575e7c1417d41dd72ddf2a49965f833ce9352e92 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 13 Apr 2011 00:24:55 +0900 Subject: [PATCH 061/131] KVM: x86 emulator: Disable writeback for CMP emulation This stops "CMP r/m, reg" to write back the data into memory. Pointed out by Avi. The writeback suppression now covers CMP, CMPS, SCAS. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2ec69bc85846..1e0e3f8156f8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3671,6 +3671,7 @@ special_insn: break; case 0x38 ... 0x3d: cmp: /* cmp */ + c->dst.type = OP_NONE; /* Disable writeback. */ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); break; case 0x40 ... 0x47: /* inc r16/r32 */ @@ -3797,7 +3798,6 @@ special_insn: rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); break; case 0xa6 ... 0xa7: /* cmps */ - c->dst.type = OP_NONE; /* Disable writeback. */ goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; From 4179bb02fd3e87183e5f698495dfcb80df187889 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 13 Apr 2011 00:29:09 +0900 Subject: [PATCH 062/131] KVM: x86 emulator: Make emulate_push() store the value directly PUSH emulation stores the value by calling writeback() after setting the dst operand appropriately in emulate_push(). This writeback() using dst is not needed at all because we know the target is the stack. So this patch makes emulate_push() call, newly introduced, segmented_write() directly. By this, many inlined writeback()'s are removed. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 87 +++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1e0e3f8156f8..4f4d9bc6178a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1345,17 +1345,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + struct segmented_address addr; - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); - c->dst.addr.mem.seg = VCPU_SREG_SS; + addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + addr.seg = VCPU_SREG_SS; + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1417,14 +1419,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { struct decode_cache *c = &ctxt->decode; c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - emulate_push(ctxt, ops); + return emulate_push(ctxt, ops); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1454,18 +1456,13 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt, (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); + rc = emulate_push(ctxt, ops); if (rc != X86EMUL_CONTINUE) return rc; ++reg; } - /* Disable writeback. */ - c->dst.type = OP_NONE; - return rc; } @@ -1503,27 +1500,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = emulate_push(ctxt, ops); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = emulate_push(ctxt, ops); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = c->eip; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = emulate_push(ctxt, ops); if (rc != X86EMUL_CONTINUE) return rc; - c->dst.type = OP_NONE; - ops->get_idt(&dt, ctxt->vcpu); eip_addr = dt.address + (irq << 2); @@ -1713,6 +1705,7 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; switch (c->modrm_reg) { case 0: /* inc */ @@ -1726,17 +1719,17 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt, ops); + rc = emulate_push(ctxt, ops); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - emulate_push(ctxt, ops); + rc = emulate_push(ctxt, ops); break; } - return X86EMUL_CONTINUE; + return rc; } static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, @@ -2380,7 +2373,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt, ops); + ret = emulate_push(ctxt, ops); } return ret; @@ -2400,11 +2393,8 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) { - rc = writeback(ctxt, ops); - if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->eip; - } + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } @@ -2422,8 +2412,7 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, static int em_push(struct x86_emulate_ctxt *ctxt) { - emulate_push(ctxt, ctxt->ops); - return X86EMUL_CONTINUE; + return emulate_push(ctxt, ctxt->ops); } static int em_das(struct x86_emulate_ctxt *ctxt) @@ -2483,20 +2472,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) memcpy(&c->eip, c->src.valptr, c->op_bytes); c->src.val = old_cs; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); + rc = emulate_push(ctxt, ctxt->ops); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = old_eip; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->dst.type = OP_NONE; - - return X86EMUL_CONTINUE; + return emulate_push(ctxt, ctxt->ops); } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) @@ -3625,7 +3606,7 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); @@ -3635,14 +3616,14 @@ special_insn: emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); @@ -3652,7 +3633,7 @@ special_insn: emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); @@ -3789,7 +3770,7 @@ special_insn: break; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt, ops); + rc = emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; @@ -3864,7 +3845,7 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt, ops); + rc = emulate_push(ctxt, ops); break; } case 0xe9: /* jmp rel */ @@ -4157,7 +4138,7 @@ twobyte_insn: c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -4174,7 +4155,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); From 4487b3b48d8fa3f6a5dd4155c9e34d5e998ad7fe Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 13 Apr 2011 00:31:23 +0900 Subject: [PATCH 063/131] KVM: x86 emulator: Use em_push() instead of emulate_push() em_push() is a simple wrapper of emulate_push(). So this patch replaces emulate_push() with em_push() and removes the unnecessary former. In addition, the unused ops arguments are removed from emulate_pusha() and emulate_grp45(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4f4d9bc6178a..cb2efa463793 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1345,8 +1345,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int emulate_push(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; struct segmented_address addr; @@ -1426,7 +1425,7 @@ static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - return emulate_push(ctxt, ops); + return em_push(ctxt); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1444,8 +1443,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_pusha(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1456,7 +1454,7 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt, (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -1500,19 +1498,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = c->eip; - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -1701,8 +1699,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_grp45(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1719,14 +1716,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } return rc; @@ -2373,7 +2370,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - ret = emulate_push(ctxt, ops); + ret = em_push(ctxt); } return ret; @@ -2410,11 +2407,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, op->addr.mem.seg = seg; } -static int em_push(struct x86_emulate_ctxt *ctxt) -{ - return emulate_push(ctxt, ctxt->ops); -} - static int em_das(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2472,12 +2464,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) memcpy(&c->eip, c->src.valptr, c->op_bytes); c->src.val = old_cs; - rc = emulate_push(ctxt, ctxt->ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = old_eip; - return emulate_push(ctxt, ctxt->ops); + return em_push(ctxt); } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) @@ -3666,7 +3658,7 @@ special_insn: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); break; case 0x60: /* pusha */ - rc = emulate_pusha(ctxt, ops); + rc = emulate_pusha(ctxt); break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); @@ -3770,7 +3762,7 @@ special_insn: break; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); break; case 0x9d: /* popf */ c->dst.type = OP_REG; @@ -3845,7 +3837,7 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - rc = emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 0xe9: /* jmp rel */ @@ -3923,7 +3915,7 @@ special_insn: break; case 0xfe: /* Grp4 */ grp45: - rc = emulate_grp45(ctxt, ops); + rc = emulate_grp45(ctxt); break; case 0xff: /* Grp5 */ if (c->modrm_reg == 5) From 71f9833bb1cba9939245f3e57388d87d69f8f399 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 13 Apr 2011 09:12:54 -0500 Subject: [PATCH 064/131] KVM: fix push of wrong eip when doing softint When doing a soft int, we need to bump eip before pushing it to the stack. Otherwise we'll do the int a second time. [apw@canonical.com: merged eip update as per Jan's recommendation.] Signed-off-by: Serge E. Hallyn Signed-off-by: Andy Whitcroft Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 12 +++++++++--- arch/x86/kvm/x86.c | 5 +++-- arch/x86/kvm/x86.h | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index af5206983154..3f6e9bff0160 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1063,7 +1063,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + int inc_eip = 0; + if (kvm_exception_is_soft(nr)) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2942,7 +2945,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + int inc_eip = 0; + if (vcpu->arch.interrupt.soft) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2977,7 +2983,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3416a3473849..b05e3fcce298 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4423,7 +4423,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int ret; @@ -4432,7 +4432,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.decode.op_bytes = 2; vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + + inc_eip; ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); if (ret != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c600da830ce0..e407ed3df817 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -77,7 +77,7 @@ static inline u32 bit(int bitno) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); From b42fc3cbc3d6e284463e93896679379443e19d56 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 12 Apr 2011 21:30:17 -0400 Subject: [PATCH 065/131] KVM: Fix off by one in kvm_for_each_vcpu iteration This patch avoids gcc issuing the following warning when KVM_MAX_VCPUS=1: warning: array subscript is above array bounds kvm_for_each_vcpu currently checks to see if the index for the vcpu is valid /after/ loading it. We don't run into problems because the address is still inside the enclosing struct kvm and we never deference or write to it, so this isn't a security issue. The warning occurs when KVM_MAX_VCPUS=1 because the increment portion of the loop will *always* cause the loop to load an invalid location since ++idx will always be > 0. This patch moves the load so that the check occurs before the load and we don't run into the compiler warning. Signed-off-by: Neil Brown Signed-off-by: Jeff Mahoney Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d1f507567068..0bc3d372e3cb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -296,9 +296,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) } #define kvm_for_each_vcpu(idx, vcpup, kvm) \ - for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ - idx < atomic_read(&kvm->online_vcpus) && vcpup; \ - vcpup = kvm_get_vcpu(kvm, ++idx)) + for (idx = 0; \ + idx < atomic_read(&kvm->online_vcpus) && \ + (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ + idx++) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); From 1e015968dfb9289934ad66a67fa38a93f0adf4f8 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Mon, 11 Apr 2011 12:56:01 +0800 Subject: [PATCH 066/131] KVM: remove useless function declarations from file arch/x86/kvm/irq.h Just remove useless function define kvm_pic_clear_isr_ack() and pit_has_pending_timer() Signed-off-by: Duan Jiong Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ba910d149410..53e2d084bffb 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { @@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); -int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); #endif From 49704f26586ca87fcab4fe9323fff8db41e78910 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Mon, 11 Apr 2011 12:44:06 +0800 Subject: [PATCH 067/131] KVM: remove useless function declaration kvm_inject_pit_timer_irqs() Just remove useless function define kvm_inject_pit_timer_irqs() from file arch/x86/kvm/i8254.h Signed-off-by:Duan Jiong Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index b681a9f78c5e..51a97426e791 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -50,7 +50,6 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); From 0521e4c0bc73aa86152ee4e4bd03724c8a9e1d6b Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Wed, 13 Apr 2011 11:44:13 -0400 Subject: [PATCH 068/131] KVM: x86 emulator: Handle wraparound in (cs_base + offset) when fetching insns Currently, setting a large (i.e. negative) base address for %cs does not work on a 64-bit host. The "JOS" teaching operating system, used by MIT and other universities, relies on such segments while bootstrapping its way to full virtual memory management. Signed-off-by: Nelson Elhage Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cb2efa463793..4c5ff22d101a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -637,9 +637,12 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, int size, cur_size; if (eip == fc->end) { + unsigned long linear = eip + ctxt->cs_base; + if (ctxt->mode != X86EMUL_MODE_PROT64) + linear &= (u32)-1; cur_size = fc->end - fc->start; size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); - rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, + rc = ops->fetch(linear, fc->data + cur_size, size, ctxt->vcpu, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; From 977b2d03e42e9ea9355d4baddb464810579719bd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 11:42:52 +0200 Subject: [PATCH 069/131] KVM: SVM: Fix nested sel_cr0 intercept path with decode-assists This patch fixes a bug in the nested-svm path when decode-assists is available on the machine. After a selective-cr0 intercept is detected the rip is advanced unconditionally. This causes the l1-guest to continue running with an l2-rip. This bug was with the sel_cr0 unit-test on decode-assists capable hardware. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a6bf2ad7429c..de4bba99160d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2799,6 +2799,9 @@ static int cr_interception(struct vcpu_svm *svm) case 0: if (!check_selective_cr0_intercepted(svm, val)) err = kvm_set_cr0(&svm->vcpu, val); + else + return 1; + break; case 3: err = kvm_set_cr3(&svm->vcpu, val); From 7c4c0f4fd5c3e82234c0ab61c7e7ffdb8f3af07b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 11:42:53 +0200 Subject: [PATCH 070/131] KVM: X86: Update last_guest_tsc in vcpu_put The last_guest_tsc is used in vcpu_load to adjust the tsc_offset since tsc-scaling is merged. So the last_guest_tsc needs to be updated in vcpu_put instead of the the last_host_tsc. This is fixed with this patch. Reported-by: Jan Kiszka Tested-by: Jan Kiszka Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e03f37f43ea..e50bffcf3cc0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -393,7 +393,6 @@ struct kvm_vcpu_arch { unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; - u64 last_host_tsc; u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b05e3fcce298..6aa137701cda 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2146,7 +2146,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); } static int is_efer_nx(void) From 3d9b938eefb7d91a1ae13e425931bd5ac103b762 Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Mon, 18 Apr 2011 12:05:53 -0400 Subject: [PATCH 071/131] KVM: emulator: Use linearize() when fetching instructions Since segments need to be handled slightly differently when fetching instructions, we add a __linearize helper that accepts a new 'fetch' boolean. [avi: fix oops caused by wrong segmented_address initialization order] Signed-off-by: Nelson Elhage Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/emulate.c | 26 ++++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 081844860a3d..9b760c8f2576 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -265,7 +265,6 @@ struct x86_emulate_ctxt { unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4c5ff22d101a..e1f77de95404 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -540,9 +540,9 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } -static int linearize(struct x86_emulate_ctxt *ctxt, +static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, - unsigned size, bool write, + unsigned size, bool write, bool fetch, ulong *linear) { struct decode_cache *c = &ctxt->decode; @@ -569,7 +569,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, if (((desc.type & 8) || !(desc.type & 2)) && write) goto bad; /* unreadable code segment */ - if ((desc.type & 8) && !(desc.type & 2)) + if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); if ((desc.type & 8) || !(desc.type & 4)) { @@ -602,7 +602,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (c->ad_bytes != 8) + if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) la &= (u32)-1; *linear = la; return X86EMUL_CONTINUE; @@ -613,6 +613,15 @@ bad: return emulate_gp(ctxt, addr.seg); } +static int linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, + ulong *linear) +{ + return __linearize(ctxt, addr, size, write, false, linear); +} + + static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, @@ -637,11 +646,13 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, int size, cur_size; if (eip == fc->end) { - unsigned long linear = eip + ctxt->cs_base; - if (ctxt->mode != X86EMUL_MODE_PROT64) - linear &= (u32)-1; + unsigned long linear; + struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; cur_size = fc->end - fc->start; size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; rc = ops->fetch(linear, fc->data + cur_size, size, ctxt->vcpu, &ctxt->exception); if (rc != X86EMUL_CONTINUE) @@ -3127,7 +3138,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->fetch.end = c->fetch.start + insn_len; if (insn_len > 0) memcpy(c->fetch.data, insn, insn_len); - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); switch (mode) { case X86EMUL_MODE_REAL: From 7295261cdd42e6d41666df38d1b613cdd9e95f46 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 13:12:27 +0300 Subject: [PATCH 072/131] KVM: x86 emulator: whitespace cleanups Clean up lines longer than 80 columns. No code changes. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 96 ++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e1f77de95404..4a5b61ff0ae9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -262,42 +262,42 @@ struct gprefix { "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ - break; \ - case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ - break; \ - } \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) #define __emulate_1op(_op, _dst, _eflags, _suffix) \ @@ -360,13 +360,25 @@ struct gprefix { } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ - do { \ - switch((_src).bytes) { \ - case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ - case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ - case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "b"); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "w"); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "l"); \ + break; \ + case 8: \ + ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "q")); \ + break; \ } \ } while (0) @@ -402,7 +414,7 @@ struct gprefix { (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ +#define insn_fetch_arr(_arr, _size, _eip) \ ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ From 0f65dd70a442ff498da10cec0a599fbd9d2d6f9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 13:37:53 +0300 Subject: [PATCH 073/131] KVM: x86 emulator: drop vcpu argument from memory read/write callbacks Making the emulator caller agnostic. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 34 +++++++++---------- arch/x86/kvm/emulate.c | 54 ++++++++++++++---------------- arch/x86/kvm/x86.c | 54 +++++++++++++++++++----------- 3 files changed, 75 insertions(+), 67 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 9b760c8f2576..b4d846708a4b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -92,8 +92,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*read_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, + unsigned int bytes, struct x86_exception *fault); /* @@ -103,8 +104,8 @@ struct x86_emulate_ops { * @val: [OUT] Value write to memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*write_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * fetch: Read bytes of standard (non-emulated/special) memory. @@ -113,8 +114,8 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*fetch)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*fetch)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* @@ -123,11 +124,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_emulated)(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + int (*read_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, + struct x86_exception *fault); /* * write_emulated: Write bytes to emulated/special memory area. @@ -136,11 +135,10 @@ struct x86_emulate_ops { * required). * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_emulated)(unsigned long addr, - const void *val, + int (*write_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + struct x86_exception *fault); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -150,12 +148,12 @@ struct x86_emulate_ops { * @new: [IN ] Value to write to @addr. * @bytes: [IN ] Number of bytes to access using CMPXCHG. */ - int (*cmpxchg_emulated)(unsigned long addr, + int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + struct x86_exception *fault); int (*pio_in_emulated)(int size, unsigned short port, void *val, unsigned int count, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4a5b61ff0ae9..ff64b17df772 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -645,8 +645,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->read_std(linear, data, size, ctxt->vcpu, - &ctxt->exception); + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, @@ -665,8 +664,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, rc = __linearize(ctxt, addr, size, false, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->fetch(linear, fc->data + cur_size, - size, ctxt->vcpu, &ctxt->exception); + rc = ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; fc->end += size; @@ -1047,8 +1046,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, - &ctxt->exception, ctxt->vcpu); + rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, + &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1087,8 +1086,8 @@ static int segmented_write(struct x86_emulate_ctxt *ctxt, rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->write_emulated(linear, data, size, - &ctxt->exception, ctxt->vcpu); + return ctxt->ops->write_emulated(ctxt, linear, data, size, + &ctxt->exception); } static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, @@ -1102,8 +1101,8 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->cmpxchg_emulated(linear, orig_data, data, - size, &ctxt->exception, ctxt->vcpu); + return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, + size, &ctxt->exception); } static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -1168,8 +1167,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -1190,8 +1188,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -1545,11 +1542,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; - rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; @@ -2036,13 +2033,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, #ifdef CONFIG_X86_64 base |= ((u64)base3) << 32; #endif - r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); + r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, - NULL); + r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2150,7 +2146,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2158,13 +2154,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, save_state_to_tss16(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2173,10 +2169,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -2282,7 +2278,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2290,13 +2286,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2305,10 +2301,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6aa137701cda..274652ae6d52 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -63,6 +63,9 @@ #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +#define emul_to_vcpu(ctxt) \ + container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -3760,37 +3763,43 @@ out: } /* used for instruction fetching */ -static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access | PFERR_FETCH_MASK, exception); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(gva_t addr, void *val, +static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); void *data = val; int r = X86EMUL_CONTINUE; @@ -3818,12 +3827,13 @@ out: return r; } -static int emulator_read_emulated(unsigned long addr, +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; int handled; @@ -3844,7 +3854,7 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) + if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; @@ -3933,12 +3943,14 @@ mmio: return X86EMUL_CONTINUE; } -int emulator_write_emulated(unsigned long addr, +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; @@ -3966,13 +3978,14 @@ int emulator_write_emulated(unsigned long addr, (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) #endif -static int emulator_cmpxchg_emulated(unsigned long addr, +static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; struct page *page; char *kaddr; @@ -4028,7 +4041,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, exception, vcpu); + return emulator_write_emulated(ctxt, addr, new, bytes, exception); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -5009,7 +5022,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); + return emulator_write_emulated(&vcpu->arch.emulate_ctxt, + rip, instruction, 3, NULL); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) From ca1d4a9e772bde0a0b8cda61ee9fdca29f80f361 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 13:37:53 +0300 Subject: [PATCH 074/131] KVM: x86 emulator: drop vcpu argument from pio callbacks Making the emulator caller agnostic. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 10 ++++++---- arch/x86/kvm/emulate.c | 6 +++--- arch/x86/kvm/x86.c | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b4d846708a4b..1348bdf14a43 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -155,11 +155,13 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault); - int (*pio_in_emulated)(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu); + int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count); - int (*pio_out_emulated)(int size, unsigned short port, const void *val, - unsigned int count, struct kvm_vcpu *vcpu); + int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, const void *val, + unsigned int count); bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, int seg, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ff64b17df772..8af08a16f4dd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1125,7 +1125,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (n == 0) n = 1; rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } @@ -3892,8 +3892,8 @@ special_insn: case 0xef: /* out dx,(e/r)ax */ c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - ops->pio_out_emulated(c->src.bytes, c->dst.val, - &c->src.val, 1, ctxt->vcpu); + ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, + &c->src.val, 1); c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 274652ae6d52..e9040a9b25c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4060,9 +4060,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } -static int emulator_pio_in_emulated(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + if (vcpu->arch.pio.count) goto data_avail; @@ -4090,10 +4093,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, return 0; } -static int emulator_pio_out_emulated(int size, unsigned short port, - const void *val, unsigned int count, - struct kvm_vcpu *vcpu) +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + trace_kvm_pio(1, port, size, count); vcpu->arch.pio.port = port; @@ -4614,7 +4619,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, + size, port, &val, 1); /* do not return to emulator after return from userspace */ vcpu->arch.pio.count = 0; return ret; From 4bff1e86ad286d4b3a54902540abeeaf95e64db3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 13:37:53 +0300 Subject: [PATCH 075/131] KVM: x86 emulator: drop vcpu argument from segment/gdt/idt callbacks Making the emulator caller agnostic. [Takuya Yoshikawa: fix typo leading to LDT failures] Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 22 +++--- arch/x86/kvm/emulate.c | 112 ++++++++++++++--------------- arch/x86/kvm/x86.c | 39 +++++----- 3 files changed, 90 insertions(+), 83 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1348bdf14a43..656046a1bd51 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -163,15 +163,19 @@ struct x86_emulate_ops { int size, unsigned short port, const void *val, unsigned int count); - bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, - int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, - int seg, struct kvm_vcpu *vcpu); - u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); - void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); - unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); - void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); + bool (*get_cached_descriptor)(struct x86_emulate_ctxt *ctxt, + struct desc_struct *desc, u32 *base3, + int seg); + void (*set_cached_descriptor)(struct x86_emulate_ctxt *ctxt, + struct desc_struct *desc, u32 base3, + int seg); + u16 (*get_segment_selector)(struct x86_emulate_ctxt *ctxt, int seg); + void (*set_segment_selector)(struct x86_emulate_ctxt *ctxt, + u16 sel, int seg); + unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, + int seg); + void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8af08a16f4dd..9602889f0f7f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -495,7 +495,7 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(seg, ctxt->vcpu); + return ops->get_cached_segment_base(ctxt, seg); } static unsigned seg_override(struct x86_emulate_ctxt *ctxt, @@ -573,8 +573,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); break; default: - usable = ctxt->ops->get_cached_descriptor(&desc, NULL, addr.seg, - ctxt->vcpu); + usable = ctxt->ops->get_cached_descriptor(ctxt, &desc, NULL, + addr.seg); if (!usable) goto bad; /* code segment or read-only data segment */ @@ -597,7 +597,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt->vcpu); - rpl = ctxt->ops->get_segment_selector(addr.seg, ctxt->vcpu) & 3; + rpl = ctxt->ops->get_segment_selector(ctxt, addr.seg) & 3; cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ @@ -1142,14 +1142,14 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, if (selector & 1 << 2) { struct desc_struct desc; memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, - ctxt->vcpu)) + if (!ops->get_cached_descriptor(ctxt, &desc, NULL, + VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ dt->address = get_desc_base(&desc); } else - ops->get_gdt(dt, ctxt->vcpu); + ops->get_gdt(ctxt, dt); } /* allowed just for 8 bytes segments */ @@ -1304,8 +1304,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu); + ops->set_segment_selector(ctxt, selector, seg); + ops->set_cached_descriptor(ctxt, &seg_desc, 0, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1446,7 +1446,7 @@ static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); + c->src.val = ops->get_segment_selector(ctxt, seg); return em_push(ctxt); } @@ -1527,7 +1527,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + c->src.val = ops->get_segment_selector(ctxt, VCPU_SREG_CS); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -1537,7 +1537,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - ops->get_idt(&dt, ctxt->vcpu); + ops->get_idt(ctxt, &dt); eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; @@ -1814,7 +1814,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *ss) { memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); + ops->get_cached_descriptor(ctxt, cs, NULL, VCPU_SREG_CS); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1861,10 +1861,10 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(ctxt, &cs, 0, VCPU_SREG_CS); + ops->set_segment_selector(ctxt, cs_sel, VCPU_SREG_CS); + ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); + ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; if (is_long_mode(ctxt->vcpu)) { @@ -1933,10 +1933,10 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.l = 1; } - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(ctxt, &cs, 0, VCPU_SREG_CS); + ops->set_segment_selector(ctxt, cs_sel, VCPU_SREG_CS); + ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); + ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; @@ -1990,10 +1990,10 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(ctxt, &cs, 0, VCPU_SREG_CS); + ops->set_segment_selector(ctxt, cs_sel, VCPU_SREG_CS); + ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); + ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); c->eip = c->regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; @@ -2024,7 +2024,7 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, unsigned mask = (1 << len) - 1; unsigned long base; - ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_descriptor(ctxt, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) @@ -2079,11 +2079,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, tss->si = c->regs[VCPU_REGS_RSI]; tss->di = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = ops->get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = ops->get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = ops->get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = ops->get_segment_selector(ctxt, VCPU_SREG_DS); + tss->ldt = ops->get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, @@ -2108,11 +2108,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + ops->set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); + ops->set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + ops->set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + ops->set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + ops->set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2199,13 +2199,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->esi = c->regs[VCPU_REGS_RSI]; tss->edi = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); - tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); - tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = ops->get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = ops->get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = ops->get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = ops->get_segment_selector(ctxt, VCPU_SREG_DS); + tss->fs = ops->get_segment_selector(ctxt, VCPU_SREG_FS); + tss->gs = ops->get_segment_selector(ctxt, VCPU_SREG_GS); + tss->ldt_selector = ops->get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, @@ -2232,13 +2232,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); - ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); - ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + ops->set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + ops->set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + ops->set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + ops->set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + ops->set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + ops->set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); + ops->set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2320,9 +2320,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, { struct desc_struct curr_tss_desc, next_tss_desc; int ret; - u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + u16 old_tss_sel = ops->get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = - ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; /* FIXME: old_tss_base == ~0 ? */ @@ -2383,8 +2383,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); - ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cached_descriptor(ctxt, &next_tss_desc, 0, VCPU_SREG_TR); + ops->set_segment_selector(ctxt, tss_selector, VCPU_SREG_TR); if (has_error_code) { struct decode_cache *c = &ctxt->decode; @@ -2475,7 +2475,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ulong old_eip; int rc; - old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_cs = ctxt->ops->get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = c->eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); @@ -3743,7 +3743,7 @@ special_insn: rc = emulate_ud(ctxt); goto done; } - c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); + c->dst.val = ops->get_segment_selector(ctxt, c->modrm_reg); break; case 0x8d: /* lea r16/r32, m */ c->dst.val = c->src.addr.mem.ea; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e9040a9b25c6..6a7fbf671b26 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4237,28 +4237,29 @@ static int emulator_get_cpl(struct kvm_vcpu *vcpu) return kvm_x86_ops->get_cpl(vcpu); } -static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(vcpu, dt); + kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); } -static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(vcpu, dt); + kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); } -static unsigned long emulator_get_cached_segment_base(int seg, - struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cached_segment_base( + struct x86_emulate_ctxt *ctxt, int seg) { - return get_segment_base(vcpu, seg); + return get_segment_base(emul_to_vcpu(ctxt), seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, - int seg, struct kvm_vcpu *vcpu) +static bool emulator_get_cached_descriptor(struct x86_emulate_ctxt *ctxt, + struct desc_struct *desc, u32 *base3, + int seg) { struct kvm_segment var; - kvm_get_segment(vcpu, &var, seg); + kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); if (var.unusable) return false; @@ -4283,9 +4284,11 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, - int seg, struct kvm_vcpu *vcpu) +static void emulator_set_cached_descriptor(struct x86_emulate_ctxt *ctxt, + struct desc_struct *desc, u32 base3, + int seg) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_segment var; /* needed to preserve selector */ @@ -4314,22 +4317,22 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, return; } -static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +static u16 emulator_get_segment_selector(struct x86_emulate_ctxt *ctxt, int seg) { struct kvm_segment kvm_seg; - kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_get_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); return kvm_seg.selector; } -static void emulator_set_segment_selector(u16 sel, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_segment_selector(struct x86_emulate_ctxt *ctxt, + u16 sel, int seg) { struct kvm_segment kvm_seg; - kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_get_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); + kvm_set_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); } static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) From 717746e382e58f075642403eaac26bce0640b2c5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 13:37:53 +0300 Subject: [PATCH 076/131] KVM: x86 emulator: drop vcpu argument from cr/dr/cpl/msr callbacks Making the emulator caller agnostic. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 14 ++--- arch/x86/kvm/emulate.c | 84 +++++++++++++++--------------- arch/x86/kvm/x86.c | 34 ++++++++---- 3 files changed, 73 insertions(+), 59 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 656046a1bd51..2c02e753ab82 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,13 +176,13 @@ struct x86_emulate_ops { int seg); void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); - ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); - int (*cpl)(struct kvm_vcpu *vcpu); - int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); - int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); + int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + int (*cpl)(struct x86_emulate_ctxt *ctxt); + int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); + int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ int (*intercept)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9602889f0f7f..33ad16b7db2b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -596,7 +596,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; } - cpl = ctxt->ops->cpl(ctxt->vcpu); + cpl = ctxt->ops->cpl(ctxt); rpl = ctxt->ops->get_segment_selector(ctxt, addr.seg) & 3; cpl = max(cpl, rpl); if (!(desc.type & 8)) { @@ -1248,7 +1248,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + cpl = ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1407,7 +1407,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); + int cpl = ops->cpl(ctxt); rc = emulate_pop(ctxt, ops, &val, len); if (rc != X86EMUL_CONTINUE) @@ -1852,7 +1852,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); ss_sel = (u16)(msr_data + 8); @@ -1871,17 +1871,17 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - ops->get_msr(ctxt->vcpu, + ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1910,7 +1910,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) @@ -1938,10 +1938,10 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; @@ -1970,7 +1970,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.dpl = 3; ss.dpl = 3; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); @@ -2010,7 +2010,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ops->cpl(ctxt->vcpu) > iopl; + return ops->cpl(ctxt) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, @@ -2187,7 +2187,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->cr3 = ops->get_cr(ctxt, 3); tss->eip = c->eip; tss->eflags = ctxt->eflags; tss->eax = c->regs[VCPU_REGS_RAX]; @@ -2215,7 +2215,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) + if (ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; @@ -2338,7 +2338,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) + ops->cpl(ctxt) > next_tss_desc.dpl) return emulate_gp(ctxt, 0); } @@ -2382,7 +2382,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, &next_tss_desc); } - ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); + ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); ops->set_cached_descriptor(ctxt, &next_tss_desc, 0, VCPU_SREG_TR); ops->set_segment_selector(ctxt, tss_selector, VCPU_SREG_TR); @@ -2542,7 +2542,7 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; u64 tsc = 0; - ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); c->regs[VCPU_REGS_RAX] = (u32)tsc; c->regs[VCPU_REGS_RDX] = tsc >> 32; return X86EMUL_CONTINUE; @@ -2625,8 +2625,8 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) return emulate_gp(ctxt, 0); - cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); - ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && !(cr4 & X86_CR4_PAE)) @@ -2652,8 +2652,8 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) case 4: { u64 cr4, efer; - cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); - ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) return emulate_gp(ctxt, 0); @@ -2669,7 +2669,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) { unsigned long dr7; - ctxt->ops->get_dr(7, &dr7, ctxt->vcpu); + ctxt->ops->get_dr(ctxt, 7, &dr7); /* Check if DR7.Global_Enable is set */ return dr7 & (1 << 13); @@ -2684,7 +2684,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) if (dr > 7) return emulate_ud(ctxt); - cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + cr4 = ctxt->ops->get_cr(ctxt, 4); if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); @@ -2710,7 +2710,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) { u64 efer; - ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (!(efer & EFER_SVME)) return emulate_ud(ctxt); @@ -2731,9 +2731,9 @@ static int check_svme_pa(struct x86_emulate_ctxt *ctxt) static int check_rdtsc(struct x86_emulate_ctxt *ctxt) { - u64 cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt->vcpu)) + if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) return emulate_ud(ctxt); return X86EMUL_CONTINUE; @@ -2741,10 +2741,10 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { - u64 cr4 = ctxt->ops->get_cr(4, ctxt->vcpu); + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); u64 rcx = kvm_register_read(ctxt->vcpu, VCPU_REGS_RCX); - if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt->vcpu)) || + if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || (rcx > 3)) return emulate_gp(ctxt, 0); @@ -3514,13 +3514,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if ((c->d & Sse) - && ((ops->get_cr(0, ctxt->vcpu) & X86_CR0_EM) - || !(ops->get_cr(4, ctxt->vcpu) & X86_CR4_OSFXSR))) { + && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) + || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } - if ((c->d & Sse) && (ops->get_cr(0, ctxt->vcpu) & X86_CR0_TS)) { + if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } @@ -3533,7 +3533,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { + if ((c->d & Priv) && ops->cpl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } @@ -4052,11 +4052,11 @@ twobyte_insn: break; case 4: /* smsw */ c->dst.bytes = 2; - c->dst.val = ops->get_cr(0, ctxt->vcpu); + c->dst.val = ops->get_cr(ctxt, 0); break; case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | - (c->src.val & 0x0f), ctxt->vcpu); + ops->set_cr(ctxt, 0, (ops->get_cr(ctxt, 0) & ~0x0eul) | + (c->src.val & 0x0f)); c->dst.type = OP_NONE; break; case 5: /* not defined */ @@ -4084,13 +4084,13 @@ twobyte_insn: case 0x18: /* Grp16 (prefetch/nop) */ break; case 0x20: /* mov cr, reg */ - c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); + c->dst.val = ops->get_cr(ctxt, c->modrm_reg); break; case 0x21: /* mov from dr to reg */ - ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); + ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { + if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -4098,9 +4098,9 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if (ops->set_dr(c->modrm_reg, c->src.val & + if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? - ~0ULL : ~0U), ctxt->vcpu) < 0) { + ~0ULL : ~0U)) < 0) { /* #UD condition is already handled by the code above */ emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; @@ -4113,7 +4113,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -4122,7 +4122,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6a7fbf671b26..16373a5bfd01 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4160,15 +4160,15 @@ int emulate_clts(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; } -int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return _kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } -int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(vcpu, dr, value); + return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -4176,8 +4176,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); unsigned long value; switch (cr) { @@ -4204,8 +4205,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int res = 0; switch (cr) { @@ -4232,9 +4234,9 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) return res; } -static int emulator_get_cpl(struct kvm_vcpu *vcpu) +static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(vcpu); + return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) @@ -4335,6 +4337,18 @@ static void emulator_set_segment_selector(struct x86_emulate_ctxt *ctxt, kvm_set_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); } +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) +{ + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); +} + +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); +} + static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { preempt_disable(); @@ -4379,8 +4393,8 @@ static struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .set_msr = kvm_set_msr, - .get_msr = kvm_get_msr, + .set_msr = emulator_set_msr, + .get_msr = emulator_get_msr, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, From 2953538ebbd95b145bd3629126fe5af61b88be11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 13:37:53 +0300 Subject: [PATCH 077/131] KVM: x86 emulator: drop vcpu argument from intercept callback Making the emulator caller agnostic. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 2c02e753ab82..e2b082aa3201 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -185,7 +185,7 @@ struct x86_emulate_ops { int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ - int (*intercept)(struct kvm_vcpu *vcpu, + int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 33ad16b7db2b..acb9fcc283ee 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -438,7 +438,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .next_rip = ctxt->eip, }; - return ctxt->ops->intercept(ctxt->vcpu, &info, stage); + return ctxt->ops->intercept(ctxt, &info, stage); } static inline unsigned long ad_mask(struct decode_cache *c) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16373a5bfd01..4f7248ea6caf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4365,11 +4365,11 @@ static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) preempt_enable(); } -static int emulator_intercept(struct kvm_vcpu *vcpu, +static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(vcpu, info, stage); + return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } static struct x86_emulate_ops emulate_ops = { From fe870ab9ce1c3e64c6d6b6ee3fe53d0d029f1044 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:01:23 +0300 Subject: [PATCH 078/131] KVM: x86 emulator: avoid using ctxt->vcpu in check_perm() callbacks Unneeded for register access. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index acb9fcc283ee..0ff7d4bd1bb2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2720,7 +2720,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { - u64 rax = kvm_register_read(ctxt->vcpu, VCPU_REGS_RAX); + u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; /* Valid physical address? */ if (rax & 0xffff000000000000) @@ -2742,7 +2742,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - u64 rcx = kvm_register_read(ctxt->vcpu, VCPU_REGS_RCX); + u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || (rcx > 3)) From 1ac9d0cfb07e8ac3b5007d8279c5bd56e124250c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:12:00 +0300 Subject: [PATCH 079/131] KVM: x86 emulator: add and use new callbacks set_idt(), set_gdt() Replacing direct calls to realmode_lgdt(), realmode_lidt(). Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/emulate.c | 14 +++++++------- arch/x86/kvm/x86.c | 26 ++++++++++++-------------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e2b082aa3201..4d1546aa6104 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,6 +176,8 @@ struct x86_emulate_ops { int seg); void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e50bffcf3cc0..a8616ca8320e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -681,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); - void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0ff7d4bd1bb2..57e0e291b38d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3494,6 +3494,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; int irq; /* Used for int 3, int, and into */ + struct desc_ptr desc_ptr; ctxt->decode.mem_read.pos = 0; @@ -4005,9 +4006,6 @@ twobyte_insn: switch (c->b) { case 0x01: /* lgdt, lidt, lmsw */ switch (c->modrm_reg) { - u16 size; - unsigned long address; - case 0: /* vmcall */ if (c->modrm_mod != 3 || c->modrm_rm != 1) goto cannot_emulate; @@ -4023,10 +4021,11 @@ twobyte_insn: break; case 2: /* lgdt */ rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, c->op_bytes); + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); if (rc != X86EMUL_CONTINUE) goto done; - realmode_lgdt(ctxt->vcpu, size, address); + ctxt->ops->set_gdt(ctxt, &desc_ptr); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -4041,11 +4040,12 @@ twobyte_insn: } } else { rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, + &desc_ptr.size, + &desc_ptr.address, c->op_bytes); if (rc != X86EMUL_CONTINUE) goto done; - realmode_lidt(ctxt->vcpu, size, address); + ctxt->ops->set_idt(ctxt, &desc_ptr); } /* Disable writeback. */ c->dst.type = OP_NONE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4f7248ea6caf..7cd3a3b491de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4249,6 +4249,16 @@ static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); } +static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) +{ + kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); +} + +static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) +{ + kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); +} + static unsigned long emulator_get_cached_segment_base( struct x86_emulate_ctxt *ctxt, int seg) { @@ -4388,6 +4398,8 @@ static struct x86_emulate_ops emulate_ops = { .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_idt = emulator_get_idt, + .set_gdt = emulator_set_gdt, + .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, @@ -5049,20 +5061,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) rip, instruction, 3, NULL); } -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); -} - static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; From c2ad2bb3ef870067ecfc9ccdcf465feb51f2b6a5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:21:35 +0300 Subject: [PATCH 080/131] KVM: x86 emulator: drop use of is_long_mode() Requires ctxt->vcpu, which is to be abolished. Replace with open calls to get_msr(). Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 57e0e291b38d..be1532f4b8b8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1844,12 +1844,14 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_EFER, &efer); setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); @@ -1857,7 +1859,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel = (u16)(msr_data & 0xfffc); ss_sel = (u16)(msr_data + 8); - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } @@ -1867,7 +1869,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; @@ -1897,7 +1899,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); @@ -1927,8 +1931,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; ss_sel &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 - || is_long_mode(ctxt->vcpu)) { + if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { cs.d = 0; cs.l = 1; } @@ -2603,6 +2606,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; u64 new_val = c->src.val64; int cr = c->modrm_reg; + u64 efer = 0; static u64 cr_reserved_bits[] = { 0xffffffff00000000ULL, @@ -2620,7 +2624,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) switch (cr) { case 0: { - u64 cr4, efer; + u64 cr4; if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) return emulate_gp(ctxt, 0); @@ -2637,7 +2641,8 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) case 3: { u64 rsvd = 0; - if (is_long_mode(ctxt->vcpu)) + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) rsvd = CR3_L_MODE_RESERVED_BITS; else if (is_pae(ctxt->vcpu)) rsvd = CR3_PAE_RESERVED_BITS; @@ -2650,7 +2655,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) break; } case 4: { - u64 cr4, efer; + u64 cr4; cr4 = ctxt->ops->get_cr(ctxt, 4); ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); From fd72c4192220d0086fb24356ac6ff9c3b1e067d9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:24:32 +0300 Subject: [PATCH 081/131] KVM: x86 emulator: Replace calls to is_pae() and is_paging with ->get_cr() Avoid use of ctxt->vcpu. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index be1532f4b8b8..6a5125328669 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2644,9 +2644,9 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) rsvd = CR3_L_MODE_RESERVED_BITS; - else if (is_pae(ctxt->vcpu)) + else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) rsvd = CR3_PAE_RESERVED_BITS; - else if (is_paging(ctxt->vcpu)) + else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) rsvd = CR3_NONPAE_RESERVED_BITS; if (new_val & rsvd) From 2d04a05bd7e93c13f13a82ac40de4065a99d069b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:32:49 +0300 Subject: [PATCH 082/131] KVM: x86 emulator: emulate CLTS internally Avoid using ctxt->vcpu; we can do everything with ->get_cr() and ->set_cr(). A side effect is that we no longer activate the fpu on emulated CLTS; but that should be very rare. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/emulate.c | 12 +++++++++++- arch/x86/kvm/x86.c | 7 ------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a8616ca8320e..9c3567e0f730 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -691,7 +691,6 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a5125328669..2b903a326096 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2579,6 +2579,16 @@ static int em_invlpg(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_clts(struct x86_emulate_ctxt *ctxt) +{ + ulong cr0; + + cr0 = ctxt->ops->get_cr(ctxt, 0); + cr0 &= ~X86_CR0_TS; + ctxt->ops->set_cr(ctxt, 0, cr0); + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4079,7 +4089,7 @@ twobyte_insn: rc = emulate_syscall(ctxt, ops); break; case 0x06: - emulate_clts(ctxt->vcpu); + rc = em_clts(ctxt); break; case 0x09: /* wbinvd */ kvm_emulate_wbinvd(ctxt->vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7cd3a3b491de..a9e83862feb8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4153,13 +4153,6 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulate_clts(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - kvm_x86_ops->fpu_activate(vcpu); - return X86EMUL_CONTINUE; -} - int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); From 3cb16fe78ce91991a876c74fc5dc99419b737b7a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:38:44 +0300 Subject: [PATCH 083/131] KVM: x86 emulator: make emulate_invlpg() an emulator callback Removing direct calls to KVM. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 4d1546aa6104..f89076943701 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -154,6 +154,7 @@ struct x86_emulate_ops { const void *new, unsigned int bytes, struct x86_exception *fault); + void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, void *val, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9c3567e0f730..d957d0d06562 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -690,7 +690,6 @@ struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2b903a326096..5d774e91388e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2573,7 +2573,7 @@ static int em_invlpg(struct x86_emulate_ctxt *ctxt) rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); if (rc == X86EMUL_CONTINUE) - emulate_invlpg(ctxt->vcpu, linear); + ctxt->ops->invlpg(ctxt, linear); /* Disable writeback. */ c->dst.type = OP_NONE; return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a9e83862feb8..8af49b3df675 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4128,10 +4128,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) return kvm_x86_ops->get_segment_base(vcpu, seg); } -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { - kvm_mmu_invlpg(vcpu, address); - return X86EMUL_CONTINUE; + kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) @@ -4382,6 +4381,7 @@ static struct x86_emulate_ops emulate_ops = { .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, .pio_out_emulated = emulator_pio_out_emulated, .get_cached_descriptor = emulator_get_cached_descriptor, From 6c3287f7c5050076b554145f11bdba058de287d1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:43:05 +0300 Subject: [PATCH 084/131] KVM: x86 emulator: add new ->halt() callback Instead of reaching into vcpu internals. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index f89076943701..d30f1e9b7544 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -186,6 +186,7 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + void (*halt)(struct x86_emulate_ctxt *ctxt); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ int (*intercept)(struct x86_emulate_ctxt *ctxt, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5d774e91388e..210df51b76a4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3913,7 +3913,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; + ctxt->ops->halt(ctxt); break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8af49b3df675..2246cf1a4ee0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4351,6 +4351,11 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } +static void emulator_halt(struct x86_emulate_ctxt *ctxt) +{ + emul_to_vcpu(ctxt)->arch.halt_request = 1; +} + static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { preempt_disable(); @@ -4400,6 +4405,7 @@ static struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, + .halt = emulator_halt, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, From d6aa10003b0cded5a538af0d198460e89dc2d6d2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:47:13 +0300 Subject: [PATCH 085/131] KVM: x86 emulator: add ->fix_hypercall() callback Artificial, but needed to remove direct calls to KVM. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/x86.c | 6 +++++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index d30f1e9b7544..d30840ddd2f3 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -187,6 +187,7 @@ struct x86_emulate_ops { int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); + int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ int (*intercept)(struct x86_emulate_ctxt *ctxt, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d957d0d06562..6cfc1ab2cdd6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -752,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 210df51b76a4..64e7373d3b2b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4025,7 +4025,7 @@ twobyte_insn: if (c->modrm_mod != 3 || c->modrm_rm != 1) goto cannot_emulate; - rc = kvm_fix_hypercall(ctxt->vcpu); + rc = ctxt->ops->fix_hypercall(ctxt); if (rc != X86EMUL_CONTINUE) goto done; @@ -4048,7 +4048,7 @@ twobyte_insn: if (c->modrm_mod == 3) { switch (c->modrm_rm) { case 1: - rc = kvm_fix_hypercall(ctxt->vcpu); + rc = ctxt->ops->fix_hypercall(ctxt); break; default: goto cannot_emulate; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2246cf1a4ee0..4a2b40e25021 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -152,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; @@ -4406,6 +4408,7 @@ static struct x86_emulate_ops emulate_ops = { .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .halt = emulator_halt, + .fix_hypercall = emulator_fix_hypercall, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, @@ -5042,8 +5045,9 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); From bcaf5cc543bdb8f61fc3ce09944e0ecde2966595 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:53:23 +0300 Subject: [PATCH 086/131] KVM: x86 emulator: add new ->wbinvd() callback Instead of calling kvm_emulate_wbinvd() directly. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index d30840ddd2f3..51341d6b2f31 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -187,6 +187,7 @@ struct x86_emulate_ops { int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); + void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 64e7373d3b2b..522bc35d290c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4092,7 +4092,7 @@ twobyte_insn: rc = em_clts(ctxt); break; case 0x09: /* wbinvd */ - kvm_emulate_wbinvd(ctxt->vcpu); + ctxt->ops->wbinvd(ctxt); break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a2b40e25021..5d853d540f95 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4154,6 +4154,11 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); +static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) +{ + kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); +} + int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); @@ -4408,6 +4413,7 @@ static struct x86_emulate_ops emulate_ops = { .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .halt = emulator_halt, + .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, From 5197b808a7f459f9c7436573c7785ff3c1324c08 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:55:40 +0300 Subject: [PATCH 087/131] KVM: Avoid using x86_emulate_ctxt.vcpu We can use container_of() instead. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5d853d540f95..65a5b0c545aa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4366,7 +4366,7 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt) static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { preempt_disable(); - kvm_load_guest_fpu(ctxt->vcpu); + kvm_load_guest_fpu(emul_to_vcpu(ctxt)); /* * CR0.TS may reference the host fpu state, not the guest fpu state, * so it may be clear at this point. From 13db70eca62c5bbb2cbbf6b23dadb94065d363d1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Apr 2011 15:56:20 +0300 Subject: [PATCH 088/131] KVM: x86 emulator: drop x86_emulate_ctxt::vcpu No longer used. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 -- arch/x86/kvm/x86.c | 1 - 2 files changed, 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 51341d6b2f31..127ea3e17171 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -269,8 +269,6 @@ struct x86_emulate_ctxt { struct x86_emulate_ops *ops; /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 65a5b0c545aa..a831d5d8ca14 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4463,7 +4463,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = From a78484c60e35555d6e0e5b1eb83d4913621c59fb Mon Sep 17 00:00:00 2001 From: "Roedel, Joerg" Date: Wed, 20 Apr 2011 15:33:16 +0200 Subject: [PATCH 089/131] KVM: MMU: Make cmpxchg_gpte aware of nesting too This patch makes the cmpxchg_gpte() function aware of the difference between l1-gfns and l2-gfns when nested virtualization is in use. This fixes a potential data-corruption problem in the l1-guest and makes the code work correct (at least as correct as the hardware which is emulated in this code) again. Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 74f8567d57ac..1b6899088f97 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -78,15 +78,21 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, +static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t table_gfn, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) { pt_element_t ret; pt_element_t *table; struct page *page; + gpa_t gpa; - page = gfn_to_page(kvm, table_gfn); + gpa = mmu->translate_gpa(vcpu, table_gfn << PAGE_SHIFT, + PFERR_USER_MASK|PFERR_WRITE_MASK); + if (gpa == UNMAPPED_GVA) + return -EFAULT; + + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(gpa)); table = kmap_atomic(page, KM_USER0); ret = CMPXCHG(&table[index], orig_pte, new_pte); @@ -192,11 +198,17 @@ walk: #endif if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, table_gfn, + index, pte, pte|PT_ACCESSED_MASK); + if (ret < 0) { + present = false; + break; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -245,13 +257,17 @@ walk: goto error; if (write_fault && !is_dirty_gpte(pte)) { - bool ret; + int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, table_gfn, index, pte, pte|PT_DIRTY_MASK); - if (ret) + if (ret < 0) { + present = false; + goto error; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; From a38f84ca8c6991925cb8bb6371ade8df9d3cc1e6 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Thu, 21 Apr 2011 14:53:57 +0800 Subject: [PATCH 090/131] KVM: ioapic: Fix an error field reference Function ioapic_debug() in the ioapic_deliver() misnames one filed by reference. This patch correct it. Signed-off-by: Liu Yuan Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 0b9df8303dcf..8df1ca104a7f 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -167,7 +167,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", - entry->fields.dest, entry->fields.dest_mode, + entry->fields.dest_id, entry->fields.dest_mode, entry->fields.delivery_mode, entry->fields.vector, entry->fields.trig_mode); From cfb223753c3750b28103136ad4d51d1a3ae4d64b Mon Sep 17 00:00:00 2001 From: Clemens Noss Date: Thu, 21 Apr 2011 21:16:05 +0200 Subject: [PATCH 091/131] KVM: x86 emulator: avoid calling wbinvd() macro Commit 0b56652e33c72092956c651ab6ceb9f0ad081153 fails to build: CC [M] arch/x86/kvm/emulate.o arch/x86/kvm/emulate.c: In function 'x86_emulate_insn': arch/x86/kvm/emulate.c:4095:25: error: macro "wbinvd" passed 1 arguments, but takes just 0 arch/x86/kvm/emulate.c:4095:3: warning: statement with no effect make[2]: *** [arch/x86/kvm/emulate.o] Error 1 make[1]: *** [arch/x86/kvm] Error 2 make: *** [arch/x86] Error 2 Work around this for now. Signed-off-by: Clemens Noss Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 522bc35d290c..ccb8b383beab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4092,7 +4092,7 @@ twobyte_insn: rc = em_clts(ctxt); break; case 0x09: /* wbinvd */ - ctxt->ops->wbinvd(ctxt); + (ctxt->ops->wbinvd)(ctxt); break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ From d42244499f171a499aa6748bc52304bb40e68ecc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 21 Apr 2011 09:09:22 -0700 Subject: [PATCH 092/131] KVM: x86 emulator: fix const value warning on i386 in svm insn RAX check arch/x86/kvm/emulate.c:2598: warning: integer constant is too large for 'long' type Signed-off-by: Randy Dunlap Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ccb8b383beab..77a5f54f151f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2738,7 +2738,7 @@ static int check_svme_pa(struct x86_emulate_ctxt *ctxt) u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; /* Valid physical address? */ - if (rax & 0xffff000000000000) + if (rax & 0xffff000000000000ULL) return emulate_gp(ctxt, 0); return check_svme(ctxt); From 26d05cc7403dfffbc5afd66a1292adfb7566e3ff Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Apr 2011 12:07:59 +0300 Subject: [PATCH 093/131] KVM: x86 emulator: move 0F 01 sub-opcodes into their own functions Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 146 ++++++++++++++++++++++++++++------------- 1 file changed, 99 insertions(+), 47 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 77a5f54f151f..8e1d0c8196bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2589,6 +2589,95 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_vmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return X86EMUL_UNHANDLEABLE; + + rc = ctxt->ops->fix_hypercall(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->eip; + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_lgdt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, ctxt->ops, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_gdt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_svm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + switch (c->modrm_rm) { + case 1: + rc = ctxt->ops->fix_hypercall(ctxt); + break; + default: + return X86EMUL_UNHANDLEABLE; + } + /* Disable writeback. */ + c->dst.type = OP_NONE; + return rc; +} + +static int em_lidt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, ctxt->ops, c->src.addr.mem, + &desc_ptr.size, + &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_idt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_smsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = 2; + c->dst.val = ctxt->ops->get_cr(ctxt, 0); + return X86EMUL_CONTINUE; +} + +static int em_lmsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) + | (c->src.val & 0x0f)); + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3509,7 +3598,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; int irq; /* Used for int 3, int, and into */ - struct desc_ptr desc_ptr; ctxt->decode.mem_read.pos = 0; @@ -4022,62 +4110,26 @@ twobyte_insn: case 0x01: /* lgdt, lidt, lmsw */ switch (c->modrm_reg) { case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = ctxt->ops->fix_hypercall(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->eip; - /* Disable writeback. */ - c->dst.type = OP_NONE; + rc = em_vmcall(ctxt); break; case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &desc_ptr.size, &desc_ptr.address, - c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - ctxt->ops->set_gdt(ctxt, &desc_ptr); - /* Disable writeback. */ - c->dst.type = OP_NONE; + rc = em_lgdt(ctxt); break; case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) { - switch (c->modrm_rm) { - case 1: - rc = ctxt->ops->fix_hypercall(ctxt); - break; - default: - goto cannot_emulate; - } - } else { - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &desc_ptr.size, - &desc_ptr.address, - c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - ctxt->ops->set_idt(ctxt, &desc_ptr); - } - /* Disable writeback. */ - c->dst.type = OP_NONE; + if (c->modrm_mod == 3) + return em_svm(ctxt); + else + return em_lidt(ctxt); break; case 4: /* smsw */ - c->dst.bytes = 2; - c->dst.val = ops->get_cr(ctxt, 0); + rc = em_smsw(ctxt); break; case 6: /* lmsw */ - ops->set_cr(ctxt, 0, (ops->get_cr(ctxt, 0) & ~0x0eul) | - (c->src.val & 0x0f)); - c->dst.type = OP_NONE; + rc = em_lmsw(ctxt); break; case 5: /* not defined */ - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; + rc = emulate_ud(ctxt); + break; case 7: /* invlpg*/ rc = em_invlpg(ctxt); break; From 68152d88122b24fad0f5910f74efcd19120a19a8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Apr 2011 12:17:13 +0300 Subject: [PATCH 094/131] KVM: x86 emulator: Don't force #UD for 0F 01 /5 While it isn't defined, no need to force a #UD. If it becomes defined in the future this can cause wierd problems for the guest. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8e1d0c8196bd..2132fab188b0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4127,9 +4127,6 @@ twobyte_insn: case 6: /* lmsw */ rc = em_lmsw(ctxt); break; - case 5: /* not defined */ - rc = emulate_ud(ctxt); - break; case 7: /* invlpg*/ rc = em_invlpg(ctxt); break; From 5ef39c71d8398115245a5974b488f8703ba3a6b0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Apr 2011 12:21:50 +0300 Subject: [PATCH 095/131] KVM: x86 emulator: Use opcode::execute for 0F 01 opcode Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 56 ++++++++++-------------------------------- 1 file changed, 13 insertions(+), 43 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2132fab188b0..252f28348cfe 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2625,18 +2625,13 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_svm(struct x86_emulate_ctxt *ctxt) +static int em_vmmcall(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc; - switch (c->modrm_rm) { - case 1: - rc = ctxt->ops->fix_hypercall(ctxt); - break; - default: - return X86EMUL_UNHANDLEABLE; - } + rc = ctxt->ops->fix_hypercall(ctxt); + /* Disable writeback. */ c->dst.type = OP_NONE; return rc; @@ -2909,7 +2904,7 @@ static struct opcode group7_rm1[] = { static struct opcode group7_rm3[] = { DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - DI(SrcNone | ModRM | Prot | VendorSpecific, vmmcall), + II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), @@ -2961,15 +2956,17 @@ static struct opcode group6[] = { static struct group_dual group7 = { { DI(ModRM | Mov | DstMem | Priv, sgdt), DI(ModRM | Mov | DstMem | Priv, sidt), - DI(ModRM | SrcMem | Priv, lgdt), DI(ModRM | SrcMem | Priv, lidt), - DI(SrcNone | ModRM | DstMem | Mov, smsw), N, - DI(SrcMem16 | ModRM | Mov | Priv, lmsw), - DI(SrcMem | ModRM | ByteOp | Priv | NoAccess, invlpg), + II(ModRM | SrcMem | Priv, em_lgdt, lgdt), + II(ModRM | SrcMem | Priv, em_lidt, lidt), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - D(SrcNone | ModRM | Priv | VendorSpecific), EXT(0, group7_rm1), + I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + EXT(0, group7_rm1), N, EXT(0, group7_rm3), - DI(SrcNone | ModRM | DstMem | Mov, smsw), N, - DI(SrcMem16 | ModRM | Mov | Priv, lmsw), EXT(0, group7_rm7), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), } }; static struct opcode group8[] = { @@ -4107,33 +4104,6 @@ done: twobyte_insn: switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - case 0: /* vmcall */ - rc = em_vmcall(ctxt); - break; - case 2: /* lgdt */ - rc = em_lgdt(ctxt); - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) - return em_svm(ctxt); - else - return em_lidt(ctxt); - break; - case 4: /* smsw */ - rc = em_smsw(ctxt); - break; - case 6: /* lmsw */ - rc = em_lmsw(ctxt); - break; - case 7: /* invlpg*/ - rc = em_invlpg(ctxt); - break; - default: - goto cannot_emulate; - } - break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); break; From 40e19b519caeb93def89c45082d776fccfb96dbb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Apr 2011 12:35:41 +0300 Subject: [PATCH 096/131] KVM: SVM: Get rid of x86_intercept_map::valid By reserving 0 as an invalid x86_intercept_stage, we no longer need to store a valid flag in x86_intercept_map. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/svm.c | 12 ++++-------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 127ea3e17171..28114f581fa3 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -304,6 +304,7 @@ struct x86_emulate_ctxt { X86EMUL_MODE_PROT64) enum x86_intercept_stage { + X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, X86_ICPT_POST_EXCEPT, X86_ICPT_POST_MEMACCESS, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index de4bba99160d..9cff0368e1fc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3959,19 +3959,15 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) } #define PRE_EX(exit) { .exit_code = (exit), \ - .stage = X86_ICPT_PRE_EXCEPT, \ - .valid = true } + .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ - .stage = X86_ICPT_POST_EXCEPT, \ - .valid = true } + .stage = X86_ICPT_POST_EXCEPT, } #define POST_MEM(exit) { .exit_code = (exit), \ - .stage = X86_ICPT_POST_MEMACCESS, \ - .valid = true } + .stage = X86_ICPT_POST_MEMACCESS, } static struct __x86_intercept { u32 exit_code; enum x86_intercept_stage stage; - bool valid; } x86_intercept_map[] = { [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), @@ -4039,7 +4035,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, icpt_info = x86_intercept_map[info->intercept]; - if (!icpt_info.valid || stage != icpt_info.stage) + if (stage != icpt_info.stage) goto out; switch (icpt_info.exit_code) { From 6e2ca7d1802bf8ed9908435e34daa116662e7790 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 22 Apr 2011 00:34:44 +0900 Subject: [PATCH 097/131] KVM: MMU: Optimize guest page table walk This patch optimizes the guest page table walk by using get_user() instead of copy_from_user(). With this patch applied, paging64_walk_addr_generic() has become about 0.5us to 1.0us faster on my Phenom II machine with NPT on. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1b6899088f97..a32a1c809149 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -123,6 +123,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gva_t addr, u32 access) { pt_element_t pte; + pt_element_t __user *ptep_user; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; @@ -158,6 +159,9 @@ walk: pt_access = ACC_ALL; for (;;) { + gfn_t real_gfn; + unsigned long host_addr; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -166,9 +170,22 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, - offset, sizeof(pte), - PFERR_USER_MASK|PFERR_WRITE_MASK)) { + real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), + PFERR_USER_MASK|PFERR_WRITE_MASK); + if (real_gfn == UNMAPPED_GVA) { + present = false; + break; + } + real_gfn = gpa_to_gfn(real_gfn); + + host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + if (kvm_is_error_hva(host_addr)) { + present = false; + break; + } + + ptep_user = (pt_element_t __user *)((void *)host_addr + offset); + if (get_user(pte, ptep_user)) { present = false; break; } From d67fc27ae2bdc2d2fb6c8ec4238a12b502b95cc7 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 23 Apr 2011 18:48:02 +0900 Subject: [PATCH 098/131] KVM: x86 emulator: Use opcode::execute for Group 1, CMPS and SCAS The following instructions are changed to use opcode::execute. Group 1 (80-83) ADD (00-05), OR (08-0D), ADC (10-15), SBB (18-1D), AND (20-25), SUB (28-2D), XOR (30-35), CMP (38-3D) CMPS (A6-A7), SCAS (AE-AF) The last two do the same as CMP in the emulator, so em_cmp() is used. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 161 +++++++++++++++++++++++------------------ 1 file changed, 89 insertions(+), 72 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 252f28348cfe..8784916abf77 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2512,6 +2512,72 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_add(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_or(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_adc(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sbb(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_and(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sub(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_xor(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_cmp(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2892,9 +2958,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) -#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ - D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ - D2bv(((_f) & ~Lock) | DstAcc | SrcImm) +#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static struct opcode group7_rm1[] = { DI(SrcNone | ModRM | Priv, monitor), @@ -2918,8 +2984,16 @@ static struct opcode group7_rm7[] = { DIP(SrcNone | ModRM, rdtscp, check_rdtsc), N, N, N, N, N, N, }; + static struct opcode group1[] = { - X7(D(Lock)), N + I(Lock, em_add), + I(Lock, em_or), + I(Lock, em_adc), + I(Lock, em_sbb), + I(Lock, em_and), + I(Lock, em_sub), + I(Lock, em_xor), + I(0, em_cmp), }; static struct opcode group1A[] = { @@ -2991,25 +3065,25 @@ static struct gprefix pfx_0f_6f_0f_7f = { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D6ALU(Lock), + I6ALU(Lock, em_add), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D6ALU(Lock), + I6ALU(Lock, em_or), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D6ALU(Lock), + I6ALU(Lock, em_adc), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D6ALU(Lock), + I6ALU(Lock, em_sbb), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ - D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - D6ALU(0), N, N, + I6ALU(0, em_cmp), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ @@ -3050,12 +3124,12 @@ static struct opcode opcode_table[256] = { I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - D2bv(SrcSI | DstDI | String), + I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ D2bv(DstAcc | SrcImm), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - D2bv(SrcAcc | DstDI | String), + I2bv(SrcAcc | DstDI | String, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ @@ -3179,7 +3253,7 @@ static struct opcode twobyte_table[256] = { #undef D2bv #undef D2bvIP #undef I2bv -#undef D6ALU +#undef I6ALU static unsigned imm_size(struct decode_cache *c) { @@ -3715,60 +3789,27 @@ special_insn: goto twobyte_insn; switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; case 0x06: /* push es */ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; case 0x0e: /* push cs */ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); - break; case 0x16: /* push ss */ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; case 0x1e: /* push ds */ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); break; - case 0x20 ... 0x25: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - c->dst.type = OP_NONE; /* Disable writeback. */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op("inc", c->dst, ctxt->eflags); break; @@ -3803,26 +3844,6 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; case 0x84 ... 0x85: test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); @@ -3892,12 +3913,8 @@ special_insn: c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); break; - case 0xa6 ... 0xa7: /* cmps */ - goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xae ... 0xaf: /* scas */ - goto cmp; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; From c54fe504693204fa672b10a57c3d82a8c41e0b4e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 23 Apr 2011 18:49:40 +0900 Subject: [PATCH 099/131] KVM: x86 emulator: Use opcode::execute for POP reg (58-5F) In addition, the RET emulation is changed to call em_pop() to remove the pop_instruction label. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8784916abf77..9f491bfb00fd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1400,6 +1400,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static int em_pop(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + return emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); +} + static int emulate_popf(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, void *dest, int len) @@ -3089,7 +3096,7 @@ static struct opcode opcode_table[256] = { /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), + X8(I(DstReg | Stack, em_pop)), /* 0x60 - 0x67 */ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , @@ -3816,10 +3823,6 @@ special_insn: case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - break; case 0x60: /* pusha */ rc = emulate_pusha(ctxt); break; @@ -3922,7 +3925,8 @@ special_insn: c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = em_pop(ctxt); + break; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); break; From b96a7fad020b42eb4a564f8a2fb41827a83c4375 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 23 Apr 2011 18:51:07 +0900 Subject: [PATCH 100/131] KVM: x86 emulator: Use opcode::execute for PUSHA/POPA (60/61) For this, emulate_pusha/popa() are converted to em_pusha/popa(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9f491bfb00fd..b7c6e43e4f2b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1473,7 +1473,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static int emulate_pusha(struct x86_emulate_ctxt *ctxt) +static int em_pusha(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1494,8 +1494,7 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulate_popa(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_popa(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1508,7 +1507,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, --reg; } - rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + rc = emulate_pop(ctxt, ctxt->ops, &c->regs[reg], c->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -3098,7 +3097,8 @@ static struct opcode opcode_table[256] = { /* 0x58 - 0x5F */ X8(I(DstReg | Stack, em_pop)), /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64, em_pusha), + I(ImplicitOps | Stack | No64, em_popa), N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ @@ -3823,12 +3823,6 @@ special_insn: case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x60: /* pusha */ - rc = emulate_pusha(ctxt); - break; - case 0x61: /* popa */ - rc = emulate_popa(ctxt, ops); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; From 62aaa2f05abd59598f132e6ebad86318291b5be0 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 23 Apr 2011 18:52:56 +0900 Subject: [PATCH 101/131] KVM: x86 emulator: Use opcode::execute for PUSHF/POPF (9C/9D) For this, em_pushf/popf() are introduced. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b7c6e43e4f2b..c1d9116cf3ac 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1448,6 +1448,16 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } +static int em_popf(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.addr.reg = &ctxt->eflags; + c->dst.bytes = c->op_bytes; + return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); +} + static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, int seg) { @@ -1494,6 +1504,14 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_pushf(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = (unsigned long)ctxt->eflags; + return em_push(ctxt); +} + static int em_popa(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -3126,7 +3144,8 @@ static struct opcode opcode_table[256] = { /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, - DI(ImplicitOps | Stack, pushf), DI(ImplicitOps | Stack, popf), N, N, + II(ImplicitOps | Stack, em_pushf, pushf), + II(ImplicitOps | Stack, em_popf, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), @@ -3900,16 +3919,6 @@ special_insn: case 8: c->dst.val = (s32)c->dst.val; break; } break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - rc = em_push(ctxt); - break; - case 0x9d: /* popf */ - c->dst.type = OP_REG; - c->dst.addr.reg = &ctxt->eflags; - c->dst.bytes = c->op_bytes; - rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - break; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; case 0xc0 ... 0xc1: From 781e0743af3c5ba356d55bc60df59f2dded1e938 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Apr 2011 12:25:50 +0300 Subject: [PATCH 102/131] KVM: MMU: Add unlikely() annotations to walk_addr_generic() walk_addr_generic() is a hot path and is also hard for the cpu to predict - some of the parameters (fetch_fault in particular) vary wildly from invocation to invocation. Add unlikely() annotations where appropriate; all walk failures are considered unlikely, as are cases where we have to mark the accessed or dirty bit, as they are slow paths both in kvm and on real processors. Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a32a1c809149..652d56c081f7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -172,49 +172,51 @@ walk: real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), PFERR_USER_MASK|PFERR_WRITE_MASK); - if (real_gfn == UNMAPPED_GVA) { + if (unlikely(real_gfn == UNMAPPED_GVA)) { present = false; break; } real_gfn = gpa_to_gfn(real_gfn); host_addr = gfn_to_hva(vcpu->kvm, real_gfn); - if (kvm_is_error_hva(host_addr)) { + if (unlikely(kvm_is_error_hva(host_addr))) { present = false; break; } ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (get_user(pte, ptep_user)) { + if (unlikely(get_user(pte, ptep_user))) { present = false; break; } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { + if (unlikely(!is_present_gpte(pte))) { present = false; break; } - if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { + if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, + walker->level))) { rsvd_fault = true; break; } - if (write_fault && !is_writable_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - eperm = true; + if (unlikely(write_fault && !is_writable_pte(pte) + && (user_fault || is_write_protection(vcpu)))) + eperm = true; - if (user_fault && !(pte & PT_USER_MASK)) + if (unlikely(user_fault && !(pte & PT_USER_MASK))) eperm = true; #if PTTYPE == 64 - if (fetch_fault && (pte & PT64_NX_MASK)) + if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) eperm = true; #endif - if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault + && unlikely(!(pte & PT_ACCESSED_MASK))) { int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); @@ -270,10 +272,10 @@ walk: --walker->level; } - if (!present || eperm || rsvd_fault) + if (unlikely(!present || eperm || rsvd_fault)) goto error; - if (write_fault && !is_dirty_gpte(pte)) { + if (write_fault && unlikely(!is_dirty_gpte(pte))) { int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); From 46561646ce409ad96c22645362c113de04f60bfb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Apr 2011 14:09:59 +0300 Subject: [PATCH 103/131] KVM: x86 emulator: consolidate group handling Move all groups into a single field and handle them in a single place. This saves bits when we add more group types (3 bits -> 7 groups types). Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 80 +++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c1d9116cf3ac..7466abae84bf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -73,11 +73,12 @@ #define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ +#define GroupMask (7<<14) /* Opcode uses one of the group mechanisms */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define Prefix (1<<16) /* Instruction varies with 66/f2/f3 prefix */ +#define GroupDual (2<<14) /* Alternate decoding of mod == 3 */ +#define Prefix (3<<14) /* Instruction varies with 66/f2/f3 prefix */ +#define RMExt (4<<14) /* Opcode extension in ModRM r/m if mod == 3 */ #define Sse (1<<17) /* SSE Vector instruction */ -#define RMExt (1<<18) /* Opcode extension in ModRM r/m if mod == 3 */ /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ @@ -2969,7 +2970,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define N D(0) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } @@ -3337,9 +3338,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset, simd_prefix; + int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; - struct opcode opcode, *g_mod012, *g_mod3; + struct opcode opcode; struct operand memop = { .type = OP_NONE }; c->eip = ctxt->eip; @@ -3433,44 +3434,43 @@ done_prefixes: } c->d = opcode.flags; - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - - c->d &= ~(Group | GroupDual); - - goffset = (c->modrm >> 3) & 7; - - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; - - if (opcode.flags & RMExt) { + while (c->d & GroupMask) { + switch (c->d & GroupMask) { + case Group: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + opcode = opcode.u.group[goffset]; + break; + case GroupDual: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + if ((c->modrm >> 6) == 3) + opcode = opcode.u.gdual->mod3[goffset]; + else + opcode = opcode.u.gdual->mod012[goffset]; + break; + case RMExt: goffset = c->modrm & 7; opcode = opcode.u.group[goffset]; - } - - c->d |= opcode.flags; - } - - if (c->d & Prefix) { - if (c->rep_prefix && op_prefix) + break; + case Prefix: + if (c->rep_prefix && op_prefix) + return X86EMUL_UNHANDLEABLE; + simd_prefix = op_prefix ? 0x66 : c->rep_prefix; + switch (simd_prefix) { + case 0x00: opcode = opcode.u.gprefix->pfx_no; break; + case 0x66: opcode = opcode.u.gprefix->pfx_66; break; + case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; + case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; + } + break; + default: return X86EMUL_UNHANDLEABLE; - simd_prefix = op_prefix ? 0x66 : c->rep_prefix; - switch (simd_prefix) { - case 0x00: opcode = opcode.u.gprefix->pfx_no; break; - case 0x66: opcode = opcode.u.gprefix->pfx_66; break; - case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; - case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; } + + c->d &= ~GroupMask; c->d |= opcode.flags; } From 8d7d810255982bfcc355cdb8972d72843acb0cf8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 12 Apr 2011 12:36:21 +0300 Subject: [PATCH 104/131] KVM: mmio_fault_cr2 is not used Remove unused variable mmio_fault_cr2. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6cfc1ab2cdd6..afb0e69bd160 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -358,7 +358,6 @@ struct kvm_vcpu_arch { struct fpu guest_fpu; u64 xcr0; - gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a831d5d8ca14..e3ac212f7c87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4561,7 +4561,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, bool writeback = true; kvm_clear_exception_queue(vcpu); - vcpu->arch.mmio_fault_cr2 = cr2; + /* * TODO: fix emulate.c to use guest_read/write_register * instead of direct ->regs accesses, can save hundred cycles From 4947e7cd0ee36e1aa37dfec4f7fa71cc64a2f0fd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 12 Apr 2011 12:36:23 +0300 Subject: [PATCH 105/131] KVM: emulator: Propagate fault in far jump emulation Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7466abae84bf..3624f202b440 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3993,7 +3993,8 @@ special_insn: jump_far: memcpy(&sel, c->src.valptr + c->op_bytes, 2); - if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) + rc = load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) goto done; c->eip = 0; From 0004c7c25757f103ddb3a9e4bcfd533aad41f9a0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 12 Apr 2011 12:36:24 +0300 Subject: [PATCH 106/131] KVM: Fix compound mmio mmio_index should be taken into account when copying data from userspace. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3ac212f7c87..a9a307a75465 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5576,7 +5576,8 @@ static int complete_mmio(struct kvm_vcpu *vcpu) if (vcpu->mmio_needed) { vcpu->mmio_needed = 0; if (!vcpu->mmio_is_write) - memcpy(vcpu->mmio_data, run->mmio.data, 8); + memcpy(vcpu->mmio_data + vcpu->mmio_index, + run->mmio.data, 8); vcpu->mmio_index += 8; if (vcpu->mmio_index < vcpu->mmio_size) { run->exit_reason = KVM_EXIT_MMIO; From 2aab2c5b2bac6510b3bd143ca83babee382f4302 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 12 Apr 2011 12:36:25 +0300 Subject: [PATCH 107/131] KVM: call cache_all_regs() only once during instruction emulation Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a9a307a75465..f5f2d3d44577 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4459,6 +4459,12 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ cache_all_regs(vcpu); kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -4562,14 +4568,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, kvm_clear_exception_queue(vcpu); - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. - */ - cache_all_regs(vcpu); - if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); vcpu->arch.emulate_ctxt.interruptibility = 0; From 4429d5dc1197aaf8188e4febcde54d26a51baf6c Mon Sep 17 00:00:00 2001 From: "BrillyWu@viatech.com.cn" Date: Mon, 25 Apr 2011 13:55:15 +0800 Subject: [PATCH 108/131] KVM: Add CPUID support for VIA CPU The CPUIDs for Centaur are added, and then the features of PadLock hardware engine on VIA CPU, such as "ace", "ace_en" and so on, can be passed into the kvm guest. Signed-off-by: Brilly Wu Signed-off-by: Kary Jin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5f2d3d44577..22bc69ccf3ef 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2336,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2445,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + /*Add support for Centaur's CPUID instruction*/ + case 0xC0000000: + /*Just support up to 0xC0000004 now*/ + entry->eax = min(entry->eax, 0xC0000004); + break; + case 0xC0000001: + entry->edx &= kvm_supported_word5_x86_features; + cpuid_mask(&entry->edx, 5); + break; + case 0xC0000002: + case 0xC0000003: + case 0xC0000004: + /*Now nothing to do, reserved for the future*/ + break; } kvm_x86_ops->set_supported_cpuid(function, entry); @@ -2491,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (nent >= cpuid->nent) goto out_free; + /* Add support for Centaur's CPUID instruction. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { + do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + limit = cpuid_entries[nent - 1].eax; + for (func = 0xC0000001; + func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + } + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, cpuid->nent); From b74323dc2b7b5690d18bc7934b98e4665e778a7b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 27 Apr 2011 14:06:07 -0400 Subject: [PATCH 109/131] KVM: ia64: fix sparse warnings This patch fixes some sparse warning about "dubious one-bit signed bitfield." Signed-off-by: Jeff Mahoney Originally-by: Jan Blunck Signed-off-by: Jan Blunck Acked-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/vti.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/ia64/kvm/vti.h b/arch/ia64/kvm/vti.h index f6c5617e16af..b214b5b0432d 100644 --- a/arch/ia64/kvm/vti.h +++ b/arch/ia64/kvm/vti.h @@ -83,13 +83,13 @@ union vac { unsigned long value; struct { - int a_int:1; - int a_from_int_cr:1; - int a_to_int_cr:1; - int a_from_psr:1; - int a_from_cpuid:1; - int a_cover:1; - int a_bsw:1; + unsigned int a_int:1; + unsigned int a_from_int_cr:1; + unsigned int a_to_int_cr:1; + unsigned int a_from_psr:1; + unsigned int a_from_cpuid:1; + unsigned int a_cover:1; + unsigned int a_bsw:1; long reserved:57; }; }; @@ -97,12 +97,12 @@ union vac { union vdc { unsigned long value; struct { - int d_vmsw:1; - int d_extint:1; - int d_ibr_dbr:1; - int d_pmc:1; - int d_to_pmd:1; - int d_itm:1; + unsigned int d_vmsw:1; + unsigned int d_extint:1; + unsigned int d_ibr_dbr:1; + unsigned int d_pmc:1; + unsigned int d_to_pmd:1; + unsigned int d_itm:1; long reserved:58; }; }; From 8f74d8e16812d63639871b4e56409b08bdcb66fc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 28 Apr 2011 07:08:36 +0900 Subject: [PATCH 110/131] KVM: MMU: Fix 64-bit paging breakage on x86_32 Fix regression introduced by commit e30d2a170506830d5eef5e9d7990c5aedf1b0a51 KVM: MMU: Optimize guest page table walk On x86_32, get_user() does not support 64-bit values and we fail to build KVM at the point of 64-bit paging. This patch fixes this by using get_user() twice for that condition. Signed-off-by: Takuya Yoshikawa Reported-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 652d56c081f7..52450a6b784f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -115,6 +115,20 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) return access; } +static int FNAME(read_gpte)(pt_element_t *pte, pt_element_t __user *ptep_user) +{ +#if defined(CONFIG_X86_32) && (PTTYPE == 64) + u32 *p = (u32 *)pte; + u32 __user *p_user = (u32 __user *)ptep_user; + + if (unlikely(get_user(*p, p_user))) + return -EFAULT; + return get_user(*(p + 1), p_user + 1); +#else + return get_user(*pte, ptep_user); +#endif +} + /* * Fetch a guest pte for a guest virtual address */ @@ -185,7 +199,7 @@ walk: } ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(get_user(pte, ptep_user))) { + if (unlikely(FNAME(read_gpte)(&pte, ptep_user))) { present = false; break; } From ae8cc059550b2c2ec7a5e9650bb1be7b988a1208 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 24 Apr 2011 22:00:50 -0700 Subject: [PATCH 111/131] KVM: SVM: Make dump_vmcb static, reduce text dump_vmcb isn't used outside this module, make it static. Shrink text and object by ~1% by standardizing formats. $ size arch/x86/kvm/svm.o* text data bss dec hex filename 52910 580 10072 63562 f84a arch/x86/kvm/svm.o.new 53563 580 10072 64215 fad7 arch/x86/kvm/svm.o.old Signed-off-by: Joe Perches Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 176 ++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 82 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9cff0368e1fc..506e4fe23adc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3191,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; -void dump_vmcb(struct kvm_vcpu *vcpu) +static void dump_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; pr_err("VMCB Control Area:\n"); - pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); - pr_err("cr_write: %04x\n", control->intercept_cr >> 16); - pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); - pr_err("dr_write: %04x\n", control->intercept_dr >> 16); - pr_err("exceptions: %08x\n", control->intercept_exceptions); - pr_err("intercepts: %016llx\n", control->intercept); - pr_err("pause filter count: %d\n", control->pause_filter_count); - pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); - pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); - pr_err("tsc_offset: %016llx\n", control->tsc_offset); - pr_err("asid: %d\n", control->asid); - pr_err("tlb_ctl: %d\n", control->tlb_ctl); - pr_err("int_ctl: %08x\n", control->int_ctl); - pr_err("int_vector: %08x\n", control->int_vector); - pr_err("int_state: %08x\n", control->int_state); - pr_err("exit_code: %08x\n", control->exit_code); - pr_err("exit_info1: %016llx\n", control->exit_info_1); - pr_err("exit_info2: %016llx\n", control->exit_info_2); - pr_err("exit_int_info: %08x\n", control->exit_int_info); - pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); - pr_err("nested_ctl: %lld\n", control->nested_ctl); - pr_err("nested_cr3: %016llx\n", control->nested_cr3); - pr_err("event_inj: %08x\n", control->event_inj); - pr_err("event_inj_err: %08x\n", control->event_inj_err); - pr_err("lbr_ctl: %lld\n", control->lbr_ctl); - pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); + pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); + pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); + pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); + pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); + pr_err("%-20s%d\n", "asid:", control->asid); + pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); + pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); + pr_err("%-20s%08x\n", "int_vector:", control->int_vector); + pr_err("%-20s%08x\n", "int_state:", control->int_state); + pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); + pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); + pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); + pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); + pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); + pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%08x\n", "event_inj:", control->event_inj); + pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); + pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("VMCB State Save Area:\n"); - pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", - save->es.selector, save->es.attrib, - save->es.limit, save->es.base); - pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", - save->cs.selector, save->cs.attrib, - save->cs.limit, save->cs.base); - pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", - save->ss.selector, save->ss.attrib, - save->ss.limit, save->ss.base); - pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", - save->ds.selector, save->ds.attrib, - save->ds.limit, save->ds.base); - pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); - pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); - pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->gdtr.selector, save->gdtr.attrib, - save->gdtr.limit, save->gdtr.base); - pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); - pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->idtr.selector, save->idtr.attrib, - save->idtr.limit, save->idtr.base); - pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "es:", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "cs:", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ss:", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ds:", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "fs:", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gs:", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gdtr:", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ldtr:", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "idtr:", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "tr:", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); pr_err("cpl: %d efer: %016llx\n", save->cpl, save->efer); - pr_err("cr0: %016llx cr2: %016llx\n", - save->cr0, save->cr2); - pr_err("cr3: %016llx cr4: %016llx\n", - save->cr3, save->cr4); - pr_err("dr6: %016llx dr7: %016llx\n", - save->dr6, save->dr7); - pr_err("rip: %016llx rflags: %016llx\n", - save->rip, save->rflags); - pr_err("rsp: %016llx rax: %016llx\n", - save->rsp, save->rax); - pr_err("star: %016llx lstar: %016llx\n", - save->star, save->lstar); - pr_err("cstar: %016llx sfmask: %016llx\n", - save->cstar, save->sfmask); - pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", - save->kernel_gs_base, save->sysenter_cs); - pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", - save->sysenter_esp, save->sysenter_eip); - pr_err("gpat: %016llx dbgctl: %016llx\n", - save->g_pat, save->dbgctl); - pr_err("br_from: %016llx br_to: %016llx\n", - save->br_from, save->br_to); - pr_err("excp_from: %016llx excp_to: %016llx\n", - save->last_excp_from, save->last_excp_to); - + pr_err("%-15s %016llx %-13s %016llx\n", + "cr0:", save->cr0, "cr2:", save->cr2); + pr_err("%-15s %016llx %-13s %016llx\n", + "cr3:", save->cr3, "cr4:", save->cr4); + pr_err("%-15s %016llx %-13s %016llx\n", + "dr6:", save->dr6, "dr7:", save->dr7); + pr_err("%-15s %016llx %-13s %016llx\n", + "rip:", save->rip, "rflags:", save->rflags); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsp:", save->rsp, "rax:", save->rax); + pr_err("%-15s %016llx %-13s %016llx\n", + "star:", save->star, "lstar:", save->lstar); + pr_err("%-15s %016llx %-13s %016llx\n", + "cstar:", save->cstar, "sfmask:", save->sfmask); + pr_err("%-15s %016llx %-13s %016llx\n", + "kernel_gs_base:", save->kernel_gs_base, + "sysenter_cs:", save->sysenter_cs); + pr_err("%-15s %016llx %-13s %016llx\n", + "sysenter_esp:", save->sysenter_esp, + "sysenter_eip:", save->sysenter_eip); + pr_err("%-15s %016llx %-13s %016llx\n", + "gpat:", save->g_pat, "dbgctl:", save->dbgctl); + pr_err("%-15s %016llx %-13s %016llx\n", + "br_from:", save->br_from, "br_to:", save->br_to); + pr_err("%-15s %016llx %-13s %016llx\n", + "excp_from:", save->last_excp_from, + "excp_to:", save->last_excp_to); } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) From 0a434bb2bf094f463ca3ca71ac42cea9e423048f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Apr 2011 15:59:33 +0300 Subject: [PATCH 112/131] KVM: VMX: Avoid reading %rip unnecessarily when handling exceptions Avoids a VMREAD. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3f6e9bff0160..139a5cb1f5e1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3170,7 +3170,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) } error_code = 0; - rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -3217,6 +3216,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; + rip = kvm_rip_read(vcpu); kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; From 1aa366163b8b69f660cf94fd5062fa44859e4318 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Apr 2011 13:20:30 +0300 Subject: [PATCH 113/131] KVM: x86 emulator: consolidate segment accessors Instead of separate accessors for the segment selector and cached descriptor, use one accessor for both. This simplifies the code somewhat. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 13 +-- arch/x86/kvm/emulate.c | 122 ++++++++++++++++------------- arch/x86/kvm/x86.c | 41 +++------- 3 files changed, 83 insertions(+), 93 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 28114f581fa3..0049211959c0 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -164,15 +164,10 @@ struct x86_emulate_ops { int size, unsigned short port, const void *val, unsigned int count); - bool (*get_cached_descriptor)(struct x86_emulate_ctxt *ctxt, - struct desc_struct *desc, u32 *base3, - int seg); - void (*set_cached_descriptor)(struct x86_emulate_ctxt *ctxt, - struct desc_struct *desc, u32 base3, - int seg); - u16 (*get_segment_selector)(struct x86_emulate_ctxt *ctxt, int seg); - void (*set_segment_selector)(struct x86_emulate_ctxt *ctxt, - u16 sel, int seg); + bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, int seg); + void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, int seg); unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, int seg); void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3624f202b440..59992484f5f3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -553,6 +553,26 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) +{ + u16 selector; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); + return selector; +} + +static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, + unsigned seg) +{ + u16 dummy; + u32 base3; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); +} + static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, bool fetch, @@ -563,6 +583,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, bool usable; ulong la; u32 lim; + u16 sel; unsigned cpl, rpl; la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; @@ -574,8 +595,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); break; default: - usable = ctxt->ops->get_cached_descriptor(ctxt, &desc, NULL, - addr.seg); + usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, + addr.seg); if (!usable) goto bad; /* code segment or read-only data segment */ @@ -598,7 +619,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt); - rpl = ctxt->ops->get_segment_selector(ctxt, addr.seg) & 3; + rpl = sel & 3; cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ @@ -1142,9 +1163,10 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, { if (selector & 1 << 2) { struct desc_struct desc; + u16 sel; + memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(ctxt, &desc, NULL, - VCPU_SREG_LDTR)) + if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ @@ -1305,8 +1327,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } load: - ops->set_segment_selector(ctxt, selector, seg); - ops->set_cached_descriptor(ctxt, &seg_desc, 0, seg); + ops->set_segment(ctxt, selector, &seg_desc, 0, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1464,7 +1485,7 @@ static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - c->src.val = ops->get_segment_selector(ctxt, seg); + c->src.val = get_segment_selector(ctxt, seg); return em_push(ctxt); } @@ -1552,7 +1573,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - c->src.val = ops->get_segment_selector(ctxt, VCPU_SREG_CS); + c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -1838,8 +1859,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, struct desc_struct *ss) { + u16 selector; + memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(ctxt, cs, NULL, VCPU_SREG_CS); + ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1888,10 +1911,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(ctxt, &cs, 0, VCPU_SREG_CS); - ops->set_segment_selector(ctxt, cs_sel, VCPU_SREG_CS); - ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); - ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; if (efer & EFER_LMA) { @@ -1961,10 +1982,8 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.l = 1; } - ops->set_cached_descriptor(ctxt, &cs, 0, VCPU_SREG_CS); - ops->set_segment_selector(ctxt, cs_sel, VCPU_SREG_CS); - ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); - ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; @@ -2018,10 +2037,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(ctxt, &cs, 0, VCPU_SREG_CS); - ops->set_segment_selector(ctxt, cs_sel, VCPU_SREG_CS); - ops->set_cached_descriptor(ctxt, &ss, 0, VCPU_SREG_SS); - ops->set_segment_selector(ctxt, ss_sel, VCPU_SREG_SS); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->eip = c->regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; @@ -2048,11 +2065,11 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct desc_struct tr_seg; u32 base3; int r; - u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; + u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; unsigned long base; - ops->get_cached_descriptor(ctxt, &tr_seg, &base3, VCPU_SREG_TR); + ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) @@ -2107,11 +2124,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, tss->si = c->regs[VCPU_REGS_RSI]; tss->di = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(ctxt, VCPU_SREG_ES); - tss->cs = ops->get_segment_selector(ctxt, VCPU_SREG_CS); - tss->ss = ops->get_segment_selector(ctxt, VCPU_SREG_SS); - tss->ds = ops->get_segment_selector(ctxt, VCPU_SREG_DS); - tss->ldt = ops->get_segment_selector(ctxt, VCPU_SREG_LDTR); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, @@ -2136,11 +2153,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); - ops->set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); - ops->set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); - ops->set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); - ops->set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2227,13 +2244,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->esi = c->regs[VCPU_REGS_RSI]; tss->edi = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(ctxt, VCPU_SREG_ES); - tss->cs = ops->get_segment_selector(ctxt, VCPU_SREG_CS); - tss->ss = ops->get_segment_selector(ctxt, VCPU_SREG_SS); - tss->ds = ops->get_segment_selector(ctxt, VCPU_SREG_DS); - tss->fs = ops->get_segment_selector(ctxt, VCPU_SREG_FS); - tss->gs = ops->get_segment_selector(ctxt, VCPU_SREG_GS); - tss->ldt_selector = ops->get_segment_selector(ctxt, VCPU_SREG_LDTR); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); + tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, @@ -2260,13 +2277,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); - ops->set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); - ops->set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); - ops->set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); - ops->set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); - ops->set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); - ops->set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); + set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); + set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2348,7 +2365,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, { struct desc_struct curr_tss_desc, next_tss_desc; int ret; - u16 old_tss_sel = ops->get_segment_selector(ctxt, VCPU_SREG_TR); + u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; @@ -2411,8 +2428,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); - ops->set_cached_descriptor(ctxt, &next_tss_desc, 0, VCPU_SREG_TR); - ops->set_segment_selector(ctxt, tss_selector, VCPU_SREG_TR); + ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { struct decode_cache *c = &ctxt->decode; @@ -2503,7 +2519,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ulong old_eip; int rc; - old_cs = ctxt->ops->get_segment_selector(ctxt, VCPU_SREG_CS); + old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = c->eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); @@ -3881,7 +3897,7 @@ special_insn: rc = emulate_ud(ctxt); goto done; } - c->dst.val = ops->get_segment_selector(ctxt, c->modrm_reg); + c->dst.val = get_segment_selector(ctxt, c->modrm_reg); break; case 0x8d: /* lea r16/r32, m */ c->dst.val = c->src.addr.mem.ea; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 22bc69ccf3ef..77c9d8673dc4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4304,13 +4304,14 @@ static unsigned long emulator_get_cached_segment_base( return get_segment_base(emul_to_vcpu(ctxt), seg); } -static bool emulator_get_cached_descriptor(struct x86_emulate_ctxt *ctxt, - struct desc_struct *desc, u32 *base3, - int seg) +static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, + int seg) { struct kvm_segment var; kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); + *selector = var.selector; if (var.unusable) return false; @@ -4335,16 +4336,14 @@ static bool emulator_get_cached_descriptor(struct x86_emulate_ctxt *ctxt, return true; } -static void emulator_set_cached_descriptor(struct x86_emulate_ctxt *ctxt, - struct desc_struct *desc, u32 base3, - int seg) +static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, + int seg) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_segment var; - /* needed to preserve selector */ - kvm_get_segment(vcpu, &var, seg); - + var.selector = selector; var.base = get_desc_base(desc); #ifdef CONFIG_X86_64 var.base |= ((u64)base3) << 32; @@ -4368,24 +4367,6 @@ static void emulator_set_cached_descriptor(struct x86_emulate_ctxt *ctxt, return; } -static u16 emulator_get_segment_selector(struct x86_emulate_ctxt *ctxt, int seg) -{ - struct kvm_segment kvm_seg; - - kvm_get_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); - return kvm_seg.selector; -} - -static void emulator_set_segment_selector(struct x86_emulate_ctxt *ctxt, - u16 sel, int seg) -{ - struct kvm_segment kvm_seg; - - kvm_get_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(emul_to_vcpu(ctxt), &kvm_seg, seg); -} - static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { @@ -4436,10 +4417,8 @@ static struct x86_emulate_ops emulate_ops = { .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, .pio_out_emulated = emulator_pio_out_emulated, - .get_cached_descriptor = emulator_get_cached_descriptor, - .set_cached_descriptor = emulator_set_cached_descriptor, - .get_segment_selector = emulator_get_segment_selector, - .set_segment_selector = emulator_set_segment_selector, + .get_segment = emulator_get_segment, + .set_segment = emulator_set_segment, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_idt = emulator_get_idt, From 2fb92db1ec08f3235c500e7f460eeb78092d844e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Apr 2011 19:42:18 +0300 Subject: [PATCH 114/131] KVM: VMX: Cache vmcs segment fields Since the emulator now checks segment limits and access rights, it generates a lot more accesses to the vmcs segment fields. Undo some of the performance hit by cacheing those fields in a read-only cache (the entire cache is invalidated on any write, or on guest exit). Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 102 ++++++++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index afb0e69bd160..d2ac8e2ee897 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -136,6 +136,7 @@ enum kvm_reg_ex { VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, VCPU_EXREG_CPL, + VCPU_EXREG_SEGMENTS, }; enum { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 139a5cb1f5e1..4c3fa0f67469 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -162,6 +162,10 @@ struct vcpu_vmx { u32 ar; } tr, es, ds, fs, gs; } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment seg[8]; + } segment_cache; int vpid; bool emulation_required; @@ -174,6 +178,15 @@ struct vcpu_vmx { bool rdtscp_enabled; }; +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); @@ -646,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, + unsigned field) +{ + bool ret; + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { + vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + ret = vmx->segment_cache.bitmask & mask; + vmx->segment_cache.bitmask |= mask; + return ret; +} + +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) +{ + u16 *p = &vmx->segment_cache.seg[seg].selector; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + return *p; +} + +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) +{ + ulong *p = &vmx->segment_cache.seg[seg].base; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + return *p; +} + +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].limit; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + return *p; +} + +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].ar; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + return *p; +} + static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -1271,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: @@ -1717,6 +1788,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); @@ -1740,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1803,6 +1878,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); } + vmx_segment_cache_clear(vmx); + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1879,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; + vmx_segment_cache_clear(to_vmx(vcpu)); + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { printk(KERN_DEBUG "%s: tss fixup for long mode. \n", @@ -2082,7 +2161,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_save_segment *save; u32 ar; @@ -2104,13 +2182,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = save->limit; ar = save->ar; if (seg == VCPU_SREG_TR - || var->selector == vmcs_read16(sf->selector)) + || var->selector == vmx_read_guest_seg_selector(vmx, seg)) goto use_saved_rmode_seg; } - var->base = vmcs_readl(sf->base); - var->limit = vmcs_read32(sf->limit); - var->selector = vmcs_read16(sf->selector); - ar = vmcs_read32(sf->ar_bytes); + var->base = vmx_read_guest_seg_base(vmx, seg); + var->limit = vmx_read_guest_seg_limit(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + ar = vmx_read_guest_seg_ar(vmx, seg); use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; @@ -2127,14 +2205,13 @@ use_saved_rmode_seg: static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment s; if (to_vmx(vcpu)->rmode.vm86_active) { vmx_get_segment(vcpu, &s, seg); return s.base; } - return vmcs_readl(sf->base); + return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } static int __vmx_get_cpl(struct kvm_vcpu *vcpu) @@ -2146,7 +2223,7 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; - return vmcs_read16(GUEST_CS_SELECTOR) & 3; + return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; } static int vmx_get_cpl(struct kvm_vcpu *vcpu) @@ -2188,6 +2265,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; + vmx_segment_cache_clear(vmx); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; @@ -2229,7 +2308,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { - u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); *db = (ar >> 14) & 1; *l = (ar >> 13) & 1; @@ -2816,6 +2895,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (ret != 0) goto out; + vmx_segment_cache_clear(vmx); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode @@ -4188,6 +4269,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_RFLAGS) | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; From 90d34b0e45df3bfe522e9e9d604c4c1a0253699d Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 29 Mar 2011 16:49:10 -0500 Subject: [PATCH 115/131] KVM: PPC: e500: emulate SVR Return the actual host SVR for now, as we already do for PVR. Eventually we may support Qemu overriding PVR/SVR if the situation is appropriate, once we implement KVM_SET_SREGS on e500. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_e500.h | 1 + arch/powerpc/kvm/e500.c | 1 + arch/powerpc/kvm/e500_emulate.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index 7fea26fffb25..bb2a0890600f 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -43,6 +43,7 @@ struct kvmppc_vcpu_e500 { u32 host_pid[E500_PID_NUM]; u32 pid[E500_PID_NUM]; + u32 svr; u32 mas0; u32 mas1; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index e3768ee9b595..0c1af1267843 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -63,6 +63,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) /* Registers init */ vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu_e500->svr = mfspr(SPRN_SVR); /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ vcpu->vcpu_id = 0; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 8e3edfbc9634..e2fb47f035a5 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -175,6 +175,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; case SPRN_HID1: kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; + case SPRN_SVR: + kvmppc_set_gpr(vcpu, rt, vcpu_e500->svr); break; case SPRN_MMUCSR0: kvmppc_set_gpr(vcpu, rt, 0); break; From 49ea06957bf637b28aa338fba26432d5bafdeb99 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 28 Mar 2011 15:01:24 -0500 Subject: [PATCH 116/131] KVM: PPC: fix exit accounting for SPRs, tlbwe, tlbsx The exit type setting for mfspr/mtspr is moved from 44x to toplevel SPR emulation. This enables it on e500, and makes sure that all SPRs are covered. Exit accounting for tlbwe and tlbsx is added to e500. Signed-off-by: Stuart Yoder Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/44x_emulate.c | 2 -- arch/powerpc/kvm/e500_tlb.c | 5 ++++- arch/powerpc/kvm/emulate.c | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 65ea083a5b27..549bb2c9a47a 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -158,7 +158,6 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); return emulated; } @@ -179,7 +178,6 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); return emulated; } diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index d6d6d47a75a9..56ac4523857d 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, yu.liu@freescale.com * @@ -24,6 +24,7 @@ #include "../mm/mmu_decl.h" #include "e500_tlb.h" #include "trace.h" +#include "timing.h" #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) @@ -506,6 +507,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) vcpu_e500->mas7 = 0; } + kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; } @@ -571,6 +573,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) write_host_tlbe(vcpu_e500, stlbsel, sesel); } + kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); return EMULATE_DONE; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index c64fd2909bb2..8f7a3aa03c26 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -294,6 +294,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } break; } + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); break; case OP_31_XOP_STHX: @@ -363,6 +364,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) printk("mtspr: unknown spr %x\n", sprn); break; } + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); break; case OP_31_XOP_DCBI: From 1a040b26c5c915b317103b87ae7006d40443f197 Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Mon, 28 Mar 2011 15:01:56 -0500 Subject: [PATCH 117/131] KVM: PPC: use ticks, not usecs, for exit timing Convert to microseconds when displaying (with fix from Bharat Bhushan ). This reduces rounding error with large quantities of short exits. Signed-off-by: Stuart Yoder Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/timing.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index 18f40fd3e98f..319177df9587 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -151,17 +151,30 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private) { struct kvm_vcpu *vcpu = m->private; int i; + u64 min, max, sum, sum_quad; seq_printf(m, "%s", "type count min max sum sum_squared\n"); + for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { + + min = vcpu->arch.timing_min_duration[i]; + do_div(min, tb_ticks_per_usec); + max = vcpu->arch.timing_max_duration[i]; + do_div(max, tb_ticks_per_usec); + sum = vcpu->arch.timing_sum_duration[i]; + do_div(sum, tb_ticks_per_usec); + sum_quad = vcpu->arch.timing_sum_quad_duration[i]; + do_div(sum_quad, tb_ticks_per_usec); + seq_printf(m, "%12s %10d %10lld %10lld %20lld %20lld\n", kvm_exit_names[i], vcpu->arch.timing_count_type[i], - vcpu->arch.timing_min_duration[i], - vcpu->arch.timing_max_duration[i], - vcpu->arch.timing_sum_duration[i], - vcpu->arch.timing_sum_quad_duration[i]); + min, + max, + sum, + sum_quad); + } return 0; } From eab176722f4628b2d9cf76221a43dd3a0e37e632 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 27 Apr 2011 17:24:10 -0500 Subject: [PATCH 118/131] KVM: PPC: booke: save/restore VRSAVE (a.k.a. USPRG0) Linux doesn't use USPRG0 (now renamed VRSAVE in the architecture, even when Altivec isn't involved), but a guest might. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/booke_interrupts.S | 1 - arch/powerpc/kvm/powerpc.c | 13 +++++++++++++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 890897cee050..a16804399165 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -223,6 +223,7 @@ struct kvm_vcpu_arch { ulong hflags; ulong guest_owned_ext; #endif + u32 vrsave; /* also USPRG0 */ u32 mmucr; ulong sprg4; ulong sprg5; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 23e6a93145ab..cf0d82278951 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -395,6 +395,7 @@ int main(void) DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); + DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 1cc471faac2d..b58ccae95904 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -380,7 +380,6 @@ lightweight_exit: * because host interrupt handlers would get confused. */ lwz r1, VCPU_GPR(r1)(r4) - /* XXX handle USPRG0 */ /* Host interrupt handlers may have clobbered these guest-readable * SPRGs, so we need to reload them here with the guest's values. */ lwz r3, VCPU_SPRG4(r4) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ec3d2e75c0a8..9e6aa8bfd160 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -298,12 +298,25 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { +#ifdef CONFIG_BOOKE + /* + * vrsave (formerly usprg0) isn't used by Linux, but may + * be used by the guest. + * + * On non-booke this is associated with Altivec and + * is handled by code in book3s.c. + */ + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); +#endif kvmppc_core_vcpu_load(vcpu, cpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvmppc_core_vcpu_put(vcpu); +#ifdef CONFIG_BOOKE + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); +#endif } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, From 5ce941ee4258b836cf818d2ac159d8cf3ebad648 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 27 Apr 2011 17:24:21 -0500 Subject: [PATCH 119/131] KVM: PPC: booke: add sregs support Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- Documentation/kvm/api.txt | 6 +- arch/powerpc/include/asm/kvm.h | 184 ++++++++++++++++++++++++++++ arch/powerpc/include/asm/kvm_44x.h | 1 - arch/powerpc/include/asm/kvm_e500.h | 1 + arch/powerpc/include/asm/kvm_host.h | 3 + arch/powerpc/include/asm/kvm_ppc.h | 9 ++ arch/powerpc/kvm/44x.c | 10 ++ arch/powerpc/kvm/booke.c | 154 ++++++++++++++++++++++- arch/powerpc/kvm/e500.c | 75 ++++++++++++ arch/powerpc/kvm/e500_emulate.c | 5 +- arch/powerpc/kvm/e500_tlb.c | 8 ++ arch/powerpc/kvm/emulate.c | 13 +- arch/powerpc/kvm/powerpc.c | 4 + include/linux/kvm.h | 1 + 14 files changed, 461 insertions(+), 13 deletions(-) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 1b9eaa7e8856..f64c41f8ba61 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -261,7 +261,7 @@ See KVM_GET_REGS for the data structure. 4.13 KVM_GET_SREGS Capability: basic -Architectures: x86 +Architectures: x86, ppc Type: vcpu ioctl Parameters: struct kvm_sregs (out) Returns: 0 on success, -1 on error @@ -279,6 +279,8 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; +/* ppc -- see arch/powerpc/include/asm/kvm.h */ + interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set. This interrupt has been acknowledged by the APIC but not yet injected into the cpu core. @@ -286,7 +288,7 @@ but not yet injected into the cpu core. 4.14 KVM_SET_SREGS Capability: basic -Architectures: x86 +Architectures: x86, ppc Type: vcpu ioctl Parameters: struct kvm_sregs (in) Returns: 0 on success, -1 on error diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 18ea6963ad77..d2ca5ed3877b 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -45,6 +45,114 @@ struct kvm_regs { __u64 gpr[32]; }; +#define KVM_SREGS_E_IMPL_NONE 0 +#define KVM_SREGS_E_IMPL_FSL 1 + +#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ + +/* + * Feature bits indicate which sections of the sregs struct are valid, + * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers + * corresponding to unset feature bits will not be modified. This allows + * restoring a checkpoint made without that feature, while keeping the + * default values of the new registers. + * + * KVM_SREGS_E_BASE contains: + * CSRR0/1 (refers to SRR2/3 on 40x) + * ESR + * DEAR + * MCSR + * TSR + * TCR + * DEC + * TB + * VRSAVE (USPRG0) + */ +#define KVM_SREGS_E_BASE (1 << 0) + +/* + * KVM_SREGS_E_ARCH206 contains: + * + * PIR + * MCSRR0/1 + * DECAR + * IVPR + */ +#define KVM_SREGS_E_ARCH206 (1 << 1) + +/* + * Contains EPCR, plus the upper half of 64-bit registers + * that are 32-bit on 32-bit implementations. + */ +#define KVM_SREGS_E_64 (1 << 2) + +#define KVM_SREGS_E_SPRG8 (1 << 3) +#define KVM_SREGS_E_MCIVPR (1 << 4) + +/* + * IVORs are used -- contains IVOR0-15, plus additional IVORs + * in combination with an appropriate feature bit. + */ +#define KVM_SREGS_E_IVOR (1 << 5) + +/* + * Contains MAS0-4, MAS6-7, TLBnCFG, MMUCFG. + * Also TLBnPS if MMUCFG[MAVN] = 1. + */ +#define KVM_SREGS_E_ARCH206_MMU (1 << 6) + +/* DBSR, DBCR, IAC, DAC, DVC */ +#define KVM_SREGS_E_DEBUG (1 << 7) + +/* Enhanced debug -- DSRR0/1, SPRG9 */ +#define KVM_SREGS_E_ED (1 << 8) + +/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */ +#define KVM_SREGS_E_SPE (1 << 9) + +/* External Proxy (EXP) -- EPR */ +#define KVM_SREGS_EXP (1 << 10) + +/* External PID (E.PD) -- EPSC/EPLC */ +#define KVM_SREGS_E_PD (1 << 11) + +/* Processor Control (E.PC) -- IVOR36-37 if KVM_SREGS_E_IVOR */ +#define KVM_SREGS_E_PC (1 << 12) + +/* Page table (E.PT) -- EPTCFG */ +#define KVM_SREGS_E_PT (1 << 13) + +/* Embedded Performance Monitor (E.PM) -- IVOR35 if KVM_SREGS_E_IVOR */ +#define KVM_SREGS_E_PM (1 << 14) + +/* + * Special updates: + * + * Some registers may change even while a vcpu is not running. + * To avoid losing these changes, by default these registers are + * not updated by KVM_SET_SREGS. To force an update, set the bit + * in u.e.update_special corresponding to the register to be updated. + * + * The update_special field is zero on return from KVM_GET_SREGS. + * + * When restoring a checkpoint, the caller can set update_special + * to 0xffffffff to ensure that everything is restored, even new features + * that the caller doesn't know about. + */ +#define KVM_SREGS_E_UPDATE_MCSR (1 << 0) +#define KVM_SREGS_E_UPDATE_TSR (1 << 1) +#define KVM_SREGS_E_UPDATE_DEC (1 << 2) +#define KVM_SREGS_E_UPDATE_DBSR (1 << 3) + +/* + * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a + * previous KVM_GET_REGS. + * + * Unless otherwise indicated, setting any register with KVM_SET_SREGS + * directly sets its value. It does not trigger any special semantics such + * as write-one-to-clear. Calling KVM_SET_SREGS on an unmodified struct + * just received from KVM_GET_SREGS is always a no-op. + */ struct kvm_sregs { __u32 pvr; union { @@ -62,6 +170,82 @@ struct kvm_sregs { __u64 dbat[8]; } ppc32; } s; + struct { + union { + struct { /* KVM_SREGS_E_IMPL_FSL */ + __u32 features; /* KVM_SREGS_E_FSL_ */ + __u32 svr; + __u64 mcar; + __u32 hid0; + + /* KVM_SREGS_E_FSL_PIDn */ + __u32 pid1, pid2; + } fsl; + __u8 pad[256]; + } impl; + + __u32 features; /* KVM_SREGS_E_ */ + __u32 impl_id; /* KVM_SREGS_E_IMPL_ */ + __u32 update_special; /* KVM_SREGS_E_UPDATE_ */ + __u32 pir; /* read-only */ + __u64 sprg8; + __u64 sprg9; /* E.ED */ + __u64 csrr0; + __u64 dsrr0; /* E.ED */ + __u64 mcsrr0; + __u32 csrr1; + __u32 dsrr1; /* E.ED */ + __u32 mcsrr1; + __u32 esr; + __u64 dear; + __u64 ivpr; + __u64 mcivpr; + __u64 mcsr; /* KVM_SREGS_E_UPDATE_MCSR */ + + __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ + __u32 tcr; + __u32 decar; + __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + + /* + * Userspace can read TB directly, but the + * value reported here is consistent with "dec". + * + * Read-only. + */ + __u64 tb; + + __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */ + __u32 dbcr[3]; + __u32 iac[4]; + __u32 dac[2]; + __u32 dvc[2]; + __u8 num_iac; /* read-only */ + __u8 num_dac; /* read-only */ + __u8 num_dvc; /* read-only */ + __u8 pad; + + __u32 epr; /* EXP */ + __u32 vrsave; /* a.k.a. USPRG0 */ + __u32 epcr; /* KVM_SREGS_E_64 */ + + __u32 mas0; + __u32 mas1; + __u64 mas2; + __u64 mas7_3; + __u32 mas4; + __u32 mas6; + + __u32 ivor_low[16]; /* IVOR0-15 */ + __u32 ivor_high[18]; /* IVOR32+, plus room to expand */ + + __u32 mmucfg; /* read-only */ + __u32 eptcfg; /* E.PT, read-only */ + __u32 tlbcfg[4];/* read-only */ + __u32 tlbps[4]; /* read-only */ + + __u32 eplc, epsc; /* E.PD */ + } e; __u8 pad[1020]; } u; }; diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h index d22d39942a92..a0e57618ff33 100644 --- a/arch/powerpc/include/asm/kvm_44x.h +++ b/arch/powerpc/include/asm/kvm_44x.h @@ -61,7 +61,6 @@ static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu) return container_of(vcpu, struct kvmppc_vcpu_44x, vcpu); } -void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid); void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu); void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index bb2a0890600f..7a2a565f88c4 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -59,6 +59,7 @@ struct kvmppc_vcpu_e500 { u32 hid1; u32 tlb0cfg; u32 tlb1cfg; + u64 mcar; struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a16804399165..186f150b9b89 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -233,6 +233,9 @@ struct kvm_vcpu_arch { ulong csrr1; ulong dsrr0; ulong dsrr1; + ulong mcsrr0; + ulong mcsrr1; + ulong mcsr; ulong esr; u32 dec; u32 decar; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ecb3bc74c344..9345238edecf 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -61,6 +61,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); +extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); /* Core-specific hooks */ @@ -142,4 +143,12 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) return r; } +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 74d0e7421143..da3a1225c0ac 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -107,6 +107,16 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + kvmppc_get_sregs_ivor(vcpu, sregs); +} + +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index ef76acb455c3..8462b3a1c1c7 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -569,6 +569,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.shared->srr0 = regs->srr0; vcpu->arch.shared->srr1 = regs->srr1; + kvmppc_set_pid(vcpu, regs->pid); vcpu->arch.shared->sprg0 = regs->sprg0; vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; @@ -584,16 +585,165 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +static void get_sregs_base(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + u64 tb = get_tb(); + + sregs->u.e.features |= KVM_SREGS_E_BASE; + + sregs->u.e.csrr0 = vcpu->arch.csrr0; + sregs->u.e.csrr1 = vcpu->arch.csrr1; + sregs->u.e.mcsr = vcpu->arch.mcsr; + sregs->u.e.esr = vcpu->arch.esr; + sregs->u.e.dear = vcpu->arch.shared->dar; + sregs->u.e.tsr = vcpu->arch.tsr; + sregs->u.e.tcr = vcpu->arch.tcr; + sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); + sregs->u.e.tb = tb; + sregs->u.e.vrsave = vcpu->arch.vrsave; +} + +static int set_sregs_base(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_BASE)) + return 0; + + vcpu->arch.csrr0 = sregs->u.e.csrr0; + vcpu->arch.csrr1 = sregs->u.e.csrr1; + vcpu->arch.mcsr = sregs->u.e.mcsr; + vcpu->arch.esr = sregs->u.e.esr; + vcpu->arch.shared->dar = sregs->u.e.dear; + vcpu->arch.vrsave = sregs->u.e.vrsave; + vcpu->arch.tcr = sregs->u.e.tcr; + + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) + vcpu->arch.dec = sregs->u.e.dec; + + kvmppc_emulate_dec(vcpu); + + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { + /* + * FIXME: existing KVM timer handling is incomplete. + * TSR cannot be read by the guest, and its value in + * vcpu->arch is always zero. For now, just handle + * the case where the caller is trying to inject a + * decrementer interrupt. + */ + + if ((sregs->u.e.tsr & TSR_DIS) && + (vcpu->arch.tcr & TCR_DIE)) + kvmppc_core_queue_dec(vcpu); + } + + return 0; +} + +static void get_sregs_arch206(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + sregs->u.e.features |= KVM_SREGS_E_ARCH206; + + sregs->u.e.pir = 0; + sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; + sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; + sregs->u.e.decar = vcpu->arch.decar; + sregs->u.e.ivpr = vcpu->arch.ivpr; +} + +static int set_sregs_arch206(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) + return 0; + + if (sregs->u.e.pir != 0) + return -EINVAL; + + vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; + vcpu->arch.mcsrr1 = sregs->u.e.mcsrr1; + vcpu->arch.decar = sregs->u.e.decar; + vcpu->arch.ivpr = sregs->u.e.ivpr; + + return 0; +} + +void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + sregs->u.e.features |= KVM_SREGS_E_IVOR; + + sregs->u.e.ivor_low[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + sregs->u.e.ivor_low[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + sregs->u.e.ivor_low[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + sregs->u.e.ivor_low[3] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + sregs->u.e.ivor_low[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + sregs->u.e.ivor_low[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + sregs->u.e.ivor_low[6] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + sregs->u.e.ivor_low[7] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + sregs->u.e.ivor_low[8] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + sregs->u.e.ivor_low[9] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + sregs->u.e.ivor_low[10] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + sregs->u.e.ivor_low[11] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + sregs->u.e.ivor_low[12] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; +} + +int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = sregs->u.e.ivor_low[0]; + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = sregs->u.e.ivor_low[1]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = sregs->u.e.ivor_low[2]; + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = sregs->u.e.ivor_low[3]; + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = sregs->u.e.ivor_low[4]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = sregs->u.e.ivor_low[5]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = sregs->u.e.ivor_low[6]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = sregs->u.e.ivor_low[7]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = sregs->u.e.ivor_low[8]; + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = sregs->u.e.ivor_low[9]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = sregs->u.e.ivor_low[10]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = sregs->u.e.ivor_low[11]; + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = sregs->u.e.ivor_low[12]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = sregs->u.e.ivor_low[13]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = sregs->u.e.ivor_low[14]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = sregs->u.e.ivor_low[15]; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return -ENOTSUPP; + sregs->pvr = vcpu->arch.pvr; + + get_sregs_base(vcpu, sregs); + get_sregs_arch206(vcpu, sregs); + kvmppc_core_get_sregs(vcpu, sregs); + return 0; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return -ENOTSUPP; + int ret; + + if (vcpu->arch.pvr != sregs->pvr) + return -EINVAL; + + ret = set_sregs_base(vcpu, sregs); + if (ret < 0) + return ret; + + ret = set_sregs_arch206(vcpu, sregs); + if (ret < 0) + return ret; + + return kvmppc_core_set_sregs(vcpu, sregs); } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 0c1af1267843..318dbc61ba44 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -97,6 +97,81 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_SPE | + KVM_SREGS_E_PM; + sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + + sregs->u.e.impl.fsl.features = 0; + sregs->u.e.impl.fsl.svr = vcpu_e500->svr; + sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; + sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + + sregs->u.e.mas0 = vcpu_e500->mas0; + sregs->u.e.mas1 = vcpu_e500->mas1; + sregs->u.e.mas2 = vcpu_e500->mas2; + sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; + sregs->u.e.mas4 = vcpu_e500->mas4; + sregs->u.e.mas6 = vcpu_e500->mas6; + + sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); + sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; + sregs->u.e.tlbcfg[1] = vcpu_e500->tlb1cfg; + sregs->u.e.tlbcfg[2] = 0; + sregs->u.e.tlbcfg[3] = 0; + + sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + sregs->u.e.ivor_high[3] = + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + + kvmppc_get_sregs_ivor(vcpu, sregs); +} + +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { + vcpu_e500->svr = sregs->u.e.impl.fsl.svr; + vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; + vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; + } + + if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { + vcpu_e500->mas0 = sregs->u.e.mas0; + vcpu_e500->mas1 = sregs->u.e.mas1; + vcpu_e500->mas2 = sregs->u.e.mas2; + vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; + vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; + vcpu_e500->mas4 = sregs->u.e.mas4; + vcpu_e500->mas6 = sregs->u.e.mas6; + } + + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + if (sregs->u.e.features & KVM_SREGS_E_SPE) { + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = + sregs->u.e.ivor_high[0]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = + sregs->u.e.ivor_high[1]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = + sregs->u.e.ivor_high[2]; + } + + if (sregs->u.e.features & KVM_SREGS_E_PM) { + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = + sregs->u.e.ivor_high[3]; + } + + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index e2fb47f035a5..69cd665a0caf 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, * @@ -78,8 +78,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_PID: - vcpu_e500->pid[0] = vcpu->arch.shadow_pid = - vcpu->arch.pid = spr_val; + kvmppc_set_pid(vcpu, spr_val); break; case SPRN_PID1: vcpu_e500->pid[1] = spr_val; break; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 56ac4523857d..b18fe353397d 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -675,6 +675,14 @@ int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, return -1; } +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + vcpu_e500->pid[0] = vcpu->arch.shadow_pid = + vcpu->arch.pid = pid; +} + void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) { struct tlbe *tlbe; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 8f7a3aa03c26..141dce3c6810 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -114,6 +114,12 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) } } +u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) +{ + u64 jd = tb - vcpu->arch.dec_jiffies; + return vcpu->arch.dec - jd; +} + /* XXX to do: * lhax * lhaux @@ -279,11 +285,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_DEC: { - u64 jd = get_tb() - vcpu->arch.dec_jiffies; - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd); - pr_debug("mfDEC: %x - %llx = %lx\n", - vcpu->arch.dec, jd, - kvmppc_get_gpr(vcpu, rt)); + kvmppc_set_gpr(vcpu, rt, + kvmppc_get_dec(vcpu, get_tb())); break; } default: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9e6aa8bfd160..616dd516ca1f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -175,7 +175,11 @@ int kvm_dev_ioctl_check_extension(long ext) int r; switch (ext) { +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_SREGS: +#else case KVM_CAP_PPC_SEGSTATE: +#endif case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2f63ebeac639..55ef181521ff 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -543,6 +543,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_ASYNC_PF 59 #define KVM_CAP_TSC_CONTROL 60 #define KVM_CAP_GET_TSC_KHZ 61 +#define KVM_CAP_PPC_BOOKE_SREGS 62 #ifdef KVM_CAP_IRQ_ROUTING From 12cb814f3bb35736420cc6bfc9fed7b6a9d3a828 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 7 May 2011 16:31:36 +0900 Subject: [PATCH 120/131] KVM: MMU: Clean up gpte reading with copy_from_user() When we optimized walk_addr_generic() by not using the generic guest memory reader, we replaced copy_from_user() with get_user(): commit e30d2a170506830d5eef5e9d7990c5aedf1b0a51 KVM: MMU: Optimize guest page table walk commit 15e2ac9a43d4d7d08088e404fddf2533a8e7d52e KVM: MMU: Fix 64-bit paging breakage on x86_32 But as Andi pointed out later, copy_from_user() does the same as get_user() as long as we give a constant size to it. So we use copy_from_user() to clean up the code. The only, noticeable, regression introduced by this is 64-bit gpte reading on x86_32 hosts needed for PAE guests. But this can be mitigated by implementing 8-byte get_user() for x86_32, if needed. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 52450a6b784f..88ca456ccd68 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -115,20 +115,6 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) return access; } -static int FNAME(read_gpte)(pt_element_t *pte, pt_element_t __user *ptep_user) -{ -#if defined(CONFIG_X86_32) && (PTTYPE == 64) - u32 *p = (u32 *)pte; - u32 __user *p_user = (u32 __user *)ptep_user; - - if (unlikely(get_user(*p, p_user))) - return -EFAULT; - return get_user(*(p + 1), p_user + 1); -#else - return get_user(*pte, ptep_user); -#endif -} - /* * Fetch a guest pte for a guest virtual address */ @@ -199,7 +185,7 @@ walk: } ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(FNAME(read_gpte)(&pte, ptep_user))) { + if (unlikely(copy_from_user(&pte, ptep_user, sizeof(pte)))) { present = false; break; } From fa3d315a4ce2c0891cdde262562e710d95fba19e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 7 May 2011 16:35:38 +0900 Subject: [PATCH 121/131] KVM: Validate userspace_addr of memslot when registered This way, we can avoid checking the user space address many times when we read the guest memory. Although we can do the same for write if we check which slots are writable, we do not care write now: reading the guest memory happens more often than writing. [avi: change VERIFY_READ to VERIFY_WRITE] Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- virt/kvm/kvm_main.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 88ca456ccd68..e3f81418797e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -185,7 +185,7 @@ walk: } ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(copy_from_user(&pte, ptep_user, sizeof(pte)))) { + if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { present = false; break; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 58146457bf97..ed3c4e7c1008 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -648,7 +648,10 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) goto out; - if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) + /* We can read the guest memory with __xxx_user() later on. */ + if (user_alloc && + ((mem->userspace_addr & (PAGE_SIZE - 1)) || + !access_ok(VERIFY_WRITE, mem->userspace_addr, mem->memory_size))) goto out; if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) goto out; @@ -1283,7 +1286,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; - r = copy_from_user(data, (void __user *)addr + offset, len); + r = __copy_from_user(data, (void __user *)addr + offset, len); if (r) return -EFAULT; return 0; From c1ed6dea8113597cfa00911faa814d9dbb586932 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 May 2011 02:23:13 +0900 Subject: [PATCH 122/131] KVM: x86 emulator: Remove unused arg from seg_override() In addition, one comma at the end of a statement is replaced with a semicolon. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 59992484f5f3..3d0e5ac3a0c1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -500,7 +500,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, } static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) @@ -3527,7 +3526,7 @@ done_prefixes: if (!c->has_seg_override) set_seg_override(c, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt, ops, c); + memop.addr.mem.seg = seg_override(ctxt, c); if (memop.type == OP_MEM && c->ad_bytes != 8) memop.addr.mem.ea = (u32)memop.addr.mem.ea; @@ -3587,7 +3586,7 @@ done_prefixes: c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSI]); - c->src.addr.mem.seg = seg_override(ctxt, ops, c), + c->src.addr.mem.seg = seg_override(ctxt, c); c->src.val = 0; break; case SrcImmFAddr: @@ -4103,7 +4102,7 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt, ops, c), + string_addr_inc(ctxt, seg_override(ctxt, c), VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) From 509cf9fe11dad85f96944095ed63b2caa85cdfc9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 May 2011 02:25:07 +0900 Subject: [PATCH 123/131] KVM: x86 emulator: Remove unused arg from read_descriptor() Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3d0e5ac3a0c1..c26243f6d668 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -729,7 +729,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, } static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct segmented_address addr, u16 *size, unsigned long *address, int op_bytes) { @@ -2720,7 +2719,7 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; - rc = read_descriptor(ctxt, ctxt->ops, c->src.addr.mem, + rc = read_descriptor(ctxt, c->src.addr.mem, &desc_ptr.size, &desc_ptr.address, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2749,9 +2748,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; - rc = read_descriptor(ctxt, ctxt->ops, c->src.addr.mem, - &desc_ptr.size, - &desc_ptr.address, + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; From adddcecf9222aa32938480cc1d03de629fab2a86 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 May 2011 02:26:23 +0900 Subject: [PATCH 124/131] KVM: x86 emulator: Remove unused arg from writeback() Remove inline at this chance. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c26243f6d668..d4f4375c0480 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1351,8 +1351,7 @@ static void write_register_operand(struct operand *op) } } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; struct decode_cache *c = &ctxt->decode; @@ -4089,7 +4088,7 @@ special_insn: goto done; writeback: - rc = writeback(ctxt, ops); + rc = writeback(ctxt); if (rc != X86EMUL_CONTINUE) goto done; From 3b9be3bf2e4d45828f84ba615283a53d11ebf470 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 May 2011 02:27:55 +0900 Subject: [PATCH 125/131] KVM: x86 emulator: Remove unused arg from emulate_pop() The opt of emulate_grp1a() is also removed. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d4f4375c0480..569e57dd1d55 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1402,7 +1402,6 @@ static int em_push(struct x86_emulate_ctxt *ctxt) } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, void *dest, int len) { struct decode_cache *c = &ctxt->decode; @@ -1423,7 +1422,7 @@ static int em_pop(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - return emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + return emulate_pop(ctxt, &c->dst.val, c->op_bytes); } static int emulate_popf(struct x86_emulate_ctxt *ctxt, @@ -1435,7 +1434,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; int cpl = ops->cpl(ctxt); - rc = emulate_pop(ctxt, ops, &val, len); + rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1494,7 +1493,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, unsigned long selector; int rc; - rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + rc = emulate_pop(ctxt, &selector, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1544,7 +1543,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) --reg; } - rc = emulate_pop(ctxt, ctxt->ops, &c->regs[reg], c->op_bytes); + rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1633,7 +1632,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add stack limit check */ - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1641,12 +1640,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, if (temp_eip & ~0xffff) return emulate_gp(ctxt, 0); - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1688,12 +1687,11 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, } } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); } static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) @@ -1822,12 +1820,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, int rc; unsigned long cs; - rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + rc = emulate_pop(ctxt, &c->eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); @@ -2543,7 +2541,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); @@ -3918,7 +3916,7 @@ special_insn: break; } case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); + rc = emulate_grp1a(ctxt); break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) From 51187683cb11b959535d32eb91b673c6a9a03e88 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 May 2011 02:29:17 +0900 Subject: [PATCH 126/131] KVM: x86 emulator: Rename emulate_grpX() to em_grpX() The prototypes are changed appropriately. We also replaces "goto grp45;" with simple em_grp45() call. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 569e57dd1d55..d9ebf6939e49 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1687,14 +1687,14 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, } } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt) +static int em_grp1a(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); } -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +static int em_grp2(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; switch (c->modrm_reg) { @@ -1721,10 +1721,10 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); break; } + return X86EMUL_CONTINUE; } -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp3(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long *rax = &c->regs[VCPU_REGS_RAX]; @@ -1763,7 +1763,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int emulate_grp45(struct x86_emulate_ctxt *ctxt) +static int em_grp45(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1793,8 +1793,7 @@ static int emulate_grp45(struct x86_emulate_ctxt *ctxt) return rc; } -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp9(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; u64 old = c->dst.orig_val64; @@ -3916,7 +3915,7 @@ special_insn: break; } case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt); + rc = em_grp1a(ctxt); break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) @@ -3932,7 +3931,7 @@ special_insn: case 0xa8 ... 0xa9: /* test ax, imm */ goto test; case 0xc0 ... 0xc1: - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; @@ -3967,11 +3966,11 @@ special_insn: rc = emulate_iret(ctxt, ops); break; case 0xd0 ... 0xd1: /* Grp2 */ - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); @@ -4040,7 +4039,7 @@ special_insn: ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); + rc = em_grp3(ctxt); break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; @@ -4071,13 +4070,13 @@ special_insn: ctxt->eflags |= EFLG_DF; break; case 0xfe: /* Grp4 */ - grp45: - rc = emulate_grp45(ctxt); + rc = em_grp45(ctxt); break; case 0xff: /* Grp5 */ if (c->modrm_reg == 5) goto jump_far; - goto grp45; + rc = em_grp45(ctxt); + break; default: goto cannot_emulate; } @@ -4344,7 +4343,7 @@ twobyte_insn: (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops); + rc = em_grp9(ctxt); break; default: goto cannot_emulate; From d2f62766d5778bbaf80d4feb90a23c7edc371a54 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 May 2011 02:30:48 +0900 Subject: [PATCH 127/131] KVM: x86 emulator: Make jmp far emulation into a separate function We introduce em_jmp_far(). We also call this from em_grp45() to stop treating modrm_reg == 5 case separately in the group 5 emulation. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d9ebf6939e49..d6e2477feb18 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1687,6 +1687,23 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, } } +static int em_jmp_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned short sel; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + return X86EMUL_CONTINUE; +} + static int em_grp1a(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -1786,6 +1803,9 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) case 4: /* jmp abs */ c->eip = c->src.val; break; + case 5: /* jmp far */ + rc = em_jmp_far(ctxt); + break; case 6: /* push */ rc = em_push(ctxt); break; @@ -3997,19 +4017,9 @@ special_insn: } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: { /* jmp far */ - unsigned short sel; - jump_far: - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - - rc = load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS); - if (rc != X86EMUL_CONTINUE) - goto done; - - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + case 0xea: /* jmp far */ + rc = em_jmp_far(ctxt); break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -4073,8 +4083,6 @@ special_insn: rc = em_grp45(ctxt); break; case 0xff: /* Grp5 */ - if (c->modrm_reg == 5) - goto jump_far; rc = em_grp45(ctxt); break; default: From 8fa2206821953a50a3a02ea33fcfb3ced2fd9997 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 May 2011 16:31:04 +0300 Subject: [PATCH 128/131] KVM: make guest mode entry to be rcu quiescent state KVM does not hold any references to rcu protected data when it switches CPU into a guest mode. In fact switching to a guest mode is very similar to exiting to userspase from rcu point of view. In addition CPU may stay in a guest mode for quite a long time (up to one time slice). Lets treat guest mode as quiescent state, just like we do with user-mode execution. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0bc3d372e3cb..b9c3299c6a55 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -591,8 +591,17 @@ static inline int kvm_deassign_device(struct kvm *kvm, static inline void kvm_guest_enter(void) { + BUG_ON(preemptible()); account_system_vtime(current); current->flags |= PF_VCPU; + /* KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspase from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + rcu_virt_note_context_switch(smp_processor_id()); } static inline void kvm_guest_exit(void) From 76d25402dfc972a135a656613736fe3602045a95 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 9 May 2011 22:48:54 +0300 Subject: [PATCH 129/131] KVM: Add documentation for KVM_CAP_NR_VCPUS Document KVM_CAP_NR_VCPUS that can be used by the userspace to determine maximum number of VCPUs it can create with the KVM_CREATE_VCPU ioctl. Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Jan Kiszka Signed-off-by: Pekka Enberg Signed-off-by: Avi Kivity --- Documentation/kvm/api.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index f64c41f8ba61..42542eb802ca 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -175,7 +175,10 @@ Parameters: vcpu id (apic id on x86) Returns: vcpu fd on success, -1 on error This API adds a vcpu to a virtual machine. The vcpu id is a small integer -in the range [0, max_vcpus). +in the range [0, max_vcpus). You can use KVM_CAP_NR_VCPUS of the +KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. +If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 +cpus max. 4.8 KVM_GET_DIRTY_LOG (vm ioctl) From 85722cda308c0ad7390dc910139b2ce58c11b9c4 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Wed, 11 May 2011 09:28:28 +0900 Subject: [PATCH 130/131] KVM: Fix kvm mmu_notifier initialization order Like the following, mmu_notifier can be called after registering immediately. So, kvm have to initialize kvm->mmu_lock before it. BUG: spinlock bad magic on CPU#0, kswapd0/342 lock: ffff8800af8c4000, .magic: 00000000, .owner: /-1, .owner_cpu: 0 Pid: 342, comm: kswapd0 Not tainted 2.6.39-rc5+ #1 Call Trace: [] spin_bug+0x9c/0xa3 [] do_raw_spin_lock+0x29/0x13c [] ? flush_tlb_others_ipi+0xaf/0xfd [] _raw_spin_lock+0x9/0xb [] kvm_mmu_notifier_clear_flush_young+0x2c/0x66 [kvm] [] __mmu_notifier_clear_flush_young+0x2b/0x57 [] page_referenced_one+0x88/0xea [] page_referenced+0x1fc/0x256 [] shrink_page_list+0x187/0x53a [] shrink_inactive_list+0x1e0/0x33d [] ? determine_dirtyable_memory+0x15/0x27 [] ? call_function_single_interrupt+0xe/0x20 [] shrink_zone+0x322/0x3de [] ? zone_watermark_ok_safe+0xe2/0xf1 [] kswapd+0x516/0x818 [] ? shrink_zone+0x3de/0x3de [] kthread+0x7d/0x85 [] kernel_thread_helper+0x4/0x10 [] ? __init_kthread_worker+0x37/0x37 [] ? gs_change+0xb/0xb Signed-off-by: OGAWA Hirofumi Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed3c4e7c1008..22cdb960660a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -467,6 +467,7 @@ static struct kvm *kvm_create_vm(void) if (!kvm->buses[i]) goto out_err; } + spin_lock_init(&kvm->mmu_lock); r = kvm_init_mmu_notifier(kvm); if (r) @@ -474,7 +475,6 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); - spin_lock_init(&kvm->mmu_lock); kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); From c8cfbb555eb3632bf3dcbe1a591c1f4d0c28681c Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 1 May 2011 14:33:07 +0900 Subject: [PATCH 131/131] KVM: MMU: Use ptep_user for cmpxchg_gpte() The address of the gpte was already calculated and stored in ptep_user before entering cmpxchg_gpte(). This patch makes cmpxchg_gpte() to use that to make it clear that we are using the same address during walk_addr_generic(). Note that the unlikely annotations are used to show that the conditions are something unusual rather than for performance. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e3f81418797e..6c4dc010c4cb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -79,21 +79,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) } static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) + pt_element_t __user *ptep_user, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) { + int npages; pt_element_t ret; pt_element_t *table; struct page *page; - gpa_t gpa; - gpa = mmu->translate_gpa(vcpu, table_gfn << PAGE_SHIFT, - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (gpa == UNMAPPED_GVA) + npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + /* Check if the user is doing something meaningless. */ + if (unlikely(npages != 1)) return -EFAULT; - page = gfn_to_page(vcpu->kvm, gpa_to_gfn(gpa)); - table = kmap_atomic(page, KM_USER0); ret = CMPXCHG(&table[index], orig_pte, new_pte); kunmap_atomic(table, KM_USER0); @@ -220,9 +218,9 @@ walk: int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, table_gfn, - index, pte, pte|PT_ACCESSED_MASK); - if (ret < 0) { + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_ACCESSED_MASK); + if (unlikely(ret < 0)) { present = false; break; } else if (ret) @@ -279,9 +277,9 @@ walk: int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret < 0) { + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_DIRTY_MASK); + if (unlikely(ret < 0)) { present = false; goto error; } else if (ret)