Mercurial > illumos > git > illumos-joyent
changeset 25248:71e294efbc2d
12916 bhyve should be able to limit vmx capabilities
12917 bhyve should always use Unrestricted Guest
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
author | Patrick Mooney <pmooney@pfmooney.com> |
---|---|
date | Wed, 24 Jun 2020 20:53:43 +0000 |
parents | 00298d3cee5f |
children | 3714f17693a0 |
files | usr/src/cmd/bhyve/bhyverun.c usr/src/cmd/bhyve/spinup_ap.c usr/src/lib/libvmmapi/common/vmmapi.c usr/src/uts/i86pc/io/vmm/amd/svm.c usr/src/uts/i86pc/io/vmm/intel/vmx.c usr/src/uts/i86pc/io/vmm/intel/vmx.h usr/src/uts/i86pc/sys/vmm.h |
diffstat | 7 files changed, 162 insertions(+), 501 deletions(-) |
line wrap: on
line diff
--- a/usr/src/cmd/bhyve/bhyverun.c Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/cmd/bhyve/bhyverun.c Wed Jun 24 20:53:43 2020 +0000 @@ -39,6 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include <sys/cdefs.h> @@ -948,6 +949,7 @@ static int num_vcpus_allowed(struct vmctx *ctx) { +#ifdef __FreeBSD__ int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); @@ -960,6 +962,10 @@ return (VM_MAXCPU); else return (1); +#else + /* Unrestricted Guest is always enabled on illumos */ + return (VM_MAXCPU); +#endif /* __FreeBSD__ */ } void @@ -1314,11 +1320,15 @@ vga_init(1); if (lpc_bootrom()) { +#ifdef __FreeBSD__ if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } +#else + /* Unrestricted Guest is always enabled on illumos */ +#endif error = vcpu_reset(ctx, BSP); assert(error == 0); }
--- a/usr/src/cmd/bhyve/spinup_ap.c Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/cmd/bhyve/spinup_ap.c Wed Jun 24 20:53:43 2020 +0000 @@ -27,6 +27,18 @@ * * $FreeBSD$ */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -89,6 +101,7 @@ fbsdrun_set_capabilities(ctx, newcpu); +#ifdef __FreeBSD__ /* * Enable the 'unrestricted guest' mode for 'newcpu'. * @@ -97,6 +110,9 @@ */ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); +#else + /* Unrestricted Guest is always enabled on illumos */ +#endif spinup_ap_realmode(ctx, newcpu, &rip);
--- a/usr/src/lib/libvmmapi/common/vmmapi.c Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/lib/libvmmapi/common/vmmapi.c Wed Jun 24 20:53:43 2020 +0000 @@ -39,6 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include <sys/cdefs.h> @@ -1004,7 +1005,9 @@ [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", +#ifdef __FreeBSD__ [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", +#endif [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", };
--- a/usr/src/uts/i86pc/io/vmm/amd/svm.c Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c Wed Jun 24 20:53:43 2020 +0000 @@ -2342,11 +2342,6 @@ svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; - case VM_CAP_UNRESTRICTED_GUEST: - /* Unrestricted guest execution cannot be disabled in SVM */ - if (val == 0) - error = EINVAL; - break; default: error = ENOENT; break; @@ -2372,9 +2367,6 @@ *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; - case VM_CAP_UNRESTRICTED_GUEST: - *retval = 1; /* unrestricted guest is always enabled */ - break; default: error = ENOENT; break;
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c Wed Jun 24 20:53:43 2020 +0000 @@ -134,7 +134,14 @@ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) -#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +/* + * EPT and Unrestricted Guest are considered necessities. The latter is not a + * requirement on FreeBSD, where grub2-bhyve is used to load guests directly + * without a bootrom starting in real mode. + */ +#define PROCBASED_CTLS2_ONE_SETTING \ + (PROCBASED2_ENABLE_EPT | \ + PROCBASED2_UNRESTRICTED_GUEST) #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ @@ -206,10 +213,6 @@ SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); -static int cap_unrestricted_guest; -SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, - &cap_unrestricted_guest, 0, "Unrestricted guests"); - static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); @@ -218,17 +221,8 @@ SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); -static int tpr_shadowing; -SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD, - &tpr_shadowing, 0, "TPR shadowing support"); - -static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, - &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); - -static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, - &posted_interrupts, 0, "APICv posted interrupt support"); +/* Extra capabilities (VMX_CAP_*) beyond the minimum */ +static enum vmx_caps vmx_capabilities; static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, @@ -603,94 +597,6 @@ } } -#ifdef __FreeBSD__ -static void 
-vpid_init(void) -{ - /* - * VPID 0 is required when the "enable VPID" execution control is - * disabled. - * - * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the - * unit number allocator does not have sufficient unique VPIDs to - * satisfy the allocation. - * - * The remaining VPIDs are managed by the unit number allocator. - */ - vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); -} - -static void -vmx_disable(void *arg __unused) -{ - struct invvpid_desc invvpid_desc = { 0 }; - struct invept_desc invept_desc = { 0 }; - - if (vmxon_enabled[curcpu]) { - /* - * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. - * - * VMXON or VMXOFF are not required to invalidate any TLB - * caching structures. This prevents potential retention of - * cached information in the TLB between distinct VMX episodes. - */ - invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); - invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); - vmxoff(); - } - load_cr4(rcr4() & ~CR4_VMXE); -} - -static int -vmx_cleanup(void) -{ - - if (pirvec >= 0) - lapic_ipi_free(pirvec); - - if (vpid_unr != NULL) { - delete_unrhdr(vpid_unr); - vpid_unr = NULL; - } - - if (nmi_flush_l1d_sw == 1) - nmi_flush_l1d_sw = 0; - - smp_rendezvous(NULL, vmx_disable, NULL, NULL); - - return (0); -} - -static void -vmx_enable(void *arg __unused) -{ - int error; - uint64_t feature_control; - - feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); - if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || - (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { - wrmsr(MSR_IA32_FEATURE_CONTROL, - feature_control | IA32_FEATURE_CONTROL_VMX_EN | - IA32_FEATURE_CONTROL_LOCK); - } - - load_cr4(rcr4() | CR4_VMXE); - - *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); - error = vmxon(vmxon_region[curcpu]); - if (error == 0) - vmxon_enabled[curcpu] = 1; -} - -static void -vmx_restore(void) -{ - - if (vmxon_enabled[curcpu]) - vmxon(vmxon_region[curcpu]); -} -#else /* __FreeBSD__ */ static int vmx_cleanup(void) { @@ -703,48 
+609,14 @@ { /* No-op on illumos */ } -#endif /* __FreeBSD__ */ static int vmx_init(int ipinum) { int error; -#ifdef __FreeBSD__ - uint64_t basic, fixed0, fixed1, feature_control; -#else uint64_t fixed0, fixed1; -#endif - uint32_t tmp, procbased2_vid_bits; - -#ifdef __FreeBSD__ - /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ - if (!(cpu_feature2 & CPUID2_VMX)) { - printf("vmx_init: processor does not support VMX operation\n"); - return (ENXIO); - } - - /* - * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits - * are set (bits 0 and 2 respectively). - */ - feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); - if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && - (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { - printf("vmx_init: VMX operation disabled by BIOS\n"); - return (ENXIO); - } - - /* - * Verify capabilities MSR_VMX_BASIC: - * - bit 54 indicates support for INS/OUTS decoding - */ - basic = rdmsr(MSR_VMX_BASIC); - if ((basic & (1UL << 54)) == 0) { - printf("vmx_init: processor does not support desired basic " - "capabilities\n"); - return (EINVAL); - } -#endif /* __FreeBSD__ */ + uint32_t tmp; + enum vmx_caps avail_caps = VMX_CAP_NONE; /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, @@ -828,93 +700,53 @@ PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); - cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, - MSR_VMX_PROCBASED_CTLS2, - PROCBASED2_UNRESTRICTED_GUEST, 0, - &tmp) == 0); - cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); - /* - * Check support for TPR shadow. 
- */ - error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, - MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, - &tmp); - if (error == 0) { - tpr_shadowing = 1; - TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing", - &tpr_shadowing); - } - - if (tpr_shadowing) { - procbased_ctls |= PROCBASED_USE_TPR_SHADOW; - procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; - procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; - } - - /* - * Check support for virtual interrupt delivery. + /* Check for APIC virtualization capabilities: + * - TPR shadowing + * - Full APICv (with or without x2APIC support) + * - Posted interrupt handling */ - procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | - PROCBASED2_VIRTUALIZE_X2APIC_MODE | - PROCBASED2_APIC_REGISTER_VIRTUALIZATION | - PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); - - error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, - procbased2_vid_bits, 0, &tmp); - if (error == 0 && tpr_shadowing) { - virtual_interrupt_delivery = 1; - TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", - &virtual_interrupt_delivery); - } - - if (virtual_interrupt_delivery) { - procbased_ctls |= PROCBASED_USE_TPR_SHADOW; - procbased_ctls2 |= procbased2_vid_bits; - procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; - - /* - * Check for Posted Interrupts only if Virtual Interrupt - * Delivery is enabled. - */ - error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, - MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, - &tmp); - if (error == 0) { -#ifdef __FreeBSD__ - pirvec = lapic_ipi_alloc(pti ? 
&IDTVEC(justreturn1_pti) : - &IDTVEC(justreturn)); - if (pirvec < 0) { - if (bootverbose) { - printf("vmx_init: unable to allocate " - "posted interrupt vector\n"); + if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0) { + avail_caps |= VMX_CAP_TPR_SHADOW; + + const uint32_t apicv_bits = + PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY; + if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, apicv_bits, 0, &tmp) == 0) { + avail_caps |= VMX_CAP_APICV; + + /* + * It may make sense in the future to differentiate + * hardware (or software) configurations with APICv but + * no support for accelerating x2APIC mode. + */ + avail_caps |= VMX_CAP_APICV_X2APIC; + + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_POSTED_INTERRUPT, 0, &tmp); + if (error == 0) { + /* + * If the PSM-provided interfaces for requesting + * and using a PIR IPI vector are present, use + * them for posted interrupts. + */ + if (psm_get_pir_ipivect != NULL && + psm_send_pir_ipi != NULL) { + pirvec = psm_get_pir_ipivect(); + avail_caps |= VMX_CAP_APICV_PIR; } - } else { - posted_interrupts = 1; - TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", - &posted_interrupts); } -#else - /* - * If the PSM-provided interfaces for requesting and - * using a PIR IPI vector are present, use them for - * posted interrupts. - */ - if (psm_get_pir_ipivect != NULL && - psm_send_pir_ipi != NULL) { - pirvec = psm_get_pir_ipivect(); - posted_interrupts = 1; - } -#endif } } - if (posted_interrupts) - pinbased_ctls |= PINBASED_POSTED_INTERRUPT; - /* Initialize EPT */ error = ept_init(ipinum); if (error) { @@ -962,11 +794,10 @@ cr0_zeros_mask = ~fixed0 & ~fixed1; /* - * CR0_PE and CR0_PG can be set to zero in VMX non-root operation - * if unrestricted guest execution is allowed. 
+ * Since Unrestricted Guest was already verified present, CR0_PE and + * CR0_PG are allowed to be set to zero in VMX non-root operation */ - if (cap_unrestricted_guest) - cr0_ones_mask &= ~(CR0_PG | CR0_PE); + cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. @@ -978,17 +809,9 @@ cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; -#ifdef __FreeBSD__ - vpid_init(); -#endif - vmx_msr_init(); -#ifdef __FreeBSD__ - /* enable VMX operation */ - smp_rendezvous(NULL, vmx_enable, NULL, NULL); -#endif - + vmx_capabilities = avail_caps; vmx_initialized = 1; return (0); @@ -1065,6 +888,7 @@ struct vmcs *vmcs; uint32_t exc_bitmap; uint16_t maxcpus; + uint32_t proc_ctls, proc2_ctls, pin_ctls; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { @@ -1119,16 +943,38 @@ vpid_alloc(vpid, VM_MAXCPU); - if (virtual_interrupt_delivery) { + /* Grab the established defaults */ + proc_ctls = procbased_ctls; + proc2_ctls = procbased_ctls2; + pin_ctls = pinbased_ctls; + /* For now, default to the available capabilities */ + vmx->vmx_caps = vmx_capabilities; + + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) { + proc_ctls |= PROCBASED_USE_TPR_SHADOW; + proc_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + proc_ctls &= ~PROCBASED_CR8_STORE_EXITING; + } + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + ASSERT(vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)); + + proc2_ctls |= (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } + if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { + ASSERT(vmx_cap_en(vmx, VMX_CAP_APICV)); + + pin_ctls |= PINBASED_POSTED_INTERRUPT; + } maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { -#ifndef __FreeBSD__ /* * Cache physical 
address lookups for various components which * may be required inside the critical_enter() section implied @@ -1137,13 +983,10 @@ vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap); vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]); vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]); -#endif /* __FreeBSD__ */ vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); -#ifndef __FreeBSD__ vmcs->vmcs_pa = (uint64_t)vtophys(vmcs); -#endif error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", @@ -1157,25 +1000,14 @@ VMPTRLD(vmcs); error = 0; -#ifdef __FreeBSD__ - /* - * The illumos vmx_enter_guest implementation avoids some of - * the %rsp-manipulation games which are present in the stock - * one from FreeBSD. - */ - error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); -#endif + error += vmwrite(VMCS_EPTP, vmx->eptp); - error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); - error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); - error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_PIN_BASED_CTLS, pin_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, proc2_ctls); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); -#ifdef __FreeBSD__ - error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); -#else error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa); -#endif error += vmwrite(VMCS_VPID, vpid[i]); if (guest_l1d_flush && !guest_l1d_flush_sw) { @@ -1197,37 +1029,27 @@ vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); - if (tpr_shadowing) { -#ifdef __FreeBSD__ - error += vmwrite(VMCS_VIRTUAL_APIC, - vtophys(&vmx->apic_page[i])); -#else + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) { error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa); -#endif } - if (virtual_interrupt_delivery) { + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { error += 
vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } - if (posted_interrupts) { + if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); -#ifdef __FreeBSD__ - error += vmwrite(VMCS_PIR_DESC, - vtophys(&vmx->pir_desc[i])); -#else error += vmwrite(VMCS_PIR_DESC, pir_desc_pa); -#endif } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; - vmx->cap[i].proc_ctls = procbased_ctls; - vmx->cap[i].proc_ctls2 = procbased_ctls2; + vmx->cap[i].proc_ctls = proc_ctls; + vmx->cap[i].proc_ctls2 = proc2_ctls; vmx->cap[i].exc_bitmap = exc_bitmap; vmx->state[i].nextrip = ~0; @@ -1536,7 +1358,6 @@ #endif } -#ifndef __FreeBSD__ static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) @@ -1640,7 +1461,7 @@ */ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); - } else if (!virtual_interrupt_delivery) { + } else if (!vmx_cap_en(vmx, VMX_CAP_APICV)) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; @@ -1715,197 +1536,6 @@ */ vmx_set_int_window_exiting(vmx, vcpu); } -#else -static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, - uint64_t guestrip) -{ - int vector, need_nmi_exiting, extint_pending; - uint64_t rflags, entryinfo; - uint32_t gi, info; - - vlapic_tmr_update(vlapic); - - if (vmx->state[vcpu].nextrip != guestrip) { - gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); - if (gi & HWINTR_BLOCKING) { - VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " - "cleared due to rip change: %#lx/%#lx", - vmx->state[vcpu].nextrip, guestrip); - gi &= ~HWINTR_BLOCKING; - vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); - } - } - - if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { - KASSERT((entryinfo & VMCS_INTR_VALID) != 0, 
("%s: entry " - "intinfo is not valid: %#lx", __func__, entryinfo)); - - info = vmcs_read(VMCS_ENTRY_INTR_INFO); - KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " - "pending exception: %#lx/%#x", __func__, entryinfo, info)); - - info = entryinfo; - vector = info & 0xff; - if (vector == IDT_BP || vector == IDT_OF) { - /* - * VT-x requires #BP and #OF to be injected as software - * exceptions. - */ - info &= ~VMCS_INTR_T_MASK; - info |= VMCS_INTR_T_SWEXCEPTION; - } - - if (info & VMCS_INTR_DEL_ERRCODE) - vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); - - vmcs_write(VMCS_ENTRY_INTR_INFO, info); - } - - if (vm_nmi_pending(vmx->vm, vcpu)) { - /* - * If there are no conditions blocking NMI injection then - * inject it directly here otherwise enable "NMI window - * exiting" to inject it as soon as we can. - * - * We also check for STI_BLOCKING because some implementations - * don't allow NMI injection in this case. If we are running - * on a processor that doesn't have this restriction it will - * immediately exit and the NMI will be injected in the - * "NMI window exiting" handler. - */ - need_nmi_exiting = 1; - gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); - if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { - info = vmcs_read(VMCS_ENTRY_INTR_INFO); - if ((info & VMCS_INTR_VALID) == 0) { - vmx_inject_nmi(vmx, vcpu); - need_nmi_exiting = 0; - } else { - VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " - "due to VM-entry intr info %#x", info); - } - } else { - VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " - "Guest Interruptibility-state %#x", gi); - } - - if (need_nmi_exiting) - vmx_set_nmi_window_exiting(vmx, vcpu); - } - - extint_pending = vm_extint_pending(vmx->vm, vcpu); - - if (!extint_pending && virtual_interrupt_delivery) { - vmx_inject_pir(vlapic); - return; - } - - /* - * If interrupt-window exiting is already in effect then don't bother - * checking for pending interrupts. 
This is just an optimization and - * not needed for correctness. - */ - if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { - VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " - "pending int_window_exiting"); - return; - } - - if (!extint_pending) { - /* Ask the local apic for a vector to inject */ - if (!vlapic_pending_intr(vlapic, &vector)) - return; - - /* - * From the Intel SDM, Volume 3, Section "Maskable - * Hardware Interrupts": - * - maskable interrupt vectors [16,255] can be delivered - * through the local APIC. - */ - KASSERT(vector >= 16 && vector <= 255, - ("invalid vector %d from local APIC", vector)); - } else { - /* Ask the legacy pic for a vector to inject */ - vatpic_pending_intr(vmx->vm, &vector); - - /* - * From the Intel SDM, Volume 3, Section "Maskable - * Hardware Interrupts": - * - maskable interrupt vectors [0,255] can be delivered - * through the INTR pin. - */ - KASSERT(vector >= 0 && vector <= 255, - ("invalid vector %d from INTR", vector)); - } - - /* Check RFLAGS.IF and the interruptibility state of the guest */ - rflags = vmcs_read(VMCS_GUEST_RFLAGS); - if ((rflags & PSL_I) == 0) { - VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " - "rflags %#lx", vector, rflags); - goto cantinject; - } - - gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); - if (gi & HWINTR_BLOCKING) { - VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " - "Guest Interruptibility-state %#x", vector, gi); - goto cantinject; - } - - info = vmcs_read(VMCS_ENTRY_INTR_INFO); - if (info & VMCS_INTR_VALID) { - /* - * This is expected and could happen for multiple reasons: - * - A vectoring VM-entry was aborted due to astpending - * - A VM-exit happened during event injection. - * - An exception was injected above. 
- * - An NMI was injected above or after "NMI window exiting" - */ - VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " - "VM-entry intr info %#x", vector, info); - goto cantinject; - } - - /* Inject the interrupt */ - info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; - info |= vector; - vmcs_write(VMCS_ENTRY_INTR_INFO, info); - - if (!extint_pending) { - /* Update the Local APIC ISR */ - vlapic_intr_accepted(vlapic, vector); - } else { - vm_extint_clear(vmx->vm, vcpu); - vatpic_intr_accepted(vmx->vm, vector); - - /* - * After we accepted the current ExtINT the PIC may - * have posted another one. If that is the case, set - * the Interrupt Window Exiting execution control so - * we can inject that one too. - * - * Also, interrupt window exiting allows us to inject any - * pending APIC vector that was preempted by the ExtINT - * as soon as possible. This applies both for the software - * emulated vlapic and the hardware assisted virtual APIC. - */ - vmx_set_int_window_exiting(vmx, vcpu); - } - - VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); - - return; - -cantinject: - /* - * Set the Interrupt Window Exiting execution control so we can inject - * the interrupt as soon as blocking condition goes away. - */ - vmx_set_int_window_exiting(vmx, vcpu); -} -#endif /* __FreeBSD__ */ /* * If the Virtual NMIs execution control is '1' then the logical processor @@ -2830,11 +2460,12 @@ SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); - if (virtual_interrupt_delivery) + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { vmexit->u.hlt.intr_status = vmcs_read(VMCS_GUEST_INTR_STATUS); - else + } else { vmexit->u.hlt.intr_status = 0; + } break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); @@ -3336,22 +2967,16 @@ * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). 
- */ -#ifdef __FreeBSD__ - disable_intr(); - vmx_inject_interrupts(vmx, vcpu, vlapic, rip); -#else - /* + * * The bulk of guest interrupt injection is done without * interrupts disabled on the host CPU. This is necessary * since contended mutexes might force the thread to sleep. */ vmx_inject_interrupts(vmx, vcpu, vlapic, rip); disable_intr(); - if (virtual_interrupt_delivery) { + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { vmx_inject_pir(vlapic); } -#endif /* __FreeBSD__ */ /* * Check for vcpu suspension after injecting events because @@ -3449,12 +3074,15 @@ #endif /* - * If TPR Shadowing is enabled, the TPR Threshold - * must be updated right before entering the guest. + * If TPR Shadowing is enabled, the TPR Threshold must be + * updated right before entering the guest. */ - if (tpr_shadowing && !virtual_interrupt_delivery) { - if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { - vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) && + !vmx_cap_en(vmx, VMX_CAP_APICV)) { + if ((vmx->cap[vcpu].proc_ctls & + PROCBASED_USE_TPR_SHADOW) != 0) { + vmcs_write(VMCS_TPR_THRESHOLD, + vlapic_get_cr8(vlapic)); } } @@ -3812,10 +3440,6 @@ if (cap_monitor_trap) ret = 0; break; - case VM_CAP_UNRESTRICTED_GUEST: - if (cap_unrestricted_guest) - ret = 0; - break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; @@ -3876,15 +3500,6 @@ reg = VMCS_PRI_PROC_BASED_CTLS; } break; - case VM_CAP_UNRESTRICTED_GUEST: - if (cap_unrestricted_guest) { - retval = 0; - pptr = &vmx->cap[vcpu].proc_ctls2; - baseval = *pptr; - flag = PROCBASED2_UNRESTRICTED_GUEST; - reg = VMCS_SEC_PROC_BASED_CTLS; - } - break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; @@ -4309,21 +3924,21 @@ vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; - if (tpr_shadowing) { + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) { vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; } - - if (virtual_interrupt_delivery) { + if 
(vmx_cap_en(vmx, VMX_CAP_APICV)) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; + + if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { + vlapic->ops.post_intr = vmx_post_intr; + } } - if (posted_interrupts) - vlapic->ops.post_intr = vmx_post_intr; - vlapic_init(vlapic); return (vlapic);
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h Wed Jun 24 20:53:43 2020 +0000 @@ -29,7 +29,17 @@ */ /* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #ifndef _VMX_H_ @@ -151,6 +161,7 @@ struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; uint64_t eptp; + enum vmx_caps vmx_caps; struct vm *vm; long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ }; @@ -158,6 +169,12 @@ CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); +static __inline bool +vmx_cap_en(const struct vmx *vmx, enum vmx_caps cap) +{ + return ((vmx->vmx_caps & cap) == cap); +} + #define VMX_GUEST_VMEXIT 0 #define VMX_VMRESUME_ERROR 1 #define VMX_VMLAUNCH_ERROR 2
--- a/usr/src/uts/i86pc/sys/vmm.h Tue Jul 21 02:10:58 2020 -0500 +++ b/usr/src/uts/i86pc/sys/vmm.h Wed Jun 24 20:53:43 2020 +0000 @@ -39,6 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #ifndef _VMM_H_ @@ -165,12 +166,19 @@ VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, - VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_MAX }; +enum vmx_caps { + VMX_CAP_NONE = 0, + VMX_CAP_TPR_SHADOW = (1UL << 0), + VMX_CAP_APICV = (1UL << 1), + VMX_CAP_APICV_X2APIC = (1UL << 2), + VMX_CAP_APICV_PIR = (1UL << 3), +}; + enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER