changeset 25248:71e294efbc2d

12916 bhyve should be able to limit vmx capabilities
12917 bhyve should always use Unrestricted Guest
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
author Patrick Mooney <pmooney@pfmooney.com>
date Wed, 24 Jun 2020 20:53:43 +0000
parents 00298d3cee5f
children 3714f17693a0
files usr/src/cmd/bhyve/bhyverun.c usr/src/cmd/bhyve/spinup_ap.c usr/src/lib/libvmmapi/common/vmmapi.c usr/src/uts/i86pc/io/vmm/amd/svm.c usr/src/uts/i86pc/io/vmm/intel/vmx.c usr/src/uts/i86pc/io/vmm/intel/vmx.h usr/src/uts/i86pc/sys/vmm.h
diffstat 7 files changed, 162 insertions(+), 501 deletions(-)
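
In short, this change drops the FreeBSD-global tunables for Unrestricted Guest, TPR shadowing, APICv, and posted interrupts in favor of a per-VM capability mask, and treats EPT plus Unrestricted Guest as hard requirements on illumos. A minimal sketch of the resulting model, using only names introduced in the diff below (the flow is condensed for illustration, not a verbatim excerpt):

	/* vmm.h: optional VMX features expressed as a bitmask */
	enum vmx_caps {
		VMX_CAP_NONE		= 0,
		VMX_CAP_TPR_SHADOW	= (1UL << 0),
		VMX_CAP_APICV		= (1UL << 1),
		VMX_CAP_APICV_X2APIC	= (1UL << 2),
		VMX_CAP_APICV_PIR	= (1UL << 3),
	};

	/* vmx.c: vmx_init() probes the host once ... */
	static enum vmx_caps vmx_capabilities;	/* set from avail_caps at init */

	/* ... vmx_vminit() copies the result into each VM's softc ... */
	vmx->vmx_caps = vmx_capabilities;

	/* ... and later code tests the per-VM mask instead of globals. */
	if (vmx_cap_en(vmx, VMX_CAP_APICV))
		error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
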
--- a/usr/src/cmd/bhyve/bhyverun.c	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/cmd/bhyve/bhyverun.c	Wed Jun 24 20:53:43 2020 +0000
@@ -39,6 +39,7 @@
  *
  * Copyright 2015 Pluribus Networks Inc.
  * Copyright 2018 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
  */
 
 #include <sys/cdefs.h>
@@ -948,6 +949,7 @@
 static int
 num_vcpus_allowed(struct vmctx *ctx)
 {
+#ifdef __FreeBSD__
 	int tmp, error;
 
 	error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
@@ -960,6 +962,10 @@
 		return (VM_MAXCPU);
 	else
 		return (1);
+#else
+	/* Unrestricted Guest is always enabled on illumos */
+	return (VM_MAXCPU);
+#endif /* __FreeBSD__ */
 }
 
 void
@@ -1314,11 +1320,15 @@
 	vga_init(1);
 
 	if (lpc_bootrom()) {
+#ifdef __FreeBSD__
 		if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
 			fprintf(stderr, "ROM boot failed: unrestricted guest "
 			    "capability not available\n");
 			exit(4);
 		}
+#else
+		/* Unrestricted Guest is always enabled on illumos */
+#endif
 		error = vcpu_reset(ctx, BSP);
 		assert(error == 0);
 	}
--- a/usr/src/cmd/bhyve/spinup_ap.c	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/cmd/bhyve/spinup_ap.c	Wed Jun 24 20:53:43 2020 +0000
@@ -27,6 +27,18 @@
  *
  * $FreeBSD$
  */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2020 Oxide Computer Company
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
@@ -89,6 +101,7 @@
 
 	fbsdrun_set_capabilities(ctx, newcpu);
 
+#ifdef __FreeBSD__
 	/*
 	 * Enable the 'unrestricted guest' mode for 'newcpu'.
 	 *
@@ -97,6 +110,9 @@
 	 */
 	error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
 	assert(error == 0);
+#else
+	/* Unrestricted Guest is always enabled on illumos */
+#endif
 
 	spinup_ap_realmode(ctx, newcpu, &rip);
 
--- a/usr/src/lib/libvmmapi/common/vmmapi.c	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c	Wed Jun 24 20:53:43 2020 +0000
@@ -39,6 +39,7 @@
  *
  * Copyright 2015 Pluribus Networks Inc.
  * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
  */
 
 #include <sys/cdefs.h>
@@ -1004,7 +1005,9 @@
 	[VM_CAP_HALT_EXIT]  = "hlt_exit",
 	[VM_CAP_MTRAP_EXIT] = "mtrap_exit",
 	[VM_CAP_PAUSE_EXIT] = "pause_exit",
+#ifdef __FreeBSD__
 	[VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest",
+#endif
 	[VM_CAP_ENABLE_INVPCID] = "enable_invpcid",
 	[VM_CAP_BPT_EXIT] = "bpt_exit",
 };
--- a/usr/src/uts/i86pc/io/vmm/amd/svm.c	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c	Wed Jun 24 20:53:43 2020 +0000
@@ -2342,11 +2342,6 @@
 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_PAUSE, val);
 		break;
-	case VM_CAP_UNRESTRICTED_GUEST:
-		/* Unrestricted guest execution cannot be disabled in SVM */
-		if (val == 0)
-			error = EINVAL;
-		break;
 	default:
 		error = ENOENT;
 		break;
@@ -2372,9 +2367,6 @@
 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_PAUSE);
 		break;
-	case VM_CAP_UNRESTRICTED_GUEST:
-		*retval = 1;	/* unrestricted guest is always enabled */
-		break;
 	default:
 		error = ENOENT;
 		break;
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c	Wed Jun 24 20:53:43 2020 +0000
@@ -134,7 +134,14 @@
 	PROCBASED_CR3_STORE_EXITING |	\
 	PROCBASED_IO_BITMAPS)
 
-#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
+/*
+ * EPT and Unrestricted Guest are considered necessities.  The latter is not a
+ * requirement on FreeBSD, where grub2-bhyve is used to load guests directly
+ * without a bootrom starting in real mode.
+ */
+#define	PROCBASED_CTLS2_ONE_SETTING		\
+	(PROCBASED2_ENABLE_EPT |		\
+	PROCBASED2_UNRESTRICTED_GUEST)
 #define	PROCBASED_CTLS2_ZERO_SETTING	0
 
 #define	VM_EXIT_CTLS_ONE_SETTING					\
@@ -206,10 +213,6 @@
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
     0, "PAUSE triggers a VM-exit");
 
-static int cap_unrestricted_guest;
-SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
-    &cap_unrestricted_guest, 0, "Unrestricted guests");
-
 static int cap_monitor_trap;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
     &cap_monitor_trap, 0, "Monitor trap flag");
@@ -218,17 +221,8 @@
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
     0, "Guests are allowed to use INVPCID");
 
-static int tpr_shadowing;
-SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD,
-    &tpr_shadowing, 0, "TPR shadowing support");
-
-static int virtual_interrupt_delivery;
-SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
-    &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
-
-static int posted_interrupts;
-SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
-    &posted_interrupts, 0, "APICv posted interrupt support");
+/* Extra capabilities (VMX_CAP_*) beyond the minimum */
+static enum vmx_caps vmx_capabilities;
 
 static int pirvec = -1;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
@@ -603,94 +597,6 @@
 	}
 }
 
-#ifdef __FreeBSD__
-static void
-vpid_init(void)
-{
-	/*
-	 * VPID 0 is required when the "enable VPID" execution control is
-	 * disabled.
-	 *
-	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
-	 * unit number allocator does not have sufficient unique VPIDs to
-	 * satisfy the allocation.
-	 *
-	 * The remaining VPIDs are managed by the unit number allocator.
-	 */
-	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
-}
-
-static void
-vmx_disable(void *arg __unused)
-{
-	struct invvpid_desc invvpid_desc = { 0 };
-	struct invept_desc invept_desc = { 0 };
-
-	if (vmxon_enabled[curcpu]) {
-		/*
-		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
-		 *
-		 * VMXON or VMXOFF are not required to invalidate any TLB
-		 * caching structures. This prevents potential retention of
-		 * cached information in the TLB between distinct VMX episodes.
-		 */
-		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
-		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
-		vmxoff();
-	}
-	load_cr4(rcr4() & ~CR4_VMXE);
-}
-
-static int
-vmx_cleanup(void)
-{
-
-	if (pirvec >= 0)
-		lapic_ipi_free(pirvec);
-
-	if (vpid_unr != NULL) {
-		delete_unrhdr(vpid_unr);
-		vpid_unr = NULL;
-	}
-
-	if (nmi_flush_l1d_sw == 1)
-		nmi_flush_l1d_sw = 0;
-
-	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
-
-	return (0);
-}
-
-static void
-vmx_enable(void *arg __unused)
-{
-	int error;
-	uint64_t feature_control;
-
-	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
-	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
-	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
-		wrmsr(MSR_IA32_FEATURE_CONTROL,
-		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
-		    IA32_FEATURE_CONTROL_LOCK);
-	}
-
-	load_cr4(rcr4() | CR4_VMXE);
-
-	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
-	error = vmxon(vmxon_region[curcpu]);
-	if (error == 0)
-		vmxon_enabled[curcpu] = 1;
-}
-
-static void
-vmx_restore(void)
-{
-
-	if (vmxon_enabled[curcpu])
-		vmxon(vmxon_region[curcpu]);
-}
-#else /* __FreeBSD__ */
 static int
 vmx_cleanup(void)
 {
@@ -703,48 +609,14 @@
 {
 	/* No-op on illumos */
 }
-#endif /* __FreeBSD__ */
 
 static int
 vmx_init(int ipinum)
 {
 	int error;
-#ifdef __FreeBSD__
-	uint64_t basic, fixed0, fixed1, feature_control;
-#else
 	uint64_t fixed0, fixed1;
-#endif
-	uint32_t tmp, procbased2_vid_bits;
-
-#ifdef __FreeBSD__
-	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
-	if (!(cpu_feature2 & CPUID2_VMX)) {
-		printf("vmx_init: processor does not support VMX operation\n");
-		return (ENXIO);
-	}
-
-	/*
-	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
-	 * are set (bits 0 and 2 respectively).
-	 */
-	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
-	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
-	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
-		printf("vmx_init: VMX operation disabled by BIOS\n");
-		return (ENXIO);
-	}
-
-	/*
-	 * Verify capabilities MSR_VMX_BASIC:
-	 * - bit 54 indicates support for INS/OUTS decoding
-	 */
-	basic = rdmsr(MSR_VMX_BASIC);
-	if ((basic & (1UL << 54)) == 0) {
-		printf("vmx_init: processor does not support desired basic "
-		    "capabilities\n");
-		return (EINVAL);
-	}
-#endif /* __FreeBSD__ */
+	uint32_t tmp;
+	enum vmx_caps avail_caps = VMX_CAP_NONE;
 
 	/* Check support for primary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
@@ -828,93 +700,53 @@
 					 PROCBASED_PAUSE_EXITING, 0,
 					 &tmp) == 0);
 
-	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
-					MSR_VMX_PROCBASED_CTLS2,
-					PROCBASED2_UNRESTRICTED_GUEST, 0,
-				        &tmp) == 0);
-
 	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
 	    &tmp) == 0);
 
-	/*
-	 * Check support for TPR shadow.
-	 */
-	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
-	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
-	    &tmp);
-	if (error == 0) {
-		tpr_shadowing = 1;
-		TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
-		    &tpr_shadowing);
-	}
-
-	if (tpr_shadowing) {
-		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
-		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
-		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
-	}
-
-	/*
-	 * Check support for virtual interrupt delivery.
+	/* Check for APIC virtualization capabilities:
+	 * - TPR shadowing
+	 * - Full APICv (with or without x2APIC support)
+	 * - Posted interrupt handling
 	 */
-	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
-	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
-	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
-	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
-
-	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
-	    procbased2_vid_bits, 0, &tmp);
-	if (error == 0 && tpr_shadowing) {
-		virtual_interrupt_delivery = 1;
-		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
-		    &virtual_interrupt_delivery);
-	}
-
-	if (virtual_interrupt_delivery) {
-		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
-		procbased_ctls2 |= procbased2_vid_bits;
-		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
-
-		/*
-		 * Check for Posted Interrupts only if Virtual Interrupt
-		 * Delivery is enabled.
-		 */
-		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
-		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
-		    &tmp);
-		if (error == 0) {
-#ifdef __FreeBSD__
-			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
-			    &IDTVEC(justreturn));
-			if (pirvec < 0) {
-				if (bootverbose) {
-					printf("vmx_init: unable to allocate "
-					    "posted interrupt vector\n");
+	if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS,
+	    PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0) {
+		avail_caps |= VMX_CAP_TPR_SHADOW;
+
+		const uint32_t apicv_bits =
+		    PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
+		    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
+		    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
+		    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY;
+		if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+		    MSR_VMX_PROCBASED_CTLS2, apicv_bits, 0, &tmp) == 0) {
+			avail_caps |= VMX_CAP_APICV;
+
+			/*
+			 * It may make sense in the future to differentiate
+			 * hardware (or software) configurations with APICv but
+			 * no support for accelerating x2APIC mode.
+			 */
+			avail_caps |= VMX_CAP_APICV_X2APIC;
+
+			error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+			    MSR_VMX_TRUE_PINBASED_CTLS,
+			    PINBASED_POSTED_INTERRUPT, 0, &tmp);
+			if (error == 0) {
+				/*
+				 * If the PSM-provided interfaces for requesting
+				 * and using a PIR IPI vector are present, use
+				 * them for posted interrupts.
+				 */
+				if (psm_get_pir_ipivect != NULL &&
+				    psm_send_pir_ipi != NULL) {
+					pirvec = psm_get_pir_ipivect();
+					avail_caps |= VMX_CAP_APICV_PIR;
 				}
-			} else {
-				posted_interrupts = 1;
-				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
-				    &posted_interrupts);
 			}
-#else
-			/*
-			 * If the PSM-provided interfaces for requesting and
-			 * using a PIR IPI vector are present, use them for
-			 * posted interrupts.
-			 */
-			if (psm_get_pir_ipivect != NULL &&
-			    psm_send_pir_ipi != NULL) {
-				pirvec = psm_get_pir_ipivect();
-				posted_interrupts = 1;
-			}
-#endif
 		}
 	}
 
-	if (posted_interrupts)
-		    pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
-
 	/* Initialize EPT */
 	error = ept_init(ipinum);
 	if (error) {
@@ -962,11 +794,10 @@
 	cr0_zeros_mask = ~fixed0 & ~fixed1;
 
 	/*
-	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
-	 * if unrestricted guest execution is allowed.
+	 * Since Unrestricted Guest was already verified present, CR0_PE and
+	 * CR0_PG are allowed to be set to zero in VMX non-root operation
 	 */
-	if (cap_unrestricted_guest)
-		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+	cr0_ones_mask &= ~(CR0_PG | CR0_PE);
 
 	/*
 	 * Do not allow the guest to set CR0_NW or CR0_CD.
@@ -978,17 +809,9 @@
 	cr4_ones_mask = fixed0 & fixed1;
 	cr4_zeros_mask = ~fixed0 & ~fixed1;
 
-#ifdef __FreeBSD__
-	vpid_init();
-#endif
-
 	vmx_msr_init();
 
-#ifdef __FreeBSD__
-	/* enable VMX operation */
-	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
-#endif
-
+	vmx_capabilities = avail_caps;
 	vmx_initialized = 1;
 
 	return (0);
@@ -1065,6 +888,7 @@
 	struct vmcs *vmcs;
 	uint32_t exc_bitmap;
 	uint16_t maxcpus;
+	uint32_t proc_ctls, proc2_ctls, pin_ctls;
 
 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
 	if ((uintptr_t)vmx & PAGE_MASK) {
@@ -1119,16 +943,38 @@
 
 	vpid_alloc(vpid, VM_MAXCPU);
 
-	if (virtual_interrupt_delivery) {
+	/* Grab the established defaults */
+	proc_ctls = procbased_ctls;
+	proc2_ctls = procbased_ctls2;
+	pin_ctls = pinbased_ctls;
+	/* For now, default to the available capabilities */
+	vmx->vmx_caps = vmx_capabilities;
+
+	if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) {
+		proc_ctls |= PROCBASED_USE_TPR_SHADOW;
+		proc_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
+		proc_ctls &= ~PROCBASED_CR8_STORE_EXITING;
+	}
+	if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
+		ASSERT(vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW));
+
+		proc2_ctls |= (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
+		    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
+		    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
+
 		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
 		    APIC_ACCESS_ADDRESS);
 		/* XXX this should really return an error to the caller */
 		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
 	}
+	if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) {
+		ASSERT(vmx_cap_en(vmx, VMX_CAP_APICV));
+
+		pin_ctls |= PINBASED_POSTED_INTERRUPT;
+	}
 
 	maxcpus = vm_get_maxcpus(vm);
 	for (i = 0; i < maxcpus; i++) {
-#ifndef __FreeBSD__
 		/*
 		 * Cache physical address lookups for various components which
 		 * may be required inside the critical_enter() section implied
@@ -1137,13 +983,10 @@
 		vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap);
 		vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]);
 		vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]);
-#endif /* __FreeBSD__ */
 
 		vmcs = &vmx->vmcs[i];
 		vmcs->identifier = vmx_revision();
-#ifndef __FreeBSD__
 		vmcs->vmcs_pa = (uint64_t)vtophys(vmcs);
-#endif
 		error = vmclear(vmcs);
 		if (error != 0) {
 			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
@@ -1157,25 +1000,14 @@
 
 		VMPTRLD(vmcs);
 		error = 0;
-#ifdef __FreeBSD__
-		/*
-		 * The illumos vmx_enter_guest implementation avoids some of
-		 * the %rsp-manipulation games which are present in the stock
-		 * one from FreeBSD.
-		 */
-		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
-#endif
+
 		error += vmwrite(VMCS_EPTP, vmx->eptp);
-		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
-		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
-		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
+		error += vmwrite(VMCS_PIN_BASED_CTLS, pin_ctls);
+		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, proc_ctls);
+		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, proc2_ctls);
 		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
-#ifdef __FreeBSD__
-		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
-#else
 		error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa);
-#endif
 		error += vmwrite(VMCS_VPID, vpid[i]);
 
 		if (guest_l1d_flush && !guest_l1d_flush_sw) {
@@ -1197,37 +1029,27 @@
 		vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
 		error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
 
-		if (tpr_shadowing) {
-#ifdef __FreeBSD__
-			error += vmwrite(VMCS_VIRTUAL_APIC,
-			    vtophys(&vmx->apic_page[i]));
-#else
+		if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) {
 			error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa);
-#endif
 		}
 
-		if (virtual_interrupt_delivery) {
+		if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 			error += vmwrite(VMCS_EOI_EXIT0, 0);
 			error += vmwrite(VMCS_EOI_EXIT1, 0);
 			error += vmwrite(VMCS_EOI_EXIT2, 0);
 			error += vmwrite(VMCS_EOI_EXIT3, 0);
 		}
-		if (posted_interrupts) {
+		if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) {
 			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
-#ifdef __FreeBSD__
-			error += vmwrite(VMCS_PIR_DESC,
-			    vtophys(&vmx->pir_desc[i]));
-#else
 			error += vmwrite(VMCS_PIR_DESC, pir_desc_pa);
-#endif
 		}
 		VMCLEAR(vmcs);
 		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
 
 		vmx->cap[i].set = 0;
-		vmx->cap[i].proc_ctls = procbased_ctls;
-		vmx->cap[i].proc_ctls2 = procbased_ctls2;
+		vmx->cap[i].proc_ctls = proc_ctls;
+		vmx->cap[i].proc_ctls2 = proc2_ctls;
 		vmx->cap[i].exc_bitmap = exc_bitmap;
 
 		vmx->state[i].nextrip = ~0;
@@ -1536,7 +1358,6 @@
 #endif
 }
 
-#ifndef __FreeBSD__
 static void
 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
     uint64_t guestrip)
@@ -1640,7 +1461,7 @@
 		 */
 		KASSERT(vector >= 0 && vector <= 255,
 		    ("invalid vector %d from INTR", vector));
-	} else if (!virtual_interrupt_delivery) {
+	} else if (!vmx_cap_en(vmx, VMX_CAP_APICV)) {
 		/* Ask the local apic for a vector to inject */
 		if (!vlapic_pending_intr(vlapic, &vector))
 			return;
@@ -1715,197 +1536,6 @@
 	 */
 	vmx_set_int_window_exiting(vmx, vcpu);
 }
-#else
-static void
-vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
-    uint64_t guestrip)
-{
-	int vector, need_nmi_exiting, extint_pending;
-	uint64_t rflags, entryinfo;
-	uint32_t gi, info;
-
-	vlapic_tmr_update(vlapic);
-
-	if (vmx->state[vcpu].nextrip != guestrip) {
-		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
-		if (gi & HWINTR_BLOCKING) {
-			VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
-			    "cleared due to rip change: %#lx/%#lx",
-			    vmx->state[vcpu].nextrip, guestrip);
-			gi &= ~HWINTR_BLOCKING;
-			vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
-		}
-	}
-
-	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
-		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
-		    "intinfo is not valid: %#lx", __func__, entryinfo));
-
-		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
-		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
-		     "pending exception: %#lx/%#x", __func__, entryinfo, info));
-
-		info = entryinfo;
-		vector = info & 0xff;
-		if (vector == IDT_BP || vector == IDT_OF) {
-			/*
-			 * VT-x requires #BP and #OF to be injected as software
-			 * exceptions.
-			 */
-			info &= ~VMCS_INTR_T_MASK;
-			info |= VMCS_INTR_T_SWEXCEPTION;
-		}
-
-		if (info & VMCS_INTR_DEL_ERRCODE)
-			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
-
-		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
-	}
-
-	if (vm_nmi_pending(vmx->vm, vcpu)) {
-		/*
-		 * If there are no conditions blocking NMI injection then
-		 * inject it directly here otherwise enable "NMI window
-		 * exiting" to inject it as soon as we can.
-		 *
-		 * We also check for STI_BLOCKING because some implementations
-		 * don't allow NMI injection in this case. If we are running
-		 * on a processor that doesn't have this restriction it will
-		 * immediately exit and the NMI will be injected in the
-		 * "NMI window exiting" handler.
-		 */
-		need_nmi_exiting = 1;
-		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
-		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
-			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
-			if ((info & VMCS_INTR_VALID) == 0) {
-				vmx_inject_nmi(vmx, vcpu);
-				need_nmi_exiting = 0;
-			} else {
-				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
-				    "due to VM-entry intr info %#x", info);
-			}
-		} else {
-			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
-			    "Guest Interruptibility-state %#x", gi);
-		}
-
-		if (need_nmi_exiting)
-			vmx_set_nmi_window_exiting(vmx, vcpu);
-	}
-
-	extint_pending = vm_extint_pending(vmx->vm, vcpu);
-
-	if (!extint_pending && virtual_interrupt_delivery) {
-		vmx_inject_pir(vlapic);
-		return;
-	}
-
-	/*
-	 * If interrupt-window exiting is already in effect then don't bother
-	 * checking for pending interrupts. This is just an optimization and
-	 * not needed for correctness.
-	 */
-	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
-		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
-		    "pending int_window_exiting");
-		return;
-	}
-
-	if (!extint_pending) {
-		/* Ask the local apic for a vector to inject */
-		if (!vlapic_pending_intr(vlapic, &vector))
-			return;
-
-		/*
-		 * From the Intel SDM, Volume 3, Section "Maskable
-		 * Hardware Interrupts":
-		 * - maskable interrupt vectors [16,255] can be delivered
-		 *   through the local APIC.
-		*/
-		KASSERT(vector >= 16 && vector <= 255,
-		    ("invalid vector %d from local APIC", vector));
-	} else {
-		/* Ask the legacy pic for a vector to inject */
-		vatpic_pending_intr(vmx->vm, &vector);
-
-		/*
-		 * From the Intel SDM, Volume 3, Section "Maskable
-		 * Hardware Interrupts":
-		 * - maskable interrupt vectors [0,255] can be delivered
-		 *   through the INTR pin.
-		 */
-		KASSERT(vector >= 0 && vector <= 255,
-		    ("invalid vector %d from INTR", vector));
-	}
-
-	/* Check RFLAGS.IF and the interruptibility state of the guest */
-	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
-	if ((rflags & PSL_I) == 0) {
-		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
-		    "rflags %#lx", vector, rflags);
-		goto cantinject;
-	}
-
-	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
-	if (gi & HWINTR_BLOCKING) {
-		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
-		    "Guest Interruptibility-state %#x", vector, gi);
-		goto cantinject;
-	}
-
-	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
-	if (info & VMCS_INTR_VALID) {
-		/*
-		 * This is expected and could happen for multiple reasons:
-		 * - A vectoring VM-entry was aborted due to astpending
-		 * - A VM-exit happened during event injection.
-		 * - An exception was injected above.
-		 * - An NMI was injected above or after "NMI window exiting"
-		 */
-		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
-		    "VM-entry intr info %#x", vector, info);
-		goto cantinject;
-	}
-
-	/* Inject the interrupt */
-	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
-	info |= vector;
-	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
-
-	if (!extint_pending) {
-		/* Update the Local APIC ISR */
-		vlapic_intr_accepted(vlapic, vector);
-	} else {
-		vm_extint_clear(vmx->vm, vcpu);
-		vatpic_intr_accepted(vmx->vm, vector);
-
-		/*
-		 * After we accepted the current ExtINT the PIC may
-		 * have posted another one.  If that is the case, set
-		 * the Interrupt Window Exiting execution control so
-		 * we can inject that one too.
-		 *
-		 * Also, interrupt window exiting allows us to inject any
-		 * pending APIC vector that was preempted by the ExtINT
-		 * as soon as possible. This applies both for the software
-		 * emulated vlapic and the hardware assisted virtual APIC.
-		 */
-		vmx_set_int_window_exiting(vmx, vcpu);
-	}
-
-	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
-
-	return;
-
-cantinject:
-	/*
-	 * Set the Interrupt Window Exiting execution control so we can inject
-	 * the interrupt as soon as blocking condition goes away.
-	 */
-	vmx_set_int_window_exiting(vmx, vcpu);
-}
-#endif /* __FreeBSD__ */
 
 /*
  * If the Virtual NMIs execution control is '1' then the logical processor
@@ -2830,11 +2460,12 @@
 		SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_HLT;
 		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
-		if (virtual_interrupt_delivery)
+		if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
 			vmexit->u.hlt.intr_status =
 			    vmcs_read(VMCS_GUEST_INTR_STATUS);
-		else
+		} else {
 			vmexit->u.hlt.intr_status = 0;
+		}
 		break;
 	case EXIT_REASON_MTF:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@@ -3336,22 +2967,16 @@
 		 *
 		 * The same reasoning applies to the IPI generated by
 		 * pmap_invalidate_ept().
-		 */
-#ifdef __FreeBSD__
-		disable_intr();
-		vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
-#else
-		/*
+		 *
 		 * The bulk of guest interrupt injection is done without
 		 * interrupts disabled on the host CPU.  This is necessary
 		 * since contended mutexes might force the thread to sleep.
 		 */
 		vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
 		disable_intr();
-		if (virtual_interrupt_delivery) {
+		if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
 			vmx_inject_pir(vlapic);
 		}
-#endif /* __FreeBSD__ */
 
 		/*
 		 * Check for vcpu suspension after injecting events because
@@ -3449,12 +3074,15 @@
 #endif
 
 		/*
-		 * If TPR Shadowing is enabled, the TPR Threshold
-		 * must be updated right before entering the guest.
+		 * If TPR Shadowing is enabled, the TPR Threshold must be
+		 * updated right before entering the guest.
 		 */
-		if (tpr_shadowing && !virtual_interrupt_delivery) {
-			if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) {
-				vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic));
+		if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) &&
+		    !vmx_cap_en(vmx, VMX_CAP_APICV)) {
+			if ((vmx->cap[vcpu].proc_ctls &
+			    PROCBASED_USE_TPR_SHADOW) != 0) {
+				vmcs_write(VMCS_TPR_THRESHOLD,
+				    vlapic_get_cr8(vlapic));
 			}
 		}
 
@@ -3812,10 +3440,6 @@
 		if (cap_monitor_trap)
 			ret = 0;
 		break;
-	case VM_CAP_UNRESTRICTED_GUEST:
-		if (cap_unrestricted_guest)
-			ret = 0;
-		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid)
 			ret = 0;
@@ -3876,15 +3500,6 @@
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
-	case VM_CAP_UNRESTRICTED_GUEST:
-		if (cap_unrestricted_guest) {
-			retval = 0;
-			pptr = &vmx->cap[vcpu].proc_ctls2;
-			baseval = *pptr;
-			flag = PROCBASED2_UNRESTRICTED_GUEST;
-			reg = VMCS_SEC_PROC_BASED_CTLS;
-		}
-		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid) {
 			retval = 0;
@@ -4309,21 +3924,21 @@
 	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
 	vlapic_vtx->vmx = vmx;
 
-	if (tpr_shadowing) {
+	if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) {
 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts;
 	}
-
-	if (virtual_interrupt_delivery) {
+	if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
 		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
 		vlapic->ops.pending_intr = vmx_pending_intr;
 		vlapic->ops.intr_accepted = vmx_intr_accepted;
 		vlapic->ops.set_tmr = vmx_set_tmr;
 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid;
+
+		if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) {
+			vlapic->ops.post_intr = vmx_post_intr;
+		}
 	}
 
-	if (posted_interrupts)
-		vlapic->ops.post_intr = vmx_post_intr;
-
 	vlapic_init(vlapic);
 
 	return (vlapic);
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h	Wed Jun 24 20:53:43 2020 +0000
@@ -29,7 +29,17 @@
  */
 
 /*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
  * Copyright 2018 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
  */
 
 #ifndef _VMX_H_
@@ -151,6 +161,7 @@
 	struct vmxcap	cap[VM_MAXCPU];
 	struct vmxstate	state[VM_MAXCPU];
 	uint64_t	eptp;
+	enum vmx_caps	vmx_caps;
 	struct vm	*vm;
 	long		eptgen[MAXCPU];		/* cached pmap->pm_eptgen */
 };
@@ -158,6 +169,12 @@
 CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
 CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
 
+static __inline bool
+vmx_cap_en(const struct vmx *vmx, enum vmx_caps cap)
+{
+	return ((vmx->vmx_caps & cap) == cap);
+}
+
 #define	VMX_GUEST_VMEXIT	0
 #define	VMX_VMRESUME_ERROR	1
 #define	VMX_VMLAUNCH_ERROR	2
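
Note that vmx_cap_en() tests (vmx_caps & cap) == cap, so it also accepts a combined mask and returns true only when every bit in it is set. A small illustration with made-up values (not code from this change):

	/* Suppose a VM's mask is: */
	vmx->vmx_caps = VMX_CAP_TPR_SHADOW | VMX_CAP_APICV;

	vmx_cap_en(vmx, VMX_CAP_APICV);				/* true */
	vmx_cap_en(vmx, VMX_CAP_APICV | VMX_CAP_APICV_PIR);	/* false, PIR bit absent */

This matches the ordering the VMX code relies on: vmx_init() only considers APICv once TPR shadowing is present and posted interrupts only once APICv is present, and vmx_vminit() ASSERTs the same hierarchy.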
--- a/usr/src/uts/i86pc/sys/vmm.h	Tue Jul 21 02:10:58 2020 -0500
+++ b/usr/src/uts/i86pc/sys/vmm.h	Wed Jun 24 20:53:43 2020 +0000
@@ -39,6 +39,7 @@
  *
  * Copyright 2015 Pluribus Networks Inc.
  * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
  */
 
 #ifndef _VMM_H_
@@ -165,12 +166,19 @@
 	VM_CAP_HALT_EXIT,
 	VM_CAP_MTRAP_EXIT,
 	VM_CAP_PAUSE_EXIT,
-	VM_CAP_UNRESTRICTED_GUEST,
 	VM_CAP_ENABLE_INVPCID,
 	VM_CAP_BPT_EXIT,
 	VM_CAP_MAX
 };
 
+enum vmx_caps {
+	VMX_CAP_NONE		= 0,
+	VMX_CAP_TPR_SHADOW	= (1UL << 0),
+	VMX_CAP_APICV		= (1UL << 1),
+	VMX_CAP_APICV_X2APIC	= (1UL << 2),
+	VMX_CAP_APICV_PIR	= (1UL << 3),
+};
+
 enum vm_intr_trigger {
 	EDGE_TRIGGER,
 	LEVEL_TRIGGER