changeset 5159:6cdd421a2458

6590353 ancient mtrr crud in i86pc/os/startup.c must die
6604314 lock ordering problem can cause deadlock at bootup
6604381 typo in LEVEL_SIZE call in i86pc's hat_share()
6604444 HCK_PARTIALCKSUM packets from dom0 to domU fail checksum test in domU
6605202 domU panics with 'bad mutex' from a freed data structure after 1h of xmstress
6605536 xvdi_ring_has_unconsumed_responses() panic during suspend
6606142 page_numtopp_alloc() uses page_reclaim() incorrectly.
6606864 xm-test vcpu-disable/02_vcpu-set_stress.py fails to adjust vcpus
6609008 shutdown_req_active usage can hang
author johnlev
date Mon, 01 Oct 2007 15:49:09 -0700
parents 2ccc7eeb32f8
children 6a35c54999f3
files usr/src/uts/common/io/vnic/vnic_dev.c usr/src/uts/common/xen/io/xdf.c usr/src/uts/common/xen/io/xenbus_xs.c usr/src/uts/common/xen/io/xnf.c usr/src/uts/common/xen/os/xvdi.c usr/src/uts/i86pc/os/machdep.c usr/src/uts/i86pc/os/mp_startup.c usr/src/uts/i86pc/os/startup.c usr/src/uts/i86pc/vm/hat_i86.c usr/src/uts/i86pc/vm/i86_mmu.c usr/src/uts/i86xpv/os/mp_xen.c usr/src/uts/i86xpv/os/xen_machdep.c usr/src/uts/i86xpv/sys/hypervisor.h usr/src/uts/intel/sys/archsystm.h usr/src/uts/intel/sys/x86_archext.h
diffstat 15 files changed, 366 insertions(+), 403 deletions(-)
--- a/usr/src/uts/common/io/vnic/vnic_dev.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c	Mon Oct 01 15:49:09 2007 -0700
@@ -1542,9 +1542,6 @@
 	for (loop = vnic_mac->va_promisc;
 	    loop != NULL;
 	    loop = loop->vn_promisc_next) {
-		mblk_t *copy;
-		uint64_t gen;
-
 		if (loop == sender)
 			continue;
 
@@ -1557,15 +1554,22 @@
 		ASSERT(flow != NULL);
 
 		if (!flow->vf_is_active) {
+			mblk_t *copy;
+			uint64_t gen;
+
+			if ((copy = vnic_copymsg_cksum(mp)) == NULL)
+				break;
+			if ((sender != NULL) &&
+			    ((copy = vnic_fix_cksum(copy)) == NULL))
+				break;
+
 			VNIC_FLOW_REFHOLD(flow);
 			gen = vnic_mac->va_promisc_gen;
 			rw_exit(&vnic_mac->va_promisc_lock);
 
-			if ((copy = vnic_copymsg_cksum(mp)) != NULL) {
-				fn_info = vnic_classifier_get_fn_info(flow);
-				(fn_info->ff_fn)(fn_info->ff_arg1,
-				    fn_info->ff_arg2, copy);
-			}
+			fn_info = vnic_classifier_get_fn_info(flow);
+			(fn_info->ff_fn)(fn_info->ff_arg1,
+			    fn_info->ff_arg2, copy);
 
 			VNIC_FLOW_REFRELE(flow);
 			rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
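
The vnic_dev.c hunk above appears to target 6604444: for a non-active (promiscuous) flow the mblk is now copied and, when it came from a local sender, run through vnic_fix_cksum() so HCK_PARTIALCKSUM packets carry a complete checksum; both steps happen before the flow is REFHELD and va_promisc_lock is dropped, and a failed allocation simply breaks out of the loop. The sketch below shows only the 16-bit one's-complement arithmetic (RFC 1071) that any software checksum fix-up ultimately performs; it is not the vnic_fix_cksum() implementation.

#include <stddef.h>
#include <stdint.h>

/*
 * One's-complement sum over a buffer, folded to 16 bits (RFC 1071).
 * 'start' lets a caller feed in a pseudo-header or a partial hardware sum.
 */
static uint16_t
inet_cksum(const void *buf, size_t len, uint32_t start)
{
	const uint16_t *p = buf;
	uint32_t sum = start;

	while (len > 1) {
		sum += *p++;
		len -= 2;
	}
	if (len == 1) {
		uint16_t last = 0;

		*(uint8_t *)&last = *(const uint8_t *)p;	/* pad the odd byte */
		sum += last;
	}

	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);

	return ((uint16_t)~sum);
}
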
--- a/usr/src/uts/common/xen/io/xdf.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/common/xen/io/xdf.c	Mon Oct 01 15:49:09 2007 -0700
@@ -511,6 +511,7 @@
 {
 	xdf_t *vdp;
 	int instance;
+	enum xdf_state st;
 
 	instance = ddi_get_instance(devi);
 
@@ -522,16 +523,25 @@
 
 	xvdi_suspend(devi);
 
-	/* stop further I/O requests */
 	mutex_enter(&vdp->xdf_cb_lk);
 	mutex_enter(&vdp->xdf_dev_lk);
-	vdp->xdf_status = XD_SUSPEND;
+	st = vdp->xdf_status;
+	/* change status to stop further I/O requests */
+	if (st == XD_READY)
+		vdp->xdf_status = XD_SUSPEND;
 	mutex_exit(&vdp->xdf_dev_lk);
 	mutex_exit(&vdp->xdf_cb_lk);
 
 	/* make sure no more I/O responses left in the ring buffer */
-	(void) ddi_remove_intr(devi, 0, NULL);
-	(void) xdf_drain_io(vdp);
+	if ((st == XD_INIT) || (st == XD_READY)) {
+		(void) ddi_remove_intr(devi, 0, NULL);
+		(void) xdf_drain_io(vdp);
+		/*
+		 * no need to teardown the ring buffer here
+		 * it will be simply re-init'ed during resume when
+		 * we call xvdi_alloc_ring
+		 */
+	}
 
 	if (xdfdebug & SUSRES_DBG)
 		xen_printf("xdf_suspend: SUCCESS\n");
@@ -561,7 +571,7 @@
 	}
 
 	mutex_enter(&vdp->xdf_dev_lk);
-	ASSERT(vdp->xdf_status == XD_SUSPEND);
+	ASSERT(vdp->xdf_status != XD_READY);
 	vdp->xdf_status = XD_UNKNOWN;
 	mutex_exit(&vdp->xdf_dev_lk);
 
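In xdf_suspend() above, the previous state is sampled under both locks, the switch to XD_SUSPEND happens only from XD_READY, and the interrupt removal plus ring drain run only if the device actually reached XD_INIT or XD_READY; xdf_resume() relaxes its ASSERT to match. A minimal standalone sketch of that guard, with purely hypothetical names:

#include <pthread.h>

typedef enum { DEV_UNKNOWN, DEV_INIT, DEV_READY, DEV_SUSPENDED } dev_state_t;

struct toy_dev {
	pthread_mutex_t lock;
	dev_state_t state;
};

/* Stubs standing in for interrupt teardown and ring draining. */
static void remove_intr(struct toy_dev *dp) { (void) dp; }
static void drain_ring(struct toy_dev *dp) { (void) dp; }

void
toy_suspend(struct toy_dev *dp)
{
	dev_state_t st;

	(void) pthread_mutex_lock(&dp->lock);
	st = dp->state;
	if (st == DEV_READY)
		dp->state = DEV_SUSPENDED;	/* stop accepting new I/O */
	(void) pthread_mutex_unlock(&dp->lock);

	/* These resources only exist once the device reached DEV_INIT. */
	if (st == DEV_INIT || st == DEV_READY) {
		remove_intr(dp);
		drain_ring(dp);
	}
}
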
--- a/usr/src/uts/common/xen/io/xenbus_xs.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/common/xen/io/xenbus_xs.c	Mon Oct 01 15:49:09 2007 -0700
@@ -888,22 +888,15 @@
 		mutex_enter(&watch_events_lock);
 		while (list_empty(&watch_events))
 			cv_wait(&watch_events_cv, &watch_events_lock);
-
-		mutex_enter(&xenwatch_mutex);
-
 		msg = list_head(&watch_events);
-		if (msg != NULL)
-			list_remove(&watch_events, msg);
+		ASSERT(msg != NULL);
+		list_remove(&watch_events, msg);
 		mutex_exit(&watch_events_lock);
 
-		if (msg != NULL) {
-			msg->un.watch.handle->callback(
-			    msg->un.watch.handle,
-			    (const char **)msg->un.watch.vec,
-			    msg->un.watch.vec_size);
-			free_stored_msg(msg);
-		}
-
+		mutex_enter(&xenwatch_mutex);
+		msg->un.watch.handle->callback(msg->un.watch.handle,
+		    (const char **)msg->un.watch.vec, msg->un.watch.vec_size);
+		free_stored_msg(msg);
 		mutex_exit(&xenwatch_mutex);
 	}
 }
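
The xenbus watch loop above no longer acquires xenwatch_mutex while still holding watch_events_lock: it dequeues the message under watch_events_lock alone, drops that lock, and only then takes xenwatch_mutex to run the callback, which is presumably part of the lock-ordering fix for 6604314. The ASSERT replaces the now-impossible NULL check, since the list was known non-empty under the lock. A toy illustration of the same shape, with hypothetical names:

#include <pthread.h>
#include <stdlib.h>

struct event { struct event *next; void (*cb)(struct event *); };

static struct event *queue;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;	/* cf. watch_events_lock */
static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;	/* cf. xenwatch_mutex */

void
consume_one(void)
{
	struct event *ev;

	/* Dequeue under the queue lock only ... */
	(void) pthread_mutex_lock(&queue_lock);
	ev = queue;
	if (ev != NULL)
		queue = ev->next;
	(void) pthread_mutex_unlock(&queue_lock);

	if (ev == NULL)
		return;

	/* ... then take the callback lock with the queue lock released. */
	(void) pthread_mutex_lock(&cb_lock);
	ev->cb(ev);
	(void) pthread_mutex_unlock(&cb_lock);
	free(ev);
}
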
--- a/usr/src/uts/common/xen/io/xnf.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/common/xen/io/xnf.c	Mon Oct 01 15:49:09 2007 -0700
@@ -820,8 +820,6 @@
 	if (macp != NULL)
 		mac_free(macp);
 
-	(void) xvdi_switch_state(devinfo, XBT_NULL, XenbusStateClosed);
-
 	return (DDI_FAILURE);
 }
 
--- a/usr/src/uts/common/xen/os/xvdi.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/common/xen/os/xvdi.c	Mon Oct 01 15:49:09 2007 -0700
@@ -267,6 +267,13 @@
 	else
 		(void) snprintf(xsnamebuf, sizeof (xsnamebuf),
 		    "%s/%d/%d", xdcp->xs_path_be, domid, vdevnum);
+	if ((xenbus_read_driver_state(xsname) >= XenbusStateClosing)) {
+		/* Don't try to init a dev that may be closing */
+		mutex_destroy(&pdp->xd_lk);
+		kmem_free(pdp, sizeof (*pdp));
+		ddi_set_parent_data(dip, NULL);
+		return (DDI_FAILURE);
+	}
 
 	pdp->xd_xsdev.nodename = i_ddi_strdup(xsname, KM_SLEEP);
 	pdp->xd_xsdev.devicetype = xdcp->xsdev;
@@ -334,6 +341,9 @@
 		/* Remove any registered watches. */
 		i_xvdi_rem_watches(dip);
 
+		/* tell other end to close */
+		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
+
 		if (pdp->xd_xsdev.nodename != NULL)
 			kmem_free((char *)(pdp->xd_xsdev.nodename),
 			    strlen(pdp->xd_xsdev.nodename) + 1);
@@ -683,6 +693,7 @@
 	char xsnamebuf[TYPICALMAXPATHLEN];
 	char *type, *node = NULL, *xsname = NULL;
 	unsigned int tlen;
+	int ret;
 
 	ASSERT(DEVI_BUSY_OWNED(parent));
 
@@ -752,13 +763,11 @@
 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, dip, "vdev", vdev);
 
 	if (i_ddi_devi_attached(parent))
-		/*
-		 * Cleanup happens in xendev_removechild when the
-		 * other end closes or a driver fails to attach.
-		 */
-		(void) ndi_devi_online(dip, 0);
+		ret = ndi_devi_online(dip, 0);
 	else
-		(void) ndi_devi_bind_driver(dip, 0);
+		ret = ndi_devi_bind_driver(dip, 0);
+	if (ret != NDI_SUCCESS)
+		(void) ndi_devi_offline(dip, NDI_DEVI_REMOVE);
 
 	return (dip);
 }
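
The xvdi.c changes above add cleanup in three places: initialisation bails out, freeing what it had already set up, when the backend is at XenbusStateClosing or beyond; the teardown path now tells the other end to close; and a failed ndi_devi_online()/ndi_devi_bind_driver() is followed by an explicit offline-and-remove instead of leaving a half-created node behind. A generic sketch of the unwind-on-failure idiom the early return uses, with hypothetical names:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct node {
	pthread_mutex_t lock;
	char *name;
};

struct node *
node_init(const char *name, int peer_closing)
{
	struct node *np;

	if ((np = calloc(1, sizeof (*np))) == NULL)
		return (NULL);
	if (pthread_mutex_init(&np->lock, NULL) != 0)
		goto fail_free;
	if (peer_closing)			/* peer shutting down: don't attach */
		goto fail_mutex;
	if ((np->name = strdup(name)) == NULL)
		goto fail_mutex;
	return (np);

fail_mutex:
	(void) pthread_mutex_destroy(&np->lock);
fail_free:
	free(np);
	return (NULL);
}
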
--- a/usr/src/uts/i86pc/os/machdep.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86pc/os/machdep.c	Mon Oct 01 15:49:09 2007 -0700
@@ -171,10 +171,6 @@
 void
 mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb)
 {
-#ifndef __xpv
-	extern void mtrr_resync(void);
-#endif
-
 	if (!panicstr) {
 		kpreempt_disable();
 		affinity_set(CPU_CURRENT);
@@ -251,10 +247,6 @@
 	(void) spl8();
 	(*psm_shutdownf)(cmd, fcn);
 
-#ifndef __xpv
-	mtrr_resync();
-#endif
-
 	if (fcn == AD_HALT || fcn == AD_POWEROFF)
 		halt((char *)NULL);
 	else
--- a/usr/src/uts/i86pc/os/mp_startup.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86pc/os/mp_startup.c	Mon Oct 01 15:49:09 2007 -0700
@@ -1384,11 +1384,10 @@
 
 #ifndef __xpv
 	/*
-	 * We need to Sync MTRR with cpu0's MTRR. We have to do
-	 * this with interrupts disabled.
+	 * Program this cpu's PAT
 	 */
-	if (x86_feature & X86_MTRR)
-		mtrr_sync();
+	if (x86_feature & X86_PAT)
+		pat_sync();
 #endif
 
 	/*
--- a/usr/src/uts/i86pc/os/startup.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86pc/os/startup.c	Mon Oct 01 15:49:09 2007 -0700
@@ -81,6 +81,7 @@
 #include <sys/stack.h>
 #include <sys/trap.h>
 #include <sys/fp.h>
+#include <vm/kboot_mmu.h>
 #include <vm/anon.h>
 #include <vm/as.h>
 #include <vm/page.h>
@@ -112,9 +113,11 @@
 #include <sys/cpu_module.h>
 #include <sys/smbios.h>
 #include <sys/debug_info.h>
+#include <sys/bootinfo.h>
 #include <sys/ddi_timer.h>
 
 #ifdef __xpv
+
 #include <sys/hypervisor.h>
 #include <sys/xen_mmu.h>
 #include <sys/evtchn_impl.h>
@@ -122,12 +125,12 @@
 #include <sys/xpv_panic.h>
 #include <xen/sys/xenbus_comms.h>
 #include <xen/public/physdev.h>
+
 extern void xen_late_startup(void);
-extern struct xen_evt_data cpu0_evt_data;
-#endif
 
-#include <sys/bootinfo.h>
-#include <vm/kboot_mmu.h>
+struct xen_evt_data cpu0_evt_data;
+
+#endif /* __xpv */
 
 extern void progressbar_init(void);
 extern void progressbar_start(void);
@@ -1668,9 +1671,9 @@
 
 #ifndef __xpv
 	/*
-	 * Setup MTRR (Memory type range registers)
+	 * Setup Page Attribute Table
 	 */
-	setup_mtrr();
+	pat_sync();
 #endif
 
 	/*
@@ -2346,138 +2349,47 @@
 
 #ifndef __xpv
 /*
- * These are MTTR registers supported by P6
+ * Solaris adds an entry for Write Combining caching to the PAT
  */
-static struct	mtrrvar	mtrrphys_arr[MAX_MTRRVAR];
-static uint64_t mtrr64k, mtrr16k1, mtrr16k2;
-static uint64_t mtrr4k1, mtrr4k2, mtrr4k3;
-static uint64_t mtrr4k4, mtrr4k5, mtrr4k6;
-static uint64_t mtrr4k7, mtrr4k8, mtrrcap;
-uint64_t mtrrdef, pat_attr_reg;
-
-/*
- * Disable reprogramming of MTRRs by default.
- */
-int	enable_relaxed_mtrr = 0;
+static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;
 
 void
-setup_mtrr(void)
+pat_sync(void)
 {
-	int i, ecx;
-	int vcnt;
-	struct	mtrrvar	*mtrrphys;
+	ulong_t	cr0, cr0_orig, cr4;
 
-	if (!(x86_feature & X86_MTRR))
+	if (!(x86_feature & X86_PAT))
 		return;
+	cr0_orig = cr0 = getcr0();
+	cr4 = getcr4();
 
-	mtrrcap = rdmsr(REG_MTRRCAP);
-	mtrrdef = rdmsr(REG_MTRRDEF);
-	if (mtrrcap & MTRRCAP_FIX) {
-		mtrr64k = rdmsr(REG_MTRR64K);
-		mtrr16k1 = rdmsr(REG_MTRR16K1);
-		mtrr16k2 = rdmsr(REG_MTRR16K2);
-		mtrr4k1 = rdmsr(REG_MTRR4K1);
-		mtrr4k2 = rdmsr(REG_MTRR4K2);
-		mtrr4k3 = rdmsr(REG_MTRR4K3);
-		mtrr4k4 = rdmsr(REG_MTRR4K4);
-		mtrr4k5 = rdmsr(REG_MTRR4K5);
-		mtrr4k6 = rdmsr(REG_MTRR4K6);
-		mtrr4k7 = rdmsr(REG_MTRR4K7);
-		mtrr4k8 = rdmsr(REG_MTRR4K8);
-	}
-	if ((vcnt = (mtrrcap & MTRRCAP_VCNTMASK)) > MAX_MTRRVAR)
-		vcnt = MAX_MTRRVAR;
-
-	for (i = 0, ecx = REG_MTRRPHYSBASE0, mtrrphys = mtrrphys_arr;
-	    i <  vcnt - 1; i++, ecx += 2, mtrrphys++) {
-		mtrrphys->mtrrphys_base = rdmsr(ecx);
-		mtrrphys->mtrrphys_mask = rdmsr(ecx + 1);
-		if ((x86_feature & X86_PAT) && enable_relaxed_mtrr)
-			mtrrphys->mtrrphys_mask &= ~MTRRPHYSMASK_V;
-	}
-	if (x86_feature & X86_PAT) {
-		if (enable_relaxed_mtrr)
-			mtrrdef = MTRR_TYPE_WB|MTRRDEF_FE|MTRRDEF_E;
-		pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;
+	/* disable caching and flush all caches and TLBs */
+	cr0 |= CR0_CD;
+	cr0 &= ~CR0_NW;
+	setcr0(cr0);
+	invalidate_cache();
+	if (cr4 & CR4_PGE) {
+		setcr4(cr4 & ~(ulong_t)CR4_PGE);
+		setcr4(cr4);
+	} else {
+		reload_cr3();
 	}
 
-	mtrr_sync();
-}
-
-/*
- * Sync current cpu mtrr with the incore copy of mtrr.
- * This function has to be invoked with interrupts disabled
- * Currently we do not capture other cpu's. This is invoked on cpu0
- * just after reading /etc/system.
- * On other cpu's its invoked from mp_startup().
- */
-void
-mtrr_sync(void)
-{
-	uint_t	crvalue, cr0_orig;
-	int	vcnt, i, ecx;
-	struct	mtrrvar	*mtrrphys;
-
-	cr0_orig = crvalue = getcr0();
-	crvalue |= CR0_CD;
-	crvalue &= ~CR0_NW;
-	setcr0(crvalue);
-	invalidate_cache();
-
-#if !defined(__xpv)
-	reload_cr3();
-#endif
-	if (x86_feature & X86_PAT)
-		wrmsr(REG_MTRRPAT, pat_attr_reg);
+	/* add our entry to the PAT */
+	wrmsr(REG_PAT, pat_attr_reg);
 
-	wrmsr(REG_MTRRDEF, rdmsr(REG_MTRRDEF) &
-	    ~((uint64_t)(uintptr_t)MTRRDEF_E));
-
-	if (mtrrcap & MTRRCAP_FIX) {
-		wrmsr(REG_MTRR64K, mtrr64k);
-		wrmsr(REG_MTRR16K1, mtrr16k1);
-		wrmsr(REG_MTRR16K2, mtrr16k2);
-		wrmsr(REG_MTRR4K1, mtrr4k1);
-		wrmsr(REG_MTRR4K2, mtrr4k2);
-		wrmsr(REG_MTRR4K3, mtrr4k3);
-		wrmsr(REG_MTRR4K4, mtrr4k4);
-		wrmsr(REG_MTRR4K5, mtrr4k5);
-		wrmsr(REG_MTRR4K6, mtrr4k6);
-		wrmsr(REG_MTRR4K7, mtrr4k7);
-		wrmsr(REG_MTRR4K8, mtrr4k8);
+	/* flush TLBs and cache again, then reenable cr0 caching */
+	if (cr4 & CR4_PGE) {
+		setcr4(cr4 & ~(ulong_t)CR4_PGE);
+		setcr4(cr4);
+	} else {
+		reload_cr3();
 	}
-	if ((vcnt = (mtrrcap & MTRRCAP_VCNTMASK)) > MAX_MTRRVAR)
-		vcnt = MAX_MTRRVAR;
-	for (i = 0, ecx = REG_MTRRPHYSBASE0, mtrrphys = mtrrphys_arr;
-	    i <  vcnt - 1; i++, ecx += 2, mtrrphys++) {
-		wrmsr(ecx, mtrrphys->mtrrphys_base);
-		wrmsr(ecx + 1, mtrrphys->mtrrphys_mask);
-	}
-	wrmsr(REG_MTRRDEF, mtrrdef);
-
-#if !defined(__xpv)
-	reload_cr3();
-#endif
 	invalidate_cache();
 	setcr0(cr0_orig);
 }
 
-/*
- * resync mtrr so that BIOS is happy. Called from mdboot
- */
-void
-mtrr_resync(void)
-{
-	if ((x86_feature & X86_PAT) && enable_relaxed_mtrr) {
-		/*
-		 * We could have changed the default mtrr definition.
-		 * Put it back to uncached which is what it is at power on
-		 */
-		mtrrdef = MTRR_TYPE_UC|MTRRDEF_FE|MTRRDEF_E;
-		mtrr_sync();
-	}
-}
-#endif
+#endif /* !__xpv */
 
 void
 get_system_configuration(void)
--- a/usr/src/uts/i86pc/vm/hat_i86.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86pc/vm/hat_i86.c	Mon Oct 01 15:49:09 2007 -0700
@@ -2893,7 +2893,7 @@
 		/*
 		 * The range of address space must cover a full table.
 		 */
-		if (e_ism_addr - ism_addr < LEVEL_SIZE(1 + 1))
+		if (e_ism_addr - ism_addr < LEVEL_SIZE(l + 1))
 			goto not_shared;
 
 		/*
--- a/usr/src/uts/i86pc/vm/i86_mmu.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86pc/vm/i86_mmu.c	Mon Oct 01 15:49:09 2007 -0700
@@ -139,11 +139,14 @@
 		}
 	}
 
-	if (!PP_ISFREE(pp) || !page_reclaim(pp, (kmutex_t *)NULL)) {
+	if (!PP_ISFREE(pp)) {
 		page_unlock(pp);
 		return (NULL);
 	}
 
+	if (!page_reclaim(pp, (kmutex_t *)NULL))
+		return (NULL);
+
 	return (pp);
 }
 
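The i86_mmu.c change (6606142) splits the combined test because, judging from the fix itself, page_reclaim() drops the page lock when it fails, so the old code would then call page_unlock() on a page it no longer had locked. The new code unlocks only in the branch where it still owns the lock and returns straight away after a failed reclaim. A toy model of a "failure consumes the lock" calling convention and its correct caller:

#include <pthread.h>
#include <stdbool.h>

struct toy_page {
	pthread_mutex_t p_lock;
	bool p_free;
};

/*
 * On success: page claimed, lock still held.
 * On failure: returns false and the lock has already been dropped.
 */
static bool
try_reclaim(struct toy_page *pp)
{
	if (!pp->p_free) {
		(void) pthread_mutex_unlock(&pp->p_lock);
		return (false);
	}
	pp->p_free = false;
	return (true);
}

struct toy_page *
grab_free_page(struct toy_page *pp)
{
	(void) pthread_mutex_lock(&pp->p_lock);

	if (!pp->p_free) {			/* we still hold the lock here */
		(void) pthread_mutex_unlock(&pp->p_lock);
		return (NULL);
	}

	if (!try_reclaim(pp))			/* lock already dropped on failure */
		return (NULL);

	return (pp);				/* locked and reclaimed */
}
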
--- a/usr/src/uts/i86xpv/os/mp_xen.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86xpv/os/mp_xen.c	Mon Oct 01 15:49:09 2007 -0700
@@ -24,6 +24,71 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Virtual CPU management.
+ *
+ * VCPUs can be controlled in one of two ways; through the domain itself
+ * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
+ * Unfortunately, the terminology is used in different ways; they work out as
+ * follows:
+ *
+ * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
+ *
+ * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
+ * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
+ * receive interrupts, and we require this for offline CPUs in Solaris.
+ *
+ * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
+ * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
+ * if it has run previously, its software state (cpu_t, machcpu structures, IPI
+ * event channels, etc.) will still exist.
+ *
+ * The hypervisor has two notions of CPU states as represented in the store:
+ *
+ * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
+ *
+ * "online": the VCPU is running.  Corresponds to a CPU state other than
+ * P_POWEROFF.
+ *
+ * Currently, only a notification via xenstore can bring a CPU into a
+ * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
+ * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
+ * idempotently, as we'll get 'duplicate' entries when we resume a domain.
+ *
+ * Note that the xenstore configuration is strictly advisory, in that a domain
+ * can choose to ignore it and still power up a VCPU in the offline state. To
+ * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
+ * ENOTSUP from within Solaris.
+ *
+ * Powering off a VCPU and suspending the domain use similar code. The
+ * difficulty here is that we must ensure that each VCPU is in a stable
+ * state: it must have a saved PCB, and not be responding to interrupts
+ * (since we are just about to remove its ability to run on a real CPU,
+ * possibly forever).  However, an offline CPU in Solaris can take
+ * cross-call interrupts, as mentioned, so we must go through a
+ * two-stage process.  First, we use the standard Solaris pause_cpus().
+ * This ensures that all CPUs are either in mach_cpu_pause() or
+ * mach_cpu_idle(), and nothing will cross-call them.
+ *
+ * Powered-off-CPUs are already safe, as we own the cpu_lock needed to
+ * bring them back up, and in state CPU_PHASE_POWERED_OFF.
+ *
+ * Running CPUs are spinning in mach_cpu_pause() waiting for either
+ * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
+ *
+ * Offline CPUs are either running the idle thread and periodically
+ * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
+ *
+ * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
+ * poking them to make sure they're not blocked[1]. When every CPU has
+ * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
+ * know we can suspend, or power-off a CPU, without problems.
+ *
+ * [1] note that we have to repeatedly poke offline CPUs: it's the only
+ * way to ensure that the CPU doesn't miss the state change before
+ * dropping into HYPERVISOR_block().
+ */
+
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
@@ -35,26 +100,37 @@
 #include <sys/machsystm.h>
 #include <sys/segments.h>
 #include <sys/cpuvar.h>
-#include <sys/psw.h>
 #include <sys/x86_archext.h>
 #include <sys/controlregs.h>
-#include <vm/as.h>
-#include <vm/hat.h>
-#include <vm/hat_i86.h>
+#include <sys/hypervisor.h>
+#include <sys/xpv_panic.h>
 #include <sys/mman.h>
-#include <sys/hypervisor.h>
-#include <xen/sys/xenbus_impl.h>
-#include <sys/xpv_panic.h>
+#include <sys/psw.h>
+#include <sys/cpu.h>
+#include <sys/sunddi.h>
 #include <util/sscanf.h>
-#include <sys/cpu.h>
-#include <asm/cpu.h>
+#include <vm/hat_i86.h>
+#include <vm/hat.h>
+#include <vm/as.h>
 
+#include <xen/public/io/xs_wire.h>
+#include <xen/sys/xenbus_impl.h>
 #include <xen/public/vcpu.h>
-#include <xen/public/io/xs_wire.h>
+
+#define	CPU_PHASE_NONE 0
+#define	CPU_PHASE_WAIT_SAFE 1
+#define	CPU_PHASE_SAFE 2
+#define	CPU_PHASE_POWERED_OFF 3
 
-struct xen_evt_data cpu0_evt_data;		/* cpu0's pending event data */
+/*
+ * We can only poke CPUs during barrier enter 256 times a second at
+ * most.
+ */
+#define	POKE_TIMEOUT (NANOSEC / 256)
 
 static taskq_t *cpu_config_tq;
+static int cpu_phase[NCPU];
+
 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
 
@@ -352,10 +428,8 @@
 }
 
 /*
- * Restore the context of a CPU during resume.  The CPU must either
- * have been blocked in cpu_idle() (running the idle thread), if it was
- * offline, or inside cpu_pause_thread().  Either way we can restore safely
- * from the t_pcb.
+ * Restore the context of a CPU during resume.  This context is always
+ * inside enter_safe_phase(), below.
  */
 void
 mach_cpucontext_restore(cpu_t *cp)
@@ -390,16 +464,56 @@
 	ASSERT(err == 0);
 }
 
+/*
+ * Reach a point at which the CPU can be safely powered-off or
+ * suspended.  Nothing can wake this CPU out of the loop.
+ */
+static void
+enter_safe_phase(void)
+{
+	ulong_t flags = intr_clear();
+
+	if (setjmp(&curthread->t_pcb) == 0) {
+		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
+		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
+			SMT_PAUSE();
+	}
+
+	ASSERT(!interrupts_enabled());
+
+	intr_restore(flags);
+}
+
+/*
+ * Offline CPUs run this code even under a pause_cpus(), so we must
+ * check if we need to enter the safe phase.
+ */
 void
 mach_cpu_idle(void)
 {
 	if (IN_XPV_PANIC()) {
 		xpv_panic_halt();
 	} else  {
-		(void) setjmp(&curthread->t_pcb);
-		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
 		(void) HYPERVISOR_block();
-		CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
+		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
+			enter_safe_phase();
+	}
+}
+
+/*
+ * Spin until either start_cpus() wakes us up, or we get a request to
+ * enter the safe phase (followed by a later start_cpus()).
+ */
+void
+mach_cpu_pause(volatile char *safe)
+{
+	*safe = PAUSE_WAIT;
+	membar_enter();
+
+	while (*safe != PAUSE_IDLE) {
+		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
+			enter_safe_phase();
+		SMT_PAUSE();
 	}
 }
 
@@ -411,67 +525,6 @@
 	(void) xen_vcpu_down(CPU->cpu_id);
 }
 
-void
-mach_cpu_pause(volatile char *safe)
-{
-	ulong_t flags;
-
-	flags = intr_clear();
-
-	if (setjmp(&curthread->t_pcb) == 0) {
-		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
-		/*
-		 * This cpu is now safe.
-		 */
-		*safe = PAUSE_WAIT;
-		membar_enter();
-	}
-
-	while (*safe != PAUSE_IDLE)
-		SMT_PAUSE();
-
-	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
-
-	intr_restore(flags);
-}
-
-/*
- * Virtual CPU management.
- *
- * VCPUs can be controlled in one of two ways; through the domain itself
- * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
- * Unfortunately, the terminology is used in different ways; they work out as
- * follows:
- *
- * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
- *
- * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
- * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
- * receive interrupts, and we require this for offline CPUs in Solaris.
- *
- * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
- * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
- * if it has run previously, its software state (cpu_t, machcpu structures, IPI
- * event channels, etc.) will still exist.
- *
- * The hypervisor has two notions of CPU states as represented in the store:
- *
- * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
- *
- * "online": the VCPU is running.  Corresponds to a CPU state other than
- * P_POWEROFF.
- *
- * Currently, only a notification via xenstore can bring a CPU into a
- * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
- * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
- * idempotently, as we'll get 'duplicate' entries when we resume a domain.
- *
- * Note that the xenstore configuration is strictly advisory, in that a domain
- * can choose to ignore it and still power up a VCPU in the offline state. To
- * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
- * ENOTSUP from within Solaris.
- */
-
 /*ARGSUSED*/
 int
 mp_cpu_poweron(struct cpu *cp)
@@ -486,78 +539,122 @@
 	return (ENOTSUP);
 }
 
-static int
-poweron_vcpu(struct cpu *cp)
+void
+mp_enter_barrier(void)
 {
-	int error;
+	hrtime_t last_poke_time = 0;
+	int poke_allowed = 0;
+	int done = 0;
+	int i;
 
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
-	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
-		printf("poweron_vcpu: vcpu%d is not available!\n",
-		    cp->cpu_id);
-		return (ENXIO);
+	pause_cpus(NULL);
+
+	while (!done) {
+		done = 1;
+		poke_allowed = 0;
+
+		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
+			last_poke_time = xpv_gethrtime();
+			poke_allowed = 1;
+		}
+
+		for (i = 0; i < NCPU; i++) {
+			cpu_t *cp = cpu_get(i);
+
+			if (cp == NULL || cp == CPU)
+				continue;
+
+			switch (cpu_phase[i]) {
+			case CPU_PHASE_NONE:
+				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
+				poke_cpu(i);
+				done = 0;
+				break;
+
+			case CPU_PHASE_WAIT_SAFE:
+				if (poke_allowed)
+					poke_cpu(i);
+				done = 0;
+				break;
+
+			case CPU_PHASE_SAFE:
+			case CPU_PHASE_POWERED_OFF:
+				break;
+			}
+		}
+
+		SMT_PAUSE();
+	}
+}
+
+void
+mp_leave_barrier(void)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	for (i = 0; i < NCPU; i++) {
+		cpu_t *cp = cpu_get(i);
+
+		if (cp == NULL || cp == CPU)
+			continue;
+
+		switch (cpu_phase[i]) {
+		/*
+		 * If we see a CPU in one of these phases, something has
+		 * gone badly wrong with the guarantees
+		 * mp_enter_barrier() is supposed to provide.  Rather
+		 * than attempt to stumble along (and since we can't
+		 * panic properly in this context), we tell the
+		 * hypervisor we've crashed.
+		 */
+		case CPU_PHASE_NONE:
+		case CPU_PHASE_WAIT_SAFE:
+			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
+			break;
+
+		case CPU_PHASE_POWERED_OFF:
+			break;
+
+		case CPU_PHASE_SAFE:
+			cpu_phase[i] = CPU_PHASE_NONE;
+		}
 	}
 
-	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
-		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
-		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
-		cp->cpu_flags &= ~CPU_POWEROFF;
-		/*
-		 * There are some nasty races possible here.
-		 * Tell the vcpu it's up one more time.
-		 * XXPV	Is this enough?  Is this safe?
-		 */
-		(void) xen_vcpu_up(cp->cpu_id);
-
-		cpu_set_state(cp);
-	}
-	return (error);
+	start_cpus();
 }
 
 static int
-poweroff_poke(void)
-{
-	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
-	return (0);
-}
-
-/*
- * We must ensure that the VCPU reaches a safe state (in the suspend set, and
- * thus is not going to change) before we can power it off.  The VCPU could
- * still be in mach_cpu_pause() and about to head back out; so just checking
- * cpu_suspend_set() isn't sufficient to make sure the VCPU has stopped moving.
- * Instead, we xcall it to delete itself from the set; whichever way it comes
- * back from that xcall, it won't mark itself in the set until it's safely back
- * in mach_cpu_idle().
- */
-static int
 poweroff_vcpu(struct cpu *cp)
 {
 	int error;
-	cpuset_t set;
 
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
 	ASSERT(CPU->cpu_id != cp->cpu_id);
 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
 
-	CPUSET_ONLY(set, cp->cpu_id);
-
-	xc_sync(0, 0, 0, X_CALL_HIPRI, set, (xc_func_t)poweroff_poke);
-
-	while (!CPU_IN_SET(cpu_suspend_set, cp->cpu_id))
-		SMT_PAUSE();
+	mp_enter_barrier();
 
 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
-		ASSERT(CPU_IN_SET(cpu_suspend_set, cp->cpu_id));
+		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
+
 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
+
 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
 		cp->cpu_flags &=
 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
 
+		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
+
 		cpu_set_state(cp);
 	}
+
+	mp_leave_barrier();
+
 	return (error);
 }
 
@@ -631,6 +728,37 @@
 }
 
 static int
+poweron_vcpu(struct cpu *cp)
+{
+	int error;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
+		printf("poweron_vcpu: vcpu%d is not available!\n",
+		    cp->cpu_id);
+		return (ENXIO);
+	}
+
+	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
+		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
+		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
+		cp->cpu_flags &= ~CPU_POWEROFF;
+		/*
+		 * There are some nasty races possible here.
+		 * Tell the vcpu it's up one more time.
+		 * XXPV	Is this enough?  Is this safe?
+		 */
+		(void) xen_vcpu_up(cp->cpu_id);
+
+		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
+
+		cpu_set_state(cp);
+	}
+	return (error);
+}
+
+static int
 vcpu_config_poweron(processorid_t id)
 {
 	cpu_t *cp;
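
The rewritten mp_xen.c replaces the cpu_suspend_set/xcall dance with the per-CPU cpu_phase[] protocol described in the block comment: mp_enter_barrier() pauses the CPUs, marks each powered-on CPU CPU_PHASE_WAIT_SAFE and keeps poking it until it parks itself in enter_safe_phase() and reports CPU_PHASE_SAFE, after which a suspend or VCPU power-off can proceed; mp_leave_barrier() releases everyone. A stripped-down user-level model of that phase protocol (no poking, no setjmp, names invented), using C11 atomics:

#include <stdatomic.h>
#include <sched.h>

enum { PHASE_NONE, PHASE_WAIT_SAFE, PHASE_SAFE };

#define	NWORKERS	4
static _Atomic int phase[NWORKERS];

/* Called by a worker when it notices it has been asked to park. */
static void
enter_safe(int id)
{
	atomic_store(&phase[id], PHASE_SAFE);
	while (atomic_load(&phase[id]) == PHASE_SAFE)
		(void) sched_yield();	/* parked until the coordinator releases us */
}

/* Worker main loop step: do work, park on request (cf. mach_cpu_pause/idle). */
void
worker_step(int id)
{
	/* ... normal work ... */
	if (atomic_load(&phase[id]) == PHASE_WAIT_SAFE)
		enter_safe(id);
}

/* Coordinator: ask every worker to park and wait until all report safe. */
void
enter_barrier(void)
{
	int i, done;

	for (i = 0; i < NWORKERS; i++)
		atomic_store(&phase[i], PHASE_WAIT_SAFE);

	do {
		done = 1;
		for (i = 0; i < NWORKERS; i++)
			if (atomic_load(&phase[i]) != PHASE_SAFE)
				done = 0;
		(void) sched_yield();
	} while (!done);
}

void
leave_barrier(void)
{
	int i;

	for (i = 0; i < NWORKERS; i++)
		atomic_store(&phase[i], PHASE_NONE);	/* workers resume */
}
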
--- a/usr/src/uts/i86xpv/os/xen_machdep.c	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86xpv/os/xen_machdep.c	Mon Oct 01 15:49:09 2007 -0700
@@ -64,7 +64,6 @@
 #include <sys/cmn_err.h>
 #include <sys/trap.h>
 #include <sys/segments.h>
-#include <sys/sunddi.h>		/* for ddi_strtoul */
 #include <sys/hypervisor.h>
 #include <sys/xen_mmu.h>
 #include <sys/machsystm.h>
@@ -92,17 +91,6 @@
 #include <sys/balloon_impl.h>
 #include <sys/ddi.h>
 
-/*
- * Hypervisor-specific utility routines - these can be invoked from the
- * normal control flow.  It might be useful to partition these into
- * different files, but let's see how it looks before we get too
- * carried away with that idea.
- */
-
-/*
- * In the current absence of any useful way to debug domains that are hung
- * whilst suspending, we have a more clumsy approach...
- */
 #ifdef DEBUG
 #define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
 #else
@@ -110,9 +98,7 @@
 #endif
 
 int cpr_debug;
-cpuset_t cpu_suspend_set;
 cpuset_t cpu_suspend_lost_set;
-volatile int xen_suspending_cpus;
 static int xen_suspend_debug;
 
 void
@@ -210,7 +196,6 @@
 
 /*
  * The list of mfn pages is out of date.  Recompute it.
- * XXPV: can we race against another suspend call? Think not.
  */
 static void
 rebuild_mfn_list(void)
@@ -248,21 +233,7 @@
 
 	SUSPEND_DEBUG("suspend_cpus\n");
 
-	xen_suspending_cpus = 1;
-
-	pause_cpus(NULL);
-
-	SUSPEND_DEBUG("waiting for offline CPUs\n");
-
-	/*
-	 * For us to proceed safely, all CPUs except the current one must be
-	 * present in cpu_suspend_set.  Running CPUs will participate in
-	 * pause_cpus(), and eventually reach mach_cpu_pause().  Powered-off
-	 * VCPUs will already be in the set, again in mach_cpu_pause().
-	 * Finally, offline CPUs will be sitting in mach_cpu_idle().
-	 */
-	while (!CPUSET_ISEQUAL(mp_cpus, cpu_suspend_set))
-		SMT_PAUSE();
+	mp_enter_barrier();
 
 	for (i = 1; i < ncpus; i++) {
 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
@@ -279,8 +250,6 @@
 {
 	int i;
 
-	xen_suspending_cpus = 0;
-
 	for (i = 1; i < ncpus; i++) {
 		if (cpu[i] == NULL)
 			continue;
@@ -292,7 +261,7 @@
 		}
 	}
 
-	start_cpus();
+	mp_leave_barrier();
 }
 
 /*
@@ -573,7 +542,6 @@
 }
 
 taskq_t *xen_shutdown_tq;
-volatile int shutdown_req_active;
 
 #define	SHUTDOWN_INVALID	-1
 #define	SHUTDOWN_POWEROFF	0
@@ -623,7 +591,6 @@
 
 	if (cmd == SHUTDOWN_SUSPEND) {
 		xen_suspend_domain();
-		shutdown_req_active = 0;
 		return;
 	}
 
@@ -716,12 +683,6 @@
 
 	kmem_free(str, slen);
 	if (shutdown_code != SHUTDOWN_INVALID) {
-		if (shutdown_code == SHUTDOWN_SUSPEND) {
-			while (shutdown_req_active)
-				SMT_PAUSE();
-		}
-
-		shutdown_req_active = 1;
 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
 		    (void *)(intptr_t)shutdown_code, 0);
 	}
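
The xen_machdep.c hunks above remove the shutdown_req_active flag (6609008): the xenstore watcher no longer spins waiting for a previous suspend to finish before dispatching the next request, it simply hands every request to xen_shutdown_tq. Assuming that taskq runs its jobs one at a time (its creation isn't shown in this hunk), the queue already serialises the requests and dispatch never has to block. A toy single-worker queue showing why the hand-rolled flag is unnecessary (not the Solaris taskq API):

#include <pthread.h>
#include <stdlib.h>

struct req { struct req *next; int code; };

static struct req *head, *tail;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;

/* Producer: enqueue and return immediately, never spins on a busy flag. */
void
dispatch(int code)
{
	struct req *r;

	if ((r = malloc(sizeof (*r))) == NULL)
		return;
	r->code = code;
	r->next = NULL;

	(void) pthread_mutex_lock(&qlock);
	if (tail == NULL)
		head = r;
	else
		tail->next = r;
	tail = r;
	(void) pthread_cond_signal(&qcv);
	(void) pthread_mutex_unlock(&qlock);
}

/* Single worker thread: requests are handled strictly one at a time. */
void *
worker(void *arg)
{
	(void) arg;
	for (;;) {
		struct req *r;

		(void) pthread_mutex_lock(&qlock);
		while (head == NULL)
			(void) pthread_cond_wait(&qcv, &qlock);
		r = head;
		head = r->next;
		if (head == NULL)
			tail = NULL;
		(void) pthread_mutex_unlock(&qlock);

		/* handle_shutdown(r->code); */
		free(r);
	}
}
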
--- a/usr/src/uts/i86xpv/sys/hypervisor.h	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/i86xpv/sys/hypervisor.h	Mon Oct 01 15:49:09 2007 -0700
@@ -106,9 +106,10 @@
 extern void mach_cpucontext_reset(cpu_t *);
 extern void mach_cpucontext_restore(cpu_t *);
 
-extern cpuset_t cpu_suspend_set;
+extern void mp_enter_barrier(void);
+extern void mp_leave_barrier(void);
+
 extern cpuset_t cpu_suspend_lost_set;
-extern volatile int xen_suspending_cpus;
 
 extern int xen_gdt_setprot(cpu_t *, uint_t);
 extern int xen_ldt_setprot(user_desc_t *, size_t, uint_t);
--- a/usr/src/uts/intel/sys/archsystm.h	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/intel/sys/archsystm.h	Mon Oct 01 15:49:09 2007 -0700
@@ -187,7 +187,7 @@
 #define	IN_XPV_PANIC() (xpv_panicking > 0)
 #else
 extern void setup_mca(void);
-extern void setup_mtrr(void);
+extern void pat_sync(void);
 #define	cpr_dprintf prom_printf
 #define	IN_XPV_PANIC() (__lintzero)
 #endif
--- a/usr/src/uts/intel/sys/x86_archext.h	Mon Oct 01 15:48:08 2007 -0700
+++ b/usr/src/uts/intel/sys/x86_archext.h	Mon Oct 01 15:49:09 2007 -0700
@@ -191,23 +191,8 @@
 #define	K5_TSC		0x10
 #define	K5_TR12		0x12
 
-#define	REG_MTRRCAP		0xfe
-#define	REG_MTRRDEF		0x2ff
-#define	REG_MTRR64K		0x250
-#define	REG_MTRR16K1		0x258
-#define	REG_MTRR16K2		0x259
-#define	REG_MTRR4K1		0x268
-#define	REG_MTRR4K2		0x269
-#define	REG_MTRR4K3		0x26a
-#define	REG_MTRR4K4		0x26b
-#define	REG_MTRR4K5		0x26c
-#define	REG_MTRR4K6		0x26d
-#define	REG_MTRR4K7		0x26e
-#define	REG_MTRR4K8		0x26f
-#define	REG_MTRRPAT		0x277
+#define	REG_PAT		0x277
 
-#define	REG_MTRRPHYSBASE0	0x200
-#define	REG_MTRRPHYSMASK7	0x20f
 #define	REG_MC0_CTL		0x400
 #define	REG_MC5_MISC		0x417
 #define	REG_PERFCTR0		0xc1
@@ -285,66 +270,34 @@
 
 #define	MCI_CTL_VALUE		0xffffffff
 
-#define	MTRRTYPE_MASK		0xff
-
-
-#define	MTRRCAP_FIX		0x100
-#define	MTRRCAP_VCNTMASK	0xff
-#define	MTRRCAP_USWC		0x400
-
-#define	MTRRDEF_E		0x800
-#define	MTRRDEF_FE		0x400
-
-#define	MTRRPHYSMASK_V		0x800
-
 #define	MTRR_TYPE_UC		0
 #define	MTRR_TYPE_WC		1
 #define	MTRR_TYPE_WT		4
 #define	MTRR_TYPE_WP		5
 #define	MTRR_TYPE_WB		6
+#define	MTRR_TYPE_UC_		7
 
 /*
- * Page attribute table is setup in the following way
- * PAT0	Write-BACK
+ * For Solaris we set up the page attribute table in the following way:
+ * PAT0	Write-Back
  * PAT1	Write-Through
- * PAT2	Unchacheable
+ * PAT2	Uncacheable-
  * PAT3	Uncacheable
- * PAT4 Uncacheable
- * PAT5	Write-Protect
+ * PAT4 Write-Back
+ * PAT5	Write-Through
  * PAT6	Write-Combine
  * PAT7 Uncacheable
+ * The only difference from h/w default is entry 6.
  */
-#define	PAT_DEFAULT_ATTRIBUTE \
-	((uint64_t)MTRR_TYPE_WC << 48)|((uint64_t)MTRR_TYPE_WP << 40)| \
-	(MTRR_TYPE_WT << 8)|(MTRR_TYPE_WB)
-
-
-#define	MTRR_SETTYPE(a, t)	((a &= (uint64_t)~0xff),\
-				    (a |= ((t) & 0xff)))
-#define	MTRR_SETVINVALID(a)	((a) &= ~MTRRPHYSMASK_V)
-
-
-#define	MTRR_SETVBASE(a, b, t)	((a) =\
-					((((uint64_t)(b)) & 0xffffff000)|\
-					(((uint32_t)(t)) & 0xff)))
-
-#define	MTRR_SETVMASK(a, s, v) ((a) =\
-				((~(((uint64_t)(s)) - 1) & 0xffffff000)|\
-					(((uint32_t)(v)) << 11)))
-
-#define	MTRR_GETVBASE(a)	(((uint64_t)(a)) & 0xffffff000)
-#define	MTRR_GETVTYPE(a)	(((uint64_t)(a)) & 0xff)
-#define	MTRR_GETVSIZE(a)	((~((uint64_t)(a)) + 1) & 0xffffff000)
-
-
-#define	MAX_MTRRVAR	8
-
-#if !defined(_ASM)
-typedef	struct	mtrrvar {
-	uint64_t	mtrrphys_base;
-	uint64_t	mtrrphys_mask;
-} mtrrvar_t;
-#endif	/* _ASM */
+#define	PAT_DEFAULT_ATTRIBUTE			\
+	((uint64_t)MTRR_TYPE_WB |		\
+	((uint64_t)MTRR_TYPE_WT << 8) |		\
+	((uint64_t)MTRR_TYPE_UC_ << 16) |	\
+	((uint64_t)MTRR_TYPE_UC << 24) |	\
+	((uint64_t)MTRR_TYPE_WB << 32) |	\
+	((uint64_t)MTRR_TYPE_WT << 40) |	\
+	((uint64_t)MTRR_TYPE_WC << 48) |	\
+	((uint64_t)MTRR_TYPE_UC << 56))
 
 #define	X86_LARGEPAGE	0x00000001
 #define	X86_TSC		0x00000002
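
For reference, each of the eight PAT entries occupies one byte of the 64-bit IA32_PAT MSR (entry 0 in the low byte), and a PTE selects an entry by its PAT/PCD/PWT bits (index = PAT*4 + PCD*2 + PWT), so the Write-Combining slot Solaris places in entry 6 is reached with PAT=1, PCD=1, PWT=0. A small standalone check of the packing performed by PAT_DEFAULT_ATTRIBUTE above:

#include <stdio.h>
#include <stdint.h>

#define	TYPE_UC		0
#define	TYPE_WC		1
#define	TYPE_WT		4
#define	TYPE_WB		6
#define	TYPE_UC_	7	/* "UC-" */

int
main(void)
{
	const uint8_t pat[8] = {
		TYPE_WB, TYPE_WT, TYPE_UC_, TYPE_UC,	/* PAT0..PAT3 */
		TYPE_WB, TYPE_WT, TYPE_WC,  TYPE_UC,	/* PAT4..PAT7 */
	};
	uint64_t reg = 0;
	int i;

	for (i = 0; i < 8; i++)
		reg |= (uint64_t)pat[i] << (i * 8);

	/* Prints 0x1040600070406, the value the macro expands to. */
	(void) printf("IA32_PAT = %#llx\n", (unsigned long long)reg);
	return (0);
}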