Mercurial > illumos > illumos-gate
changeset 5159:6cdd421a2458
6590353 ancient mtrr crud in i86pc/os/startup.c must die
6604314 lock ordering problem can cause deadlock at bootup
6604381 typo in LEVEL_SIZE call in i86pc's hat_share()
6604444 HCK_PARTIALCKSUM packets from dom0 to domU fail checksum test in domU
6605202 domU panics with 'bad mutex' from a freed data structure after 1h of xmstress
6605536 xvdi_ring_has_unconsumed_responses() panic during suspend
6606142 page_numtopp_alloc() uses page_reclaim() incorrectly.
6606864 xm-test vcpu-disable/02_vcpu-set_stress.py fails to adjust vcpus
6609008 shutdown_req_active usage can hang
author | johnlev |
---|---|
date | Mon, 01 Oct 2007 15:49:09 -0700 |
parents | 2ccc7eeb32f8 |
children | 6a35c54999f3 |
files | usr/src/uts/common/io/vnic/vnic_dev.c usr/src/uts/common/xen/io/xdf.c usr/src/uts/common/xen/io/xenbus_xs.c usr/src/uts/common/xen/io/xnf.c usr/src/uts/common/xen/os/xvdi.c usr/src/uts/i86pc/os/machdep.c usr/src/uts/i86pc/os/mp_startup.c usr/src/uts/i86pc/os/startup.c usr/src/uts/i86pc/vm/hat_i86.c usr/src/uts/i86pc/vm/i86_mmu.c usr/src/uts/i86xpv/os/mp_xen.c usr/src/uts/i86xpv/os/xen_machdep.c usr/src/uts/i86xpv/sys/hypervisor.h usr/src/uts/intel/sys/archsystm.h usr/src/uts/intel/sys/x86_archext.h |
diffstat | 15 files changed, 366 insertions(+), 403 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/io/vnic/vnic_dev.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/common/io/vnic/vnic_dev.c Mon Oct 01 15:49:09 2007 -0700 @@ -1542,9 +1542,6 @@ for (loop = vnic_mac->va_promisc; loop != NULL; loop = loop->vn_promisc_next) { - mblk_t *copy; - uint64_t gen; - if (loop == sender) continue; @@ -1557,15 +1554,22 @@ ASSERT(flow != NULL); if (!flow->vf_is_active) { + mblk_t *copy; + uint64_t gen; + + if ((copy = vnic_copymsg_cksum(mp)) == NULL) + break; + if ((sender != NULL) && + ((copy = vnic_fix_cksum(copy)) == NULL)) + break; + VNIC_FLOW_REFHOLD(flow); gen = vnic_mac->va_promisc_gen; rw_exit(&vnic_mac->va_promisc_lock); - if ((copy = vnic_copymsg_cksum(mp)) != NULL) { - fn_info = vnic_classifier_get_fn_info(flow); - (fn_info->ff_fn)(fn_info->ff_arg1, - fn_info->ff_arg2, copy); - } + fn_info = vnic_classifier_get_fn_info(flow); + (fn_info->ff_fn)(fn_info->ff_arg1, + fn_info->ff_arg2, copy); VNIC_FLOW_REFRELE(flow); rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
--- a/usr/src/uts/common/xen/io/xdf.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/common/xen/io/xdf.c Mon Oct 01 15:49:09 2007 -0700 @@ -511,6 +511,7 @@ { xdf_t *vdp; int instance; + enum xdf_state st; instance = ddi_get_instance(devi); @@ -522,16 +523,25 @@ xvdi_suspend(devi); - /* stop further I/O requests */ mutex_enter(&vdp->xdf_cb_lk); mutex_enter(&vdp->xdf_dev_lk); - vdp->xdf_status = XD_SUSPEND; + st = vdp->xdf_status; + /* change status to stop further I/O requests */ + if (st == XD_READY) + vdp->xdf_status = XD_SUSPEND; mutex_exit(&vdp->xdf_dev_lk); mutex_exit(&vdp->xdf_cb_lk); /* make sure no more I/O responses left in the ring buffer */ - (void) ddi_remove_intr(devi, 0, NULL); - (void) xdf_drain_io(vdp); + if ((st == XD_INIT) || (st == XD_READY)) { + (void) ddi_remove_intr(devi, 0, NULL); + (void) xdf_drain_io(vdp); + /* + * no need to teardown the ring buffer here + * it will be simply re-init'ed during resume when + * we call xvdi_alloc_ring + */ + } if (xdfdebug & SUSRES_DBG) xen_printf("xdf_suspend: SUCCESS\n"); @@ -561,7 +571,7 @@ } mutex_enter(&vdp->xdf_dev_lk); - ASSERT(vdp->xdf_status == XD_SUSPEND); + ASSERT(vdp->xdf_status != XD_READY); vdp->xdf_status = XD_UNKNOWN; mutex_exit(&vdp->xdf_dev_lk);
--- a/usr/src/uts/common/xen/io/xenbus_xs.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/common/xen/io/xenbus_xs.c Mon Oct 01 15:49:09 2007 -0700 @@ -888,22 +888,15 @@ mutex_enter(&watch_events_lock); while (list_empty(&watch_events)) cv_wait(&watch_events_cv, &watch_events_lock); - - mutex_enter(&xenwatch_mutex); - msg = list_head(&watch_events); - if (msg != NULL) - list_remove(&watch_events, msg); + ASSERT(msg != NULL); + list_remove(&watch_events, msg); mutex_exit(&watch_events_lock); - if (msg != NULL) { - msg->un.watch.handle->callback( - msg->un.watch.handle, - (const char **)msg->un.watch.vec, - msg->un.watch.vec_size); - free_stored_msg(msg); - } - + mutex_enter(&xenwatch_mutex); + msg->un.watch.handle->callback(msg->un.watch.handle, + (const char **)msg->un.watch.vec, msg->un.watch.vec_size); + free_stored_msg(msg); mutex_exit(&xenwatch_mutex); } }
--- a/usr/src/uts/common/xen/io/xnf.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/common/xen/io/xnf.c Mon Oct 01 15:49:09 2007 -0700 @@ -820,8 +820,6 @@ if (macp != NULL) mac_free(macp); - (void) xvdi_switch_state(devinfo, XBT_NULL, XenbusStateClosed); - return (DDI_FAILURE); }
--- a/usr/src/uts/common/xen/os/xvdi.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/common/xen/os/xvdi.c Mon Oct 01 15:49:09 2007 -0700 @@ -267,6 +267,13 @@ else (void) snprintf(xsnamebuf, sizeof (xsnamebuf), "%s/%d/%d", xdcp->xs_path_be, domid, vdevnum); + if ((xenbus_read_driver_state(xsname) >= XenbusStateClosing)) { + /* Don't try to init a dev that may be closing */ + mutex_destroy(&pdp->xd_lk); + kmem_free(pdp, sizeof (*pdp)); + ddi_set_parent_data(dip, NULL); + return (DDI_FAILURE); + } pdp->xd_xsdev.nodename = i_ddi_strdup(xsname, KM_SLEEP); pdp->xd_xsdev.devicetype = xdcp->xsdev; @@ -334,6 +341,9 @@ /* Remove any registered watches. */ i_xvdi_rem_watches(dip); + /* tell other end to close */ + (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed); + if (pdp->xd_xsdev.nodename != NULL) kmem_free((char *)(pdp->xd_xsdev.nodename), strlen(pdp->xd_xsdev.nodename) + 1); @@ -683,6 +693,7 @@ char xsnamebuf[TYPICALMAXPATHLEN]; char *type, *node = NULL, *xsname = NULL; unsigned int tlen; + int ret; ASSERT(DEVI_BUSY_OWNED(parent)); @@ -752,13 +763,11 @@ (void) ndi_prop_update_int(DDI_DEV_T_NONE, dip, "vdev", vdev); if (i_ddi_devi_attached(parent)) - /* - * Cleanup happens in xendev_removechild when the - * other end closes or a driver fails to attach. - */ - (void) ndi_devi_online(dip, 0); + ret = ndi_devi_online(dip, 0); else - (void) ndi_devi_bind_driver(dip, 0); + ret = ndi_devi_bind_driver(dip, 0); + if (ret != NDI_SUCCESS) + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); return (dip); }
--- a/usr/src/uts/i86pc/os/machdep.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86pc/os/machdep.c Mon Oct 01 15:49:09 2007 -0700 @@ -171,10 +171,6 @@ void mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb) { -#ifndef __xpv - extern void mtrr_resync(void); -#endif - if (!panicstr) { kpreempt_disable(); affinity_set(CPU_CURRENT); @@ -251,10 +247,6 @@ (void) spl8(); (*psm_shutdownf)(cmd, fcn); -#ifndef __xpv - mtrr_resync(); -#endif - if (fcn == AD_HALT || fcn == AD_POWEROFF) halt((char *)NULL); else
--- a/usr/src/uts/i86pc/os/mp_startup.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86pc/os/mp_startup.c Mon Oct 01 15:49:09 2007 -0700 @@ -1384,11 +1384,10 @@ #ifndef __xpv /* - * We need to Sync MTRR with cpu0's MTRR. We have to do - * this with interrupts disabled. + * Program this cpu's PAT */ - if (x86_feature & X86_MTRR) - mtrr_sync(); + if (x86_feature & X86_PAT) + pat_sync(); #endif /*
--- a/usr/src/uts/i86pc/os/startup.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86pc/os/startup.c Mon Oct 01 15:49:09 2007 -0700 @@ -81,6 +81,7 @@ #include <sys/stack.h> #include <sys/trap.h> #include <sys/fp.h> +#include <vm/kboot_mmu.h> #include <vm/anon.h> #include <vm/as.h> #include <vm/page.h> @@ -112,9 +113,11 @@ #include <sys/cpu_module.h> #include <sys/smbios.h> #include <sys/debug_info.h> +#include <sys/bootinfo.h> #include <sys/ddi_timer.h> #ifdef __xpv + #include <sys/hypervisor.h> #include <sys/xen_mmu.h> #include <sys/evtchn_impl.h> @@ -122,12 +125,12 @@ #include <sys/xpv_panic.h> #include <xen/sys/xenbus_comms.h> #include <xen/public/physdev.h> + extern void xen_late_startup(void); -extern struct xen_evt_data cpu0_evt_data; -#endif -#include <sys/bootinfo.h> -#include <vm/kboot_mmu.h> +struct xen_evt_data cpu0_evt_data; + +#endif /* __xpv */ extern void progressbar_init(void); extern void progressbar_start(void); @@ -1668,9 +1671,9 @@ #ifndef __xpv /* - * Setup MTRR (Memory type range registers) + * Setup Page Attribute Table */ - setup_mtrr(); + pat_sync(); #endif /* @@ -2346,138 +2349,47 @@ #ifndef __xpv /* - * These are MTTR registers supported by P6 + * Solaris adds an entry for Write Combining caching to the PAT */ -static struct mtrrvar mtrrphys_arr[MAX_MTRRVAR]; -static uint64_t mtrr64k, mtrr16k1, mtrr16k2; -static uint64_t mtrr4k1, mtrr4k2, mtrr4k3; -static uint64_t mtrr4k4, mtrr4k5, mtrr4k6; -static uint64_t mtrr4k7, mtrr4k8, mtrrcap; -uint64_t mtrrdef, pat_attr_reg; - -/* - * Disable reprogramming of MTRRs by default. 
- */ -int enable_relaxed_mtrr = 0; +static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE; void -setup_mtrr(void) +pat_sync(void) { - int i, ecx; - int vcnt; - struct mtrrvar *mtrrphys; + ulong_t cr0, cr0_orig, cr4; - if (!(x86_feature & X86_MTRR)) + if (!(x86_feature & X86_PAT)) return; + cr0_orig = cr0 = getcr0(); + cr4 = getcr4(); - mtrrcap = rdmsr(REG_MTRRCAP); - mtrrdef = rdmsr(REG_MTRRDEF); - if (mtrrcap & MTRRCAP_FIX) { - mtrr64k = rdmsr(REG_MTRR64K); - mtrr16k1 = rdmsr(REG_MTRR16K1); - mtrr16k2 = rdmsr(REG_MTRR16K2); - mtrr4k1 = rdmsr(REG_MTRR4K1); - mtrr4k2 = rdmsr(REG_MTRR4K2); - mtrr4k3 = rdmsr(REG_MTRR4K3); - mtrr4k4 = rdmsr(REG_MTRR4K4); - mtrr4k5 = rdmsr(REG_MTRR4K5); - mtrr4k6 = rdmsr(REG_MTRR4K6); - mtrr4k7 = rdmsr(REG_MTRR4K7); - mtrr4k8 = rdmsr(REG_MTRR4K8); - } - if ((vcnt = (mtrrcap & MTRRCAP_VCNTMASK)) > MAX_MTRRVAR) - vcnt = MAX_MTRRVAR; - - for (i = 0, ecx = REG_MTRRPHYSBASE0, mtrrphys = mtrrphys_arr; - i < vcnt - 1; i++, ecx += 2, mtrrphys++) { - mtrrphys->mtrrphys_base = rdmsr(ecx); - mtrrphys->mtrrphys_mask = rdmsr(ecx + 1); - if ((x86_feature & X86_PAT) && enable_relaxed_mtrr) - mtrrphys->mtrrphys_mask &= ~MTRRPHYSMASK_V; - } - if (x86_feature & X86_PAT) { - if (enable_relaxed_mtrr) - mtrrdef = MTRR_TYPE_WB|MTRRDEF_FE|MTRRDEF_E; - pat_attr_reg = PAT_DEFAULT_ATTRIBUTE; + /* disable caching and flush all caches and TLBs */ + cr0 |= CR0_CD; + cr0 &= ~CR0_NW; + setcr0(cr0); + invalidate_cache(); + if (cr4 & CR4_PGE) { + setcr4(cr4 & ~(ulong_t)CR4_PGE); + setcr4(cr4); + } else { + reload_cr3(); } - mtrr_sync(); -} - -/* - * Sync current cpu mtrr with the incore copy of mtrr. - * This function has to be invoked with interrupts disabled - * Currently we do not capture other cpu's. This is invoked on cpu0 - * just after reading /etc/system. - * On other cpu's its invoked from mp_startup(). 
- */ -void -mtrr_sync(void) -{ - uint_t crvalue, cr0_orig; - int vcnt, i, ecx; - struct mtrrvar *mtrrphys; - - cr0_orig = crvalue = getcr0(); - crvalue |= CR0_CD; - crvalue &= ~CR0_NW; - setcr0(crvalue); - invalidate_cache(); - -#if !defined(__xpv) - reload_cr3(); -#endif - if (x86_feature & X86_PAT) - wrmsr(REG_MTRRPAT, pat_attr_reg); + /* add our entry to the PAT */ + wrmsr(REG_PAT, pat_attr_reg); - wrmsr(REG_MTRRDEF, rdmsr(REG_MTRRDEF) & - ~((uint64_t)(uintptr_t)MTRRDEF_E)); - - if (mtrrcap & MTRRCAP_FIX) { - wrmsr(REG_MTRR64K, mtrr64k); - wrmsr(REG_MTRR16K1, mtrr16k1); - wrmsr(REG_MTRR16K2, mtrr16k2); - wrmsr(REG_MTRR4K1, mtrr4k1); - wrmsr(REG_MTRR4K2, mtrr4k2); - wrmsr(REG_MTRR4K3, mtrr4k3); - wrmsr(REG_MTRR4K4, mtrr4k4); - wrmsr(REG_MTRR4K5, mtrr4k5); - wrmsr(REG_MTRR4K6, mtrr4k6); - wrmsr(REG_MTRR4K7, mtrr4k7); - wrmsr(REG_MTRR4K8, mtrr4k8); + /* flush TLBs and cache again, then reenable cr0 caching */ + if (cr4 & CR4_PGE) { + setcr4(cr4 & ~(ulong_t)CR4_PGE); + setcr4(cr4); + } else { + reload_cr3(); } - if ((vcnt = (mtrrcap & MTRRCAP_VCNTMASK)) > MAX_MTRRVAR) - vcnt = MAX_MTRRVAR; - for (i = 0, ecx = REG_MTRRPHYSBASE0, mtrrphys = mtrrphys_arr; - i < vcnt - 1; i++, ecx += 2, mtrrphys++) { - wrmsr(ecx, mtrrphys->mtrrphys_base); - wrmsr(ecx + 1, mtrrphys->mtrrphys_mask); - } - wrmsr(REG_MTRRDEF, mtrrdef); - -#if !defined(__xpv) - reload_cr3(); -#endif invalidate_cache(); setcr0(cr0_orig); } -/* - * resync mtrr so that BIOS is happy. Called from mdboot - */ -void -mtrr_resync(void) -{ - if ((x86_feature & X86_PAT) && enable_relaxed_mtrr) { - /* - * We could have changed the default mtrr definition. - * Put it back to uncached which is what it is at power on - */ - mtrrdef = MTRR_TYPE_UC|MTRRDEF_FE|MTRRDEF_E; - mtrr_sync(); - } -} -#endif +#endif /* !__xpv */ void get_system_configuration(void)
--- a/usr/src/uts/i86pc/vm/hat_i86.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86pc/vm/hat_i86.c Mon Oct 01 15:49:09 2007 -0700 @@ -2893,7 +2893,7 @@ /* * The range of address space must cover a full table. */ - if (e_ism_addr - ism_addr < LEVEL_SIZE(1 + 1)) + if (e_ism_addr - ism_addr < LEVEL_SIZE(l + 1)) goto not_shared; /*
--- a/usr/src/uts/i86pc/vm/i86_mmu.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86pc/vm/i86_mmu.c Mon Oct 01 15:49:09 2007 -0700 @@ -139,11 +139,14 @@ } } - if (!PP_ISFREE(pp) || !page_reclaim(pp, (kmutex_t *)NULL)) { + if (!PP_ISFREE(pp)) { page_unlock(pp); return (NULL); } + if (!page_reclaim(pp, (kmutex_t *)NULL)) + return (NULL); + return (pp); }
--- a/usr/src/uts/i86xpv/os/mp_xen.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86xpv/os/mp_xen.c Mon Oct 01 15:49:09 2007 -0700 @@ -24,6 +24,71 @@ * Use is subject to license terms. */ +/* + * Virtual CPU management. + * + * VCPUs can be controlled in one of two ways; through the domain itself + * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()). + * Unfortunately, the terminology is used in different ways; they work out as + * follows: + * + * P_ONLINE: the VCPU is up and running, taking interrupts and running threads + * + * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the + * hypervisor on the idle thread). It must be up since a downed VCPU cannot + * receive interrupts, and we require this for offline CPUs in Solaris. + * + * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called + * xen_vcpu_down() for it). It can't take interrupts or run anything, though + * if it has run previously, its software state (cpu_t, machcpu structures, IPI + * event channels, etc.) will still exist. + * + * The hypervisor has two notions of CPU states as represented in the store: + * + * "offline": the VCPU is down. Corresponds to P_POWEROFF. + * + * "online": the VCPU is running. Corresponds to a CPU state other than + * P_POWEROFF. + * + * Currently, only a notification via xenstore can bring a CPU into a + * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR, + * P_OFFLINE, etc. We need to be careful to treat xenstore notifications + * idempotently, as we'll get 'duplicate' entries when we resume a domain. + * + * Note that the xenstore configuration is strictly advisory, in that a domain + * can choose to ignore it and still power up a VCPU in the offline state. To + * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is + * ENOTSUP from within Solaris. + * + * Powering off a VCPU and suspending the domain use similar code. 
The + * difficulty here is that we must ensure that each VCPU is in a stable + * state: it must have a saved PCB, and not be responding to interrupts + * (since we are just about to remove its ability to run on a real CPU, + * possibly forever). However, an offline CPU in Solaris can take + * cross-call interrupts, as mentioned, so we must go through a + * two-stage process. First, we use the standard Solaris pause_cpus(). + * This ensures that all CPUs are either in mach_cpu_pause() or + * mach_cpu_idle(), and nothing will cross-call them. + * + * Powered-off-CPUs are already safe, as we own the cpu_lock needed to + * bring them back up, and in state CPU_PHASE_POWERED_OFF. + * + * Running CPUs are spinning in mach_cpu_pause() waiting for either + * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE. + * + * Offline CPUs are either running the idle thread and periodically + * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor. + * + * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as + * poking them to make sure they're not blocked[1]. When every CPU has + * responded by reaching a safe state and setting CPU_PHASE_SAFE, we + * know we can suspend, or power-off a CPU, without problems. + * + * [1] note that we have to repeatedly poke offline CPUs: it's the only + * way to ensure that the CPU doesn't miss the state change before + * dropping into HYPERVISOR_block(). 
+ */ + #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> @@ -35,26 +100,37 @@ #include <sys/machsystm.h> #include <sys/segments.h> #include <sys/cpuvar.h> -#include <sys/psw.h> #include <sys/x86_archext.h> #include <sys/controlregs.h> -#include <vm/as.h> -#include <vm/hat.h> -#include <vm/hat_i86.h> +#include <sys/hypervisor.h> +#include <sys/xpv_panic.h> #include <sys/mman.h> -#include <sys/hypervisor.h> -#include <xen/sys/xenbus_impl.h> -#include <sys/xpv_panic.h> +#include <sys/psw.h> +#include <sys/cpu.h> +#include <sys/sunddi.h> #include <util/sscanf.h> -#include <sys/cpu.h> -#include <asm/cpu.h> +#include <vm/hat_i86.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <xen/public/io/xs_wire.h> +#include <xen/sys/xenbus_impl.h> #include <xen/public/vcpu.h> -#include <xen/public/io/xs_wire.h> + +#define CPU_PHASE_NONE 0 +#define CPU_PHASE_WAIT_SAFE 1 +#define CPU_PHASE_SAFE 2 +#define CPU_PHASE_POWERED_OFF 3 -struct xen_evt_data cpu0_evt_data; /* cpu0's pending event data */ +/* + * We can only poke CPUs during barrier enter 256 times a second at + * most. + */ +#define POKE_TIMEOUT (NANOSEC / 256) static taskq_t *cpu_config_tq; +static int cpu_phase[NCPU]; + static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t); static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *); @@ -352,10 +428,8 @@ } /* - * Restore the context of a CPU during resume. The CPU must either - * have been blocked in cpu_idle() (running the idle thread), if it was - * offline, or inside cpu_pause_thread(). Either way we can restore safely - * from the t_pcb. + * Restore the context of a CPU during resume. This context is always + * inside enter_safe_phase(), below. */ void mach_cpucontext_restore(cpu_t *cp) @@ -390,16 +464,56 @@ ASSERT(err == 0); } +/* + * Reach a point at which the CPU can be safely powered-off or + * suspended. Nothing can wake this CPU out of the loop. 
+ */ +static void +enter_safe_phase(void) +{ + ulong_t flags = intr_clear(); + + if (setjmp(&curthread->t_pcb) == 0) { + cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE; + while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE) + SMT_PAUSE(); + } + + ASSERT(!interrupts_enabled()); + + intr_restore(flags); +} + +/* + * Offline CPUs run this code even under a pause_cpus(), so we must + * check if we need to enter the safe phase. + */ void mach_cpu_idle(void) { if (IN_XPV_PANIC()) { xpv_panic_halt(); } else { - (void) setjmp(&curthread->t_pcb); - CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id); (void) HYPERVISOR_block(); - CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id); + if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE) + enter_safe_phase(); + } +} + +/* + * Spin until either start_cpus() wakes us up, or we get a request to + * enter the safe phase (followed by a later start_cpus()). + */ +void +mach_cpu_pause(volatile char *safe) +{ + *safe = PAUSE_WAIT; + membar_enter(); + + while (*safe != PAUSE_IDLE) { + if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE) + enter_safe_phase(); + SMT_PAUSE(); } } @@ -411,67 +525,6 @@ (void) xen_vcpu_down(CPU->cpu_id); } -void -mach_cpu_pause(volatile char *safe) -{ - ulong_t flags; - - flags = intr_clear(); - - if (setjmp(&curthread->t_pcb) == 0) { - CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id); - /* - * This cpu is now safe. - */ - *safe = PAUSE_WAIT; - membar_enter(); - } - - while (*safe != PAUSE_IDLE) - SMT_PAUSE(); - - CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id); - - intr_restore(flags); -} - -/* - * Virtual CPU management. - * - * VCPUs can be controlled in one of two ways; through the domain itself - * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()). - * Unfortunately, the terminology is used in different ways; they work out as - * follows: - * - * P_ONLINE: the VCPU is up and running, taking interrupts and running threads - * - * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. 
blocked in the - * hypervisor on the idle thread). It must be up since a downed VCPU cannot - * receive interrupts, and we require this for offline CPUs in Solaris. - * - * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called - * xen_vcpu_down() for it). It can't take interrupts or run anything, though - * if it has run previously, its software state (cpu_t, machcpu structures, IPI - * event channels, etc.) will still exist. - * - * The hypervisor has two notions of CPU states as represented in the store: - * - * "offline": the VCPU is down. Corresponds to P_POWEROFF. - * - * "online": the VCPU is running. Corresponds to a CPU state other than - * P_POWEROFF. - * - * Currently, only a notification via xenstore can bring a CPU into a - * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR, - * P_OFFLINE, etc. We need to be careful to treat xenstore notifications - * idempotently, as we'll get 'duplicate' entries when we resume a domain. - * - * Note that the xenstore configuration is strictly advisory, in that a domain - * can choose to ignore it and still power up a VCPU in the offline state. To - * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is - * ENOTSUP from within Solaris. 
- */ - /*ARGSUSED*/ int mp_cpu_poweron(struct cpu *cp) @@ -486,78 +539,122 @@ return (ENOTSUP); } -static int -poweron_vcpu(struct cpu *cp) +void +mp_enter_barrier(void) { - int error; + hrtime_t last_poke_time = 0; + int poke_allowed = 0; + int done = 0; + int i; ASSERT(MUTEX_HELD(&cpu_lock)); - if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) { - printf("poweron_vcpu: vcpu%d is not available!\n", - cp->cpu_id); - return (ENXIO); + pause_cpus(NULL); + + while (!done) { + done = 1; + poke_allowed = 0; + + if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) { + last_poke_time = xpv_gethrtime(); + poke_allowed = 1; + } + + for (i = 0; i < NCPU; i++) { + cpu_t *cp = cpu_get(i); + + if (cp == NULL || cp == CPU) + continue; + + switch (cpu_phase[i]) { + case CPU_PHASE_NONE: + cpu_phase[i] = CPU_PHASE_WAIT_SAFE; + poke_cpu(i); + done = 0; + break; + + case CPU_PHASE_WAIT_SAFE: + if (poke_allowed) + poke_cpu(i); + done = 0; + break; + + case CPU_PHASE_SAFE: + case CPU_PHASE_POWERED_OFF: + break; + } + } + + SMT_PAUSE(); + } +} + +void +mp_leave_barrier(void) +{ + int i; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + for (i = 0; i < NCPU; i++) { + cpu_t *cp = cpu_get(i); + + if (cp == NULL || cp == CPU) + continue; + + switch (cpu_phase[i]) { + /* + * If we see a CPU in one of these phases, something has + * gone badly wrong with the guarantees + * mp_enter_barrier() is supposed to provide. Rather + * than attempt to stumble along (and since we can't + * panic properly in this context), we tell the + * hypervisor we've crashed. + */ + case CPU_PHASE_NONE: + case CPU_PHASE_WAIT_SAFE: + (void) HYPERVISOR_shutdown(SHUTDOWN_crash); + break; + + case CPU_PHASE_POWERED_OFF: + break; + + case CPU_PHASE_SAFE: + cpu_phase[i] = CPU_PHASE_NONE; + } } - if ((error = xen_vcpu_up(cp->cpu_id)) == 0) { - CPUSET_ADD(cpu_ready_set, cp->cpu_id); - cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING; - cp->cpu_flags &= ~CPU_POWEROFF; - /* - * There are some nasty races possible here. 
- * Tell the vcpu it's up one more time. - * XXPV Is this enough? Is this safe? - */ - (void) xen_vcpu_up(cp->cpu_id); - - cpu_set_state(cp); - } - return (error); + start_cpus(); } static int -poweroff_poke(void) -{ - CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id); - return (0); -} - -/* - * We must ensure that the VCPU reaches a safe state (in the suspend set, and - * thus is not going to change) before we can power it off. The VCPU could - * still be in mach_cpu_pause() and about to head back out; so just checking - * cpu_suspend_set() isn't sufficient to make sure the VCPU has stopped moving. - * Instead, we xcall it to delete itself from the set; whichever way it comes - * back from that xcall, it won't mark itself in the set until it's safely back - * in mach_cpu_idle(). - */ -static int poweroff_vcpu(struct cpu *cp) { int error; - cpuset_t set; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(CPU->cpu_id != cp->cpu_id); ASSERT(cp->cpu_flags & CPU_QUIESCED); - CPUSET_ONLY(set, cp->cpu_id); - - xc_sync(0, 0, 0, X_CALL_HIPRI, set, (xc_func_t)poweroff_poke); - - while (!CPU_IN_SET(cpu_suspend_set, cp->cpu_id)) - SMT_PAUSE(); + mp_enter_barrier(); if ((error = xen_vcpu_down(cp->cpu_id)) == 0) { - ASSERT(CPU_IN_SET(cpu_suspend_set, cp->cpu_id)); + ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE); + CPUSET_DEL(cpu_ready_set, cp->cpu_id); + cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE; cp->cpu_flags &= ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE); + cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF; + cpu_set_state(cp); } + + mp_leave_barrier(); + return (error); } @@ -631,6 +728,37 @@ } static int +poweron_vcpu(struct cpu *cp) +{ + int error; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) { + printf("poweron_vcpu: vcpu%d is not available!\n", + cp->cpu_id); + return (ENXIO); + } + + if ((error = xen_vcpu_up(cp->cpu_id)) == 0) { + CPUSET_ADD(cpu_ready_set, cp->cpu_id); + cp->cpu_flags |= CPU_EXISTS | CPU_READY | 
CPU_RUNNING; + cp->cpu_flags &= ~CPU_POWEROFF; + /* + * There are some nasty races possible here. + * Tell the vcpu it's up one more time. + * XXPV Is this enough? Is this safe? + */ + (void) xen_vcpu_up(cp->cpu_id); + + cpu_phase[cp->cpu_id] = CPU_PHASE_NONE; + + cpu_set_state(cp); + } + return (error); +} + +static int vcpu_config_poweron(processorid_t id) { cpu_t *cp;
--- a/usr/src/uts/i86xpv/os/xen_machdep.c Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86xpv/os/xen_machdep.c Mon Oct 01 15:49:09 2007 -0700 @@ -64,7 +64,6 @@ #include <sys/cmn_err.h> #include <sys/trap.h> #include <sys/segments.h> -#include <sys/sunddi.h> /* for ddi_strtoul */ #include <sys/hypervisor.h> #include <sys/xen_mmu.h> #include <sys/machsystm.h> @@ -92,17 +91,6 @@ #include <sys/balloon_impl.h> #include <sys/ddi.h> -/* - * Hypervisor-specific utility routines - these can be invoked from the - * normal control flow. It might be useful to partition these into - * different files, but let's see how it looks before we get too - * carried away with that idea. - */ - -/* - * In the current absence of any useful way to debug domains that are hung - * whilst suspending, we have a more clumsy approach... - */ #ifdef DEBUG #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf #else @@ -110,9 +98,7 @@ #endif int cpr_debug; -cpuset_t cpu_suspend_set; cpuset_t cpu_suspend_lost_set; -volatile int xen_suspending_cpus; static int xen_suspend_debug; void @@ -210,7 +196,6 @@ /* * The list of mfn pages is out of date. Recompute it. - * XXPV: can we race against another suspend call? Think not. */ static void rebuild_mfn_list(void) @@ -248,21 +233,7 @@ SUSPEND_DEBUG("suspend_cpus\n"); - xen_suspending_cpus = 1; - - pause_cpus(NULL); - - SUSPEND_DEBUG("waiting for offline CPUs\n"); - - /* - * For us to proceed safely, all CPUs except the current one must be - * present in cpu_suspend_set. Running CPUs will participate in - * pause_cpus(), and eventually reach mach_cpu_pause(). Powered-off - * VCPUs will already be in the set, again in mach_cpu_pause(). - * Finally, offline CPUs will be sitting in mach_cpu_idle(). 
- */ - while (!CPUSET_ISEQUAL(mp_cpus, cpu_suspend_set)) - SMT_PAUSE(); + mp_enter_barrier(); for (i = 1; i < ncpus; i++) { if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { @@ -279,8 +250,6 @@ { int i; - xen_suspending_cpus = 0; - for (i = 1; i < ncpus; i++) { if (cpu[i] == NULL) continue; @@ -292,7 +261,7 @@ } } - start_cpus(); + mp_leave_barrier(); } /* @@ -573,7 +542,6 @@ } taskq_t *xen_shutdown_tq; -volatile int shutdown_req_active; #define SHUTDOWN_INVALID -1 #define SHUTDOWN_POWEROFF 0 @@ -623,7 +591,6 @@ if (cmd == SHUTDOWN_SUSPEND) { xen_suspend_domain(); - shutdown_req_active = 0; return; } @@ -716,12 +683,6 @@ kmem_free(str, slen); if (shutdown_code != SHUTDOWN_INVALID) { - if (shutdown_code == SHUTDOWN_SUSPEND) { - while (shutdown_req_active) - SMT_PAUSE(); - } - - shutdown_req_active = 1; (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, (void *)(intptr_t)shutdown_code, 0); }
--- a/usr/src/uts/i86xpv/sys/hypervisor.h Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/i86xpv/sys/hypervisor.h Mon Oct 01 15:49:09 2007 -0700 @@ -106,9 +106,10 @@ extern void mach_cpucontext_reset(cpu_t *); extern void mach_cpucontext_restore(cpu_t *); -extern cpuset_t cpu_suspend_set; +extern void mp_enter_barrier(void); +extern void mp_leave_barrier(void); + extern cpuset_t cpu_suspend_lost_set; -extern volatile int xen_suspending_cpus; extern int xen_gdt_setprot(cpu_t *, uint_t); extern int xen_ldt_setprot(user_desc_t *, size_t, uint_t);
--- a/usr/src/uts/intel/sys/archsystm.h Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/intel/sys/archsystm.h Mon Oct 01 15:49:09 2007 -0700 @@ -187,7 +187,7 @@ #define IN_XPV_PANIC() (xpv_panicking > 0) #else extern void setup_mca(void); -extern void setup_mtrr(void); +extern void pat_sync(void); #define cpr_dprintf prom_printf #define IN_XPV_PANIC() (__lintzero) #endif
--- a/usr/src/uts/intel/sys/x86_archext.h Mon Oct 01 15:48:08 2007 -0700 +++ b/usr/src/uts/intel/sys/x86_archext.h Mon Oct 01 15:49:09 2007 -0700 @@ -191,23 +191,8 @@ #define K5_TSC 0x10 #define K5_TR12 0x12 -#define REG_MTRRCAP 0xfe -#define REG_MTRRDEF 0x2ff -#define REG_MTRR64K 0x250 -#define REG_MTRR16K1 0x258 -#define REG_MTRR16K2 0x259 -#define REG_MTRR4K1 0x268 -#define REG_MTRR4K2 0x269 -#define REG_MTRR4K3 0x26a -#define REG_MTRR4K4 0x26b -#define REG_MTRR4K5 0x26c -#define REG_MTRR4K6 0x26d -#define REG_MTRR4K7 0x26e -#define REG_MTRR4K8 0x26f -#define REG_MTRRPAT 0x277 +#define REG_PAT 0x277 -#define REG_MTRRPHYSBASE0 0x200 -#define REG_MTRRPHYSMASK7 0x20f #define REG_MC0_CTL 0x400 #define REG_MC5_MISC 0x417 #define REG_PERFCTR0 0xc1 @@ -285,66 +270,34 @@ #define MCI_CTL_VALUE 0xffffffff -#define MTRRTYPE_MASK 0xff - - -#define MTRRCAP_FIX 0x100 -#define MTRRCAP_VCNTMASK 0xff -#define MTRRCAP_USWC 0x400 - -#define MTRRDEF_E 0x800 -#define MTRRDEF_FE 0x400 - -#define MTRRPHYSMASK_V 0x800 - #define MTRR_TYPE_UC 0 #define MTRR_TYPE_WC 1 #define MTRR_TYPE_WT 4 #define MTRR_TYPE_WP 5 #define MTRR_TYPE_WB 6 +#define MTRR_TYPE_UC_ 7 /* - * Page attribute table is setup in the following way - * PAT0 Write-BACK + * For Solaris we set up the page attribute table in the following way: + * PAT0 Write-Back * PAT1 Write-Through - * PAT2 Unchacheable + * PAT2 Uncacheable- * PAT3 Uncacheable - * PAT4 Uncacheable - * PAT5 Write-Protect + * PAT4 Write-Back + * PAT5 Write-Through * PAT6 Write-Combine * PAT7 Uncacheable + * The only difference from h/w default is entry 6. 
*/ -#define PAT_DEFAULT_ATTRIBUTE \ - ((uint64_t)MTRR_TYPE_WC << 48)|((uint64_t)MTRR_TYPE_WP << 40)| \ - (MTRR_TYPE_WT << 8)|(MTRR_TYPE_WB) - - -#define MTRR_SETTYPE(a, t) ((a &= (uint64_t)~0xff),\ - (a |= ((t) & 0xff))) -#define MTRR_SETVINVALID(a) ((a) &= ~MTRRPHYSMASK_V) - - -#define MTRR_SETVBASE(a, b, t) ((a) =\ - ((((uint64_t)(b)) & 0xffffff000)|\ - (((uint32_t)(t)) & 0xff))) - -#define MTRR_SETVMASK(a, s, v) ((a) =\ - ((~(((uint64_t)(s)) - 1) & 0xffffff000)|\ - (((uint32_t)(v)) << 11))) - -#define MTRR_GETVBASE(a) (((uint64_t)(a)) & 0xffffff000) -#define MTRR_GETVTYPE(a) (((uint64_t)(a)) & 0xff) -#define MTRR_GETVSIZE(a) ((~((uint64_t)(a)) + 1) & 0xffffff000) - - -#define MAX_MTRRVAR 8 - -#if !defined(_ASM) -typedef struct mtrrvar { - uint64_t mtrrphys_base; - uint64_t mtrrphys_mask; -} mtrrvar_t; -#endif /* _ASM */ +#define PAT_DEFAULT_ATTRIBUTE \ + ((uint64_t)MTRR_TYPE_WB | \ + ((uint64_t)MTRR_TYPE_WT << 8) | \ + ((uint64_t)MTRR_TYPE_UC_ << 16) | \ + ((uint64_t)MTRR_TYPE_UC << 24) | \ + ((uint64_t)MTRR_TYPE_WB << 32) | \ + ((uint64_t)MTRR_TYPE_WT << 40) | \ + ((uint64_t)MTRR_TYPE_WC << 48) | \ + ((uint64_t)MTRR_TYPE_UC << 56)) #define X86_LARGEPAGE 0x00000001 #define X86_TSC 0x00000002