changeset 11713:03615b084875

6892591 per-MMU context id domains for sun4v
author Pavel Tatashin <Pavel.Tatashin@Sun.COM>
date Fri, 19 Feb 2010 10:18:21 -0800
parents 3b88ce606c90
children c68907917e95
files usr/src/uts/sfmmu/ml/sfmmu_asm.s usr/src/uts/sfmmu/vm/hat_sfmmu.c usr/src/uts/sfmmu/vm/hat_sfmmu.h usr/src/uts/sun4v/os/fillsysinfo.c usr/src/uts/sun4v/os/mach_descrip.c usr/src/uts/sun4v/os/mach_startup.c usr/src/uts/sun4v/os/suspend.c usr/src/uts/sun4v/sys/mach_descrip.h
diffstat 8 files changed, 511 insertions(+), 65 deletions(-)
--- a/usr/src/uts/sfmmu/ml/sfmmu_asm.s	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sfmmu/ml/sfmmu_asm.s	Fri Feb 19 10:18:21 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -635,6 +635,13 @@
 		
 	! load global mmu_ctxp info
 	ldx	[%o2 + CPU_MMU_CTXP], %o3		! %o3 = mmu_ctx_t ptr
+
+#ifdef sun4v
+	/* During suspend on sun4v, context domains can be temporarily removed */
+	brz,a,pn       %o3, 0f
+	  nop
+#endif
+
         lduw	[%o2 + CPU_MMU_IDX], %g2		! %g2 = mmu index
 
 	! load global mmu_ctxp gnum
@@ -687,6 +694,13 @@
 	! (invalid HAT cnum) && (allocflag == 1)
 	ba,pt	%icc, 2f
 	  nop
+#ifdef sun4v
+0:
+	set	INVALID_CONTEXT, %o1
+	membar	#LoadStore|#StoreStore
+	ba,pt	%icc, 8f
+	  mov   %g0, %g4                ! %g4 = ret = 0
+#endif
 1:
 	! valid HAT cnum, check gnum
 	cmp	%g5, %o4
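
The guard above is the consumer side of the suspend protocol: while a domain
migration is in flight, sfmmu_ctxdoms_remove() leaves CPU_MMU_CTXP == NULL, and
sfmmu_alloc_ctx must fail gracefully instead of dereferencing it.  A rough C
paraphrase of the added branch (illustrative only; the real routine is
hand-written SPARC assembly, and the function below is a hypothetical sketch):

	/*
	 * Sketch of the sun4v early-out: if this CPU currently has no
	 * context domain, report INVALID_CONTEXT and return ret = 0 so
	 * the caller retries once domains are rebuilt.
	 */
	static int
	alloc_ctx_guard_sketch(struct cpu *cp)
	{
		if (CPU_MMU_CTXP(cp) == NULL) {
			/* corresponds to label 0: in the assembly */
			return (0);		/* ret = 0, no ctx allocated */
		}
		/* ... normal path continues with CPU_MMU_IDX(cp) ... */
		return (1);
	}
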
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c	Fri Feb 19 10:18:21 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -532,7 +532,7 @@
 extern void	sfmmu_setup_tsbinfo(sfmmu_t *);
 extern void	sfmmu_clear_utsbinfo(void);
 
-static void	sfmmu_ctx_wrap_around(mmu_ctx_t *);
+static void	sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);
 
 extern int vpm_enable;
 
@@ -1112,19 +1112,11 @@
 	 * a set_platform_defaults() or does not choose to modify
 	 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
 	 *
-	 * For sun4v, there will be one global context domain, this is to
-	 * avoid the ldom cpu substitution problem.
-	 *
 	 * For all platforms that have CPUs sharing MMUs, this
 	 * value must be defined.
 	 */
-	if (max_mmu_ctxdoms == 0) {
-#ifndef sun4v
+	if (max_mmu_ctxdoms == 0)
 		max_mmu_ctxdoms = max_ncpus;
-#else /* sun4v */
-		max_mmu_ctxdoms = 1;
-#endif /* sun4v */
-	}
 
 	size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
 	mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);
@@ -1611,26 +1603,16 @@
  * specify that interface, then the function below is used instead to return
  * default information. The defaults are as follows:
  *
- *	- For sun4u systems there's one MMU context domain per CPU.
- *	  This default is used by all sun4u systems except OPL. OPL systems
- *	  provide platform specific interface to map CPU ids to MMU ids
- *	  because on OPL more than 1 CPU shares a single MMU.
- *        Note that on sun4v, there is one global context domain for
- *	  the entire system. This is to avoid running into potential problem
- *	  with ldom physical cpu substitution feature.
  *	- The number of MMU context IDs supported on any CPU in the
  *	  system is 8K.
+ *	- There is one MMU context domain per CPU.
  */
 /*ARGSUSED*/
 static void
 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
 {
 	infop->mmu_nctxs = nctxs;
-#ifndef sun4v
 	infop->mmu_idx = cpu[cpuid]->cpu_seqid;
-#else /* sun4v */
-	infop->mmu_idx = 0;
-#endif /* sun4v */
 }
 
 /*
@@ -1676,6 +1658,7 @@
 		mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
 	} else {
 		ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
+		ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs);
 	}
 
 	/*
@@ -1693,6 +1676,24 @@
 	mutex_exit(&mmu_ctxp->mmu_lock);
 }
 
+static void
+sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp)
+{
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock));
+
+	mutex_destroy(&mmu_ctxp->mmu_lock);
+
+	if (mmu_ctxp->mmu_kstat)
+		kstat_delete(mmu_ctxp->mmu_kstat);
+
+	/* mmu_saved_gnum is protected by the cpu_lock. */
+	if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
+		mmu_saved_gnum = mmu_ctxp->mmu_gnum;
+
+	kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
+}
+
 /*
  * Called to perform MMU context-related cleanup for a CPU.
  */
@@ -1718,23 +1719,165 @@
 	if (--mmu_ctxp->mmu_ncpus == 0) {
 		mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
 		mutex_exit(&mmu_ctxp->mmu_lock);
-		mutex_destroy(&mmu_ctxp->mmu_lock);
-
-		if (mmu_ctxp->mmu_kstat)
-			kstat_delete(mmu_ctxp->mmu_kstat);
-
-		/* mmu_saved_gnum is protected by the cpu_lock. */
-		if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
-			mmu_saved_gnum = mmu_ctxp->mmu_gnum;
-
-		kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
-
+		sfmmu_ctxdom_free(mmu_ctxp);
 		return;
 	}
 
 	mutex_exit(&mmu_ctxp->mmu_lock);
 }
 
+uint_t
+sfmmu_ctxdom_nctxs(int idx)
+{
+	return (mmu_ctxs_tbl[idx]->mmu_nctxs);
+}
+
+#ifdef sun4v
+/*
+ * sfmmu_ctxdoms_* is an interface provided to help keep context domains
+ * consistent across suspend/resume on a system that can resume on different
+ * hardware than it was suspended on.
+ *
+ * sfmmu_ctxdoms_lock(void) locks all context domains and prevents new contexts
+ * from being allocated.  It acquires all hat_locks, which blocks most access to
+ * context data, except for a few cases that are handled separately or are
+ * harmless.  It wraps each domain to increment gnum and invalidate on-CPU
+ * contexts, and forces cnum to its max.  As a result, all user threads running
+ * on CPUs trap and try to perform a wrap-around, but cannot because the
+ * hat_locks are taken.  Threads not yet on CPUs but started by the scheduler
+ * call sfmmu_alloc_ctx() to acquire a context without checking hat_lock; they
+ * fail because cnum == nctxs, and therefore also trap and block on hat_lock
+ * trying to wrap.  sfmmu_ctxdoms_lock() must be called before CPUs are paused,
+ * else it could deadlock acquiring locks held by paused CPUs.
+ *
+ * sfmmu_ctxdoms_remove() removes the context domain from every CPU and records
+ * the CPUs that had one.  It must be called after CPUs have been paused.  This
+ * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data,
+ * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx
+ * runs with interrupts disabled.  When CPUs are later resumed, they may enter
+ * sfmmu_alloc_ctx, but it will see CPU_MMU_CTXP == NULL and immediately
+ * return failure, or it will block trying to acquire hat_lock.  Thus,
+ * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is
+ * accessing the old context domains.
+ *
+ * sfmmu_ctxdoms_update(void) frees space used by old context domains and
+ * allocates new context domains based on the hardware layout.  It initializes
+ * every CPU that had a context domain before migration to have one again.
+ * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it
+ * could deadlock acquiring locks held by paused CPUs.
+ *
+ * sfmmu_ctxdoms_unlock(void) releases all hat_locks, after which user threads
+ * can acquire new context IDs and continue execution.
+ *
+ * Therefore, the functions should be called in the following order:
+ *       suspend_routine()
+ *		sfmmu_ctxdoms_lock()
+ *		pause_cpus()
+ *		suspend()
+ *			if (suspend failed)
+ *				sfmmu_ctxdoms_unlock()
+ *		...
+ *		sfmmu_ctxdoms_remove()
+ *		resume_cpus()
+ *		sfmmu_ctxdoms_update()
+ *		sfmmu_ctxdoms_unlock()
+ */
+static cpuset_t sfmmu_ctxdoms_pset;
+
+void
+sfmmu_ctxdoms_remove()
+{
+	processorid_t	id;
+	cpu_t		*cp;
+
+	/*
+	 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so their
+	 * domains can be restored post-migration. A CPU may be powered off and
+	 * not have a domain, for example.
+	 */
+	CPUSET_ZERO(sfmmu_ctxdoms_pset);
+
+	for (id = 0; id < NCPU; id++) {
+		if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) {
+			CPUSET_ADD(sfmmu_ctxdoms_pset, id);
+			CPU_MMU_CTXP(cp) = NULL;
+		}
+	}
+}
+
+void
+sfmmu_ctxdoms_lock(void)
+{
+	int		idx;
+	mmu_ctx_t	*mmu_ctxp;
+
+	sfmmu_hat_lock_all();
+
+	/*
+	 * At this point, no thread can be in sfmmu_ctx_wrap_around, because
+	 * hat_lock is always taken before calling it.
+	 *
+	 * For each domain, set mmu_cnum to max so no more contexts can be
+	 * allocated, and wrap to flush on-CPU contexts and force threads to
+	 * acquire a new context when we later drop hat_lock after migration.
+	 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum,
+	 * but the latter uses CAS and will miscompare and not overwrite it.
+	 */
+	kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */
+	for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
+		if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) {
+			mutex_enter(&mmu_ctxp->mmu_lock);
+			mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs;
+			/* make sure updated cnum visible */
+			membar_enter();
+			mutex_exit(&mmu_ctxp->mmu_lock);
+			sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE);
+		}
+	}
+	kpreempt_enable();
+}
+
+void
+sfmmu_ctxdoms_unlock(void)
+{
+	sfmmu_hat_unlock_all();
+}
+
+void
+sfmmu_ctxdoms_update(void)
+{
+	processorid_t	id;
+	cpu_t		*cp;
+	uint_t		idx;
+	mmu_ctx_t	*mmu_ctxp;
+
+	/*
+	 * Free all context domains.  As a side effect, this increases
+	 * mmu_saved_gnum to the maximum gnum over all domains, which is used to
+	 * init gnum in the new domains, which therefore will be larger than the
+	 * sfmmu gnum for any process, guaranteeing that every process will see
+	 * a new generation and allocate a new context regardless of what new
+	 * domain it runs in.
+	 */
+	mutex_enter(&cpu_lock);
+
+	for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
+		if (mmu_ctxs_tbl[idx] != NULL) {
+			mmu_ctxp = mmu_ctxs_tbl[idx];
+			mmu_ctxs_tbl[idx] = NULL;
+			sfmmu_ctxdom_free(mmu_ctxp);
+		}
+	}
+
+	for (id = 0; id < NCPU; id++) {
+		if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) &&
+		    (cp = cpu[id]) != NULL)
+			sfmmu_cpu_init(cp);
+	}
+	mutex_exit(&cpu_lock);
+}
+#endif
+
 /*
  * Hat_setup, makes an address space context the current active one.
  * In sfmmu this translates to setting the secondary context with the
@@ -9745,7 +9888,7 @@
 	 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU.
 	 */
 	if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs)
-		sfmmu_ctx_wrap_around(mmu_ctxp);
+		sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE);
 
 	/*
 	 * Let the MMU set up the page sizes to use for
@@ -9786,7 +9929,7 @@
  * next generation and start from 2.
  */
 static void
-sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp)
+sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum)
 {
 
 	/* caller must have disabled the preemption */
@@ -9820,7 +9963,7 @@
 
 		/* xcall to others on the same MMU to invalidate ctx */
 		cpuset = mmu_ctxp->mmu_cpuset;
-		ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id));
+		ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum);
 		CPUSET_DEL(cpuset, CPU->cpu_id);
 		CPUSET_AND(cpuset, cpu_ready_set);
 
@@ -9857,7 +10000,8 @@
 	}
 
 	/* reset mmu cnum, skips cnum 0 and 1 */
-	mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
+	if (reset_cnum == B_TRUE)
+		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
 
 done:
 	mutex_exit(&mmu_ctxp->mmu_lock);
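
The race noted in sfmmu_ctxdoms_lock() is benign because sfmmu_alloc_ctx only
advances cnum via compare-and-swap.  A minimal sketch of that allocation step,
assuming illustrative names (the real code also folds gnum into the same word):

	/*
	 * Once the lock path stores mmu_cnum = mmu_nctxs, any in-flight
	 * CAS that read an older cnum miscompares and retries, and the
	 * retry observes the "full" domain and falls into the wrap path.
	 */
	static uint_t
	alloc_cnum_sketch(volatile uint_t *cnump, uint_t nctxs)
	{
		uint_t old, new;

		do {
			old = *cnump;
			if (old >= nctxs)
				return (0);	/* full: caller must wrap */
			new = old + 1;
		} while (atomic_cas_32(cnump, old, new) != old);

		return (new);			/* allocated context id */
	}
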
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.h	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.h	Fri Feb 19 10:18:21 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -554,10 +554,10 @@
  *	is protected via CAS.
  * mmu_nctxs
  *	The max number of context IDs supported on every CPU in this
- *	MMU context domain. It is 8K except for Rock where it is 64K.
- *      This is needed here in case the system supports mixed type of
- *      processors/MMUs. It also helps to make ctx switch code access
- *      fewer cache lines i.e. no need to retrieve it from some global nctxs.
+ *	MMU context domain. This is needed here in case the system supports
+ *      mixed types of processors/MMUs. It also helps the ctx switch code
+ *      access fewer cache lines, i.e., there is no need to retrieve it from
+ *      some global nctxs.
  * mmu_lock
  *	The mutex spin lock used to serialize context ID wrap around
  * mmu_idx
@@ -599,6 +599,15 @@
 extern void	sfmmu_cpu_init(cpu_t *);
 extern void	sfmmu_cpu_cleanup(cpu_t *);
 
+extern uint_t	sfmmu_ctxdom_nctxs(int);
+
+#ifdef sun4v
+extern void	sfmmu_ctxdoms_remove(void);
+extern void	sfmmu_ctxdoms_lock(void);
+extern void	sfmmu_ctxdoms_unlock(void);
+extern void	sfmmu_ctxdoms_update(void);
+#endif
+
 /*
  * The following structure is used to get MMU context domain information for
  * a CPU from the platform.
@@ -607,7 +616,6 @@
  *	The MMU context domain index within the global array mmu_ctxs
  * mmu_nctxs
  *	The number of context IDs supported in the MMU context domain
- *	(64K for Rock)
  */
 typedef struct mmu_ctx_info {
 	uint_t		mmu_idx;
@@ -2575,7 +2583,11 @@
 #define	SFMMU_STAT_ADD(stat, amount)	sfmmu_global_stat.stat += (amount)
 #define	SFMMU_STAT_SET(stat, count)	sfmmu_global_stat.stat = (count)
 
-#define	SFMMU_MMU_STAT(stat)		CPU->cpu_m.cpu_mmu_ctxp->stat++
+#define	SFMMU_MMU_STAT(stat)		{		\
+	mmu_ctx_t *ctx = CPU->cpu_m.cpu_mmu_ctxp;	\
+	if (ctx)					\
+		ctx->stat++;				\
+}
 
 #endif /* !_ASM */
 
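
The rewritten SFMMU_MMU_STAT matters because stat updates can now run inside
the suspend window, after sfmmu_ctxdoms_remove() has cleared cpu_mmu_ctxp on
every CPU.  With the NULL check, a caller may bump a per-domain counter
unconditionally, for example (the counter field name here is hypothetical):

	SFMMU_MMU_STAT(sf_ctx_wraps);	/* no-op while domains are removed */
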
--- a/usr/src/uts/sun4v/os/fillsysinfo.c	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sun4v/os/fillsysinfo.c	Fri Feb 19 10:18:21 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1050,3 +1050,244 @@
 
 	md_free_scan_dag(mdp, &platlist);
 }
+
+/*
+ * Number of bits forming a valid context for use in a sun4v TTE and the MMU
+ * context registers. Sun4v defines the default minimum to be 13 bits if this
+ * property is not specified in a cpu node of the machine descriptor graph.
+ */
+#define	MMU_INFO_CTXBITS_MIN		13
+
+/* Convert context bits to number of contexts */
+#define	MMU_INFO_BNCTXS(nbits)		((uint_t)(1u<<(nbits)))
+
+/*
+ * Read machine descriptor and load TLB to CPU mappings.
+ * Returned values: cpuid2pset[NCPU], nctxs[NCPU], md_gen
+ * - cpuid2pset is initialized so it can convert cpuids to processor set of CPUs
+ *   that are shared between TLBs.
+ * - nctxs is initialized to number of contexts for each CPU
+ * - md_gen is set to generation number of machine descriptor from which this
+ *   data was.
+ * Return: zero on success.
+ */
+static int
+load_tlb_cpu_mappings(cpuset_t **cpuid2pset, uint_t *nctxs, uint64_t *md_gen)
+{
+	mde_str_cookie_t cpu_sc, bck_sc;
+	int		tlbs_idx, cp_idx;
+	mde_cookie_t	root;
+	md_t		*mdp = NULL;
+	mde_cookie_t	*tlbs = NULL;
+	mde_cookie_t	*cp = NULL;
+	uint64_t	*cpids = NULL;
+	uint64_t	nbit;
+	int		ntlbs;
+	int		ncp;
+	int		retval = 1;
+	cpuset_t	*ppset;
+
+	/* get MD handle, and string cookies for cpu and back nodes */
+	if ((mdp = md_get_handle()) == NULL ||
+	    (cpu_sc = md_find_name(mdp, "cpu")) == MDE_INVAL_STR_COOKIE ||
+	    (bck_sc = md_find_name(mdp, "back")) == MDE_INVAL_STR_COOKIE)
+		goto cleanup;
+
+	/* set generation number of current MD handle */
+	*md_gen = md_get_gen(mdp);
+
+	/* Find root element, and search for all TLBs in MD */
+	if ((root = md_root_node(mdp)) == MDE_INVAL_ELEM_COOKIE ||
+	    (ntlbs = md_alloc_scan_dag(mdp, root, "tlb", "fwd", &tlbs)) <= 0)
+		goto cleanup;
+
+	cp = kmem_alloc(sizeof (mde_cookie_t) * NCPU, KM_SLEEP);
+	cpids = kmem_alloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
+
+	/*
+	 * Build processor sets, one per possible context domain.  For each tlb,
+	 * search for connected CPUs.  If any CPU is already in a set, then add
+	 * all the TLB's CPUs to that set.  Otherwise, create and populate a new
+	 * pset.  Thus, a single pset is built to represent multiple TLBs if
+	 * they have CPUs in common.
+	 */
+	for (tlbs_idx = 0; tlbs_idx < ntlbs; tlbs_idx++) {
+		ncp = md_scan_dag(mdp, tlbs[tlbs_idx], cpu_sc, bck_sc, cp);
+		if (ncp < 0)
+			goto cleanup;
+		else if (ncp == 0)
+			continue;
+
+		/* Get the id and number of contexts for each cpu */
+		for (cp_idx = 0; cp_idx < ncp; cp_idx++) {
+			mde_cookie_t c = cp[cp_idx];
+
+			if (md_get_prop_val(mdp, c, "id", &cpids[cp_idx]))
+				goto cleanup;
+			if (md_get_prop_val(mdp, c, "mmu-#context-bits", &nbit))
+				nbit = MMU_INFO_CTXBITS_MIN;
+			nctxs[cpids[cp_idx]] = MMU_INFO_BNCTXS(nbit);
+		}
+
+		/*
+		 * If a CPU is already in a set as shown by cpuid2pset[], then
+		 * use that set.
+		 */
+		for (cp_idx = 0; cp_idx < ncp; cp_idx++) {
+			ASSERT(cpids[cp_idx] < NCPU);
+			ppset = cpuid2pset[cpids[cp_idx]];
+			if (ppset != NULL)
+				break;
+		}
+
+		/* No CPU has a set. Create a new one. */
+		if (ppset == NULL) {
+			ppset = kmem_alloc(sizeof (cpuset_t), KM_SLEEP);
+			CPUSET_ZERO(*ppset);
+		}
+
+		/* Add every CPU to the set, and record the set assignment. */
+		for (cp_idx = 0; cp_idx < ncp; cp_idx++) {
+			cpuid2pset[cpids[cp_idx]] = ppset;
+			CPUSET_ADD(*ppset, cpids[cp_idx]);
+		}
+	}
+
+	retval = 0;
+
+cleanup:
+	if (tlbs != NULL)
+		md_free_scan_dag(mdp, &tlbs);
+	if (cp != NULL)
+		kmem_free(cp, sizeof (mde_cookie_t) * NCPU);
+	if (cpids != NULL)
+		kmem_free(cpids, sizeof (uint64_t) * NCPU);
+	if (mdp != NULL)
+		(void) md_fini_handle(mdp);
+
+	return (retval);
+}
+
+/*
+ * Return MMU info based on cpuid.
+ *
+ * Algorithm:
+ * Read the machine descriptor and find all CPUs that share a TLB with the CPU
+ * specified by cpuid. Go through those CPUs and see if any of them already has
+ * an MMU index; if so, set the index based on that value. If the CPU does not
+ * share a TLB with any other CPU, or if none of those CPUs has an mmu_ctx
+ * pointer, find the smallest available MMU index and give it to the current
+ * CPU. If no domain is available, assign domains round robin from the start.
+ *
+ * For optimization reasons, this function caches all TLB to CPU mappings and
+ * updates them only when the machine descriptor graph changes. Because of
+ * this, and because we search the MMU table for the smallest index, this
+ * function must be serialized, which is guaranteed by cpu_lock.
+ */
+void
+plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *info)
+{
+	static cpuset_t	**cpuid2pset = NULL;
+	static uint_t	*nctxs;
+	static uint_t	next_domain = 0;
+	static uint64_t	md_gen = MDESC_INVAL_GEN;
+	uint64_t	current_gen;
+	int		idx;
+	cpuset_t	cpuid_pset;
+	processorid_t	id;
+	cpu_t		*cp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	current_gen = md_get_current_gen();
+
+	/*
+	 * Load TLB CPU mappings only if the MD generation has changed. FW that
+	 * does not provide a generation number always returns MDESC_INVAL_GEN;
+	 * as a result, the MD is read here only once on such machines, when
+	 * cpuid2pset is NULL.
+	 */
+	if (current_gen != md_gen || cpuid2pset == NULL) {
+		if (cpuid2pset == NULL) {
+			cpuid2pset = kmem_zalloc(sizeof (cpuset_t *) * NCPU,
+			    KM_SLEEP);
+			nctxs = kmem_alloc(sizeof (uint_t) * NCPU, KM_SLEEP);
+		} else {
+			/* clean cpuid2pset[NCPU] before loading new values */
+			for (idx = 0; idx < NCPU; idx++) {
+				cpuset_t *pset = cpuid2pset[idx];
+
+				if (pset != NULL) {
+					for (;;) {
+						CPUSET_FIND(*pset, id);
+						if (id == CPUSET_NOTINSET)
+							break;
+						CPUSET_DEL(*pset, id);
+						ASSERT(id < NCPU);
+						cpuid2pset[id] = NULL;
+					}
+					ASSERT(cpuid2pset[idx] == NULL);
+					kmem_free(pset, sizeof (cpuset_t));
+				}
+			}
+		}
+
+		if (load_tlb_cpu_mappings(cpuid2pset, nctxs, &md_gen))
+			goto error_panic;
+	}
+
+	info->mmu_nctxs = nctxs[cpuid];
+
+	if (cpuid2pset[cpuid] == NULL)
+		goto error_panic;
+
+	cpuid_pset = *cpuid2pset[cpuid];
+	CPUSET_DEL(cpuid_pset, cpuid);
+
+	/* Search for a processor in the same TLB pset with MMU context */
+	for (;;) {
+		CPUSET_FIND(cpuid_pset, id);
+
+		if (id == CPUSET_NOTINSET)
+			break;
+
+		ASSERT(id < NCPU);
+		cp = cpu[id];
+		if (cp != NULL && CPU_MMU_CTXP(cp) != NULL) {
+			info->mmu_idx = CPU_MMU_IDX(cp);
+
+			return;
+		}
+		CPUSET_DEL(cpuid_pset, id);
+	}
+
+	/*
+	 * No CPU in the TLB pset has a context domain yet.
+	 * Use next_domain if available, or search for an unused domain, or
+	 * overload next_domain, in that order.  Overloading is necessary when
+	 * the number of TLB psets is greater than max_mmu_ctxdoms.
+	 */
+	idx = next_domain;
+
+	if (mmu_ctxs_tbl[idx] != NULL) {
+		for (idx = 0; idx < max_mmu_ctxdoms; idx++)
+			if (mmu_ctxs_tbl[idx] == NULL)
+				break;
+		if (idx == max_mmu_ctxdoms) {
+			/* overload next_domain */
+			idx = next_domain;
+
+			if (info->mmu_nctxs < sfmmu_ctxdom_nctxs(idx))
+				cmn_err(CE_PANIC, "max_mmu_ctxdoms is too small"
+				    " to support CPUs with different nctxs");
+		}
+	}
+
+	info->mmu_idx = idx;
+	next_domain = (idx + 1) % max_mmu_ctxdoms;
+
+	return;
+
+error_panic:
+	cmn_err(CE_PANIC, "!cpu%d: failed to get MMU CTX domain index", cpuid);
+}
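
Two details of plat_cpuid_to_mmu_ctx_info() are worth spelling out.  First, the
context-bits property converts to a context count as 1 << nbits, so the sun4v
minimum of 13 bits yields MMU_INFO_BNCTXS(13) == 8192 contexts.  Second, the
fallback index assignment at the end follows a three-step policy; below is a
self-contained sketch, with tbl[] and ndoms standing in for mmu_ctxs_tbl and
max_mmu_ctxdoms:

	/*
	 * Prefer next_domain if its slot is free, else the first unused
	 * slot, else overload next_domain.  Overloading happens only when
	 * there are more TLB psets than max_mmu_ctxdoms.
	 */
	static uint_t
	pick_domain_sketch(void **tbl, uint_t ndoms, uint_t *next_domainp)
	{
		uint_t idx = *next_domainp;

		if (tbl[idx] != NULL) {
			for (idx = 0; idx < ndoms; idx++)
				if (tbl[idx] == NULL)
					break;
			if (idx == ndoms)
				idx = *next_domainp;	/* overload */
		}
		*next_domainp = (idx + 1) % ndoms;
		return (idx);
	}
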
--- a/usr/src/uts/sun4v/os/mach_descrip.c	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sun4v/os/mach_descrip.c	Fri Feb 19 10:18:21 2010 -0800
@@ -20,12 +20,10 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Kernel Machine Description (MD)
  *
@@ -861,3 +859,23 @@
 
 	mdp->freep(*list, sizeof (mde_cookie_t) * mdp->node_count);
 }
+
+/*
+ * Return the generation number of the current machine descriptor. It can be
+ * used for performance purposes to avoid requesting a new MD handle just to
+ * see if the graph was updated.
+ */
+uint64_t
+md_get_current_gen(void)
+{
+	uint64_t gen = MDESC_INVAL_GEN;
+
+	mutex_enter(&curr_mach_descrip_lock);
+
+	if (curr_mach_descrip != NULL)
+		gen = (curr_mach_descrip->gen);
+
+	mutex_exit(&curr_mach_descrip_lock);
+
+	return (gen);
+}
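
A typical consumer caches the generation it last acted on and re-reads the MD
only when it changes, exactly as plat_cpuid_to_mmu_ctx_info() does above.  A
minimal sketch of the pattern (cached_state is an illustrative stand-in for
whatever data is derived from the MD):

	static uint64_t	cached_gen = MDESC_INVAL_GEN;
	static void	*cached_state;

	void
	refresh_if_md_changed(void)
	{
		uint64_t gen = md_get_current_gen();

		/*
		 * FW without generation support always reports
		 * MDESC_INVAL_GEN, so also re-read whenever the cache
		 * has never been populated.
		 */
		if (gen != cached_gen || cached_state == NULL) {
			/* ... rebuild cached_state from a fresh handle ... */
			cached_gen = gen;
		}
	}
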
--- a/usr/src/uts/sun4v/os/mach_startup.c	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sun4v/os/mach_startup.c	Fri Feb 19 10:18:21 2010 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -527,3 +527,15 @@
 	/* virtual console concentrator */
 	(void) i_ddi_attach_hw_nodes("vcc");
 }
+
+void
+set_platform_defaults(void)
+{
+	/*
+	 * Allow at most one context domain per 8 CPUs, which is ample for
+	 * good performance.  Do not make this too large, because it
+	 * increases the space consumed in the per-process sfmmu structure.
+	 */
+	if (max_mmu_ctxdoms == 0)
+		max_mmu_ctxdoms = (NCPU + 7) / 8;
+}
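
(NCPU + 7) / 8 is a ceiling division, so the default scales as one domain per
8 possible CPUs: NCPU = 8 yields 1 domain, NCPU = 9 rounds up to 2, and
NCPU = 64 yields 8.
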
--- a/usr/src/uts/sun4v/os/suspend.c	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sun4v/os/suspend.c	Fri Feb 19 10:18:21 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,6 +44,7 @@
 #include <sys/sunddi.h>
 #include <sys/cpupart.h>
 #include <sys/hsvc.h>
+#include <vm/hat_sfmmu.h>
 
 /*
  * Sun4v OS Suspend
@@ -125,10 +126,9 @@
 boolean_t tick_stick_emulation_active = B_FALSE;
 
 /*
- * Controls whether or not MD information is refreshed after a
- * successful suspend and resume. When non-zero, after a successful
- * suspend and resume, the MD will be downloaded, cpunodes updated,
- * and processor grouping information recalculated.
+ * When non-zero, after a successful suspend and resume, cpunodes, CPU HW
+ * sharing data structures, and processor groups will be updated using
+ * information from the updated MD.
  */
 static int suspend_update_cpu_mappings = 1;
 
@@ -243,15 +243,8 @@
 	md_t		*mdp;
 	processorid_t	id;
 	cpu_t		*cp;
-	int		rv;
 	cpu_pg_t	*pgps[NCPU];
 
-	/* Download the latest MD */
-	if ((rv = mach_descrip_update()) != 0) {
-		DBG("suspend: mach_descrip_update error: %d", rv);
-		return;
-	}
-
 	if ((mdp = md_get_handle()) == NULL) {
 		DBG("suspend: md_get_handle failed");
 		return;
@@ -491,6 +484,8 @@
 	ASSERT(suspend_supported());
 	DBG("suspend: %s", __func__);
 
+	sfmmu_ctxdoms_lock();
+
 	mutex_enter(&cpu_lock);
 
 	/* Suspend the watchdog */
@@ -535,6 +530,7 @@
 		start_cpus();
 		watchdog_resume();
 		mutex_exit(&cpu_lock);
+		sfmmu_ctxdoms_unlock();
 		DBG("suspend: failed, rv: %ld\n", rv);
 		return (rv);
 	}
@@ -561,6 +557,8 @@
 		tick_stick_emulation_active = B_TRUE;
 	}
 
+	sfmmu_ctxdoms_remove();
+
 	/* Resume cyclics, unpause CPUs */
 	cyclic_resume();
 	start_cpus();
@@ -575,6 +573,14 @@
 
 	mutex_exit(&cpu_lock);
 
+	/* Download the latest MD */
+	if ((rv = mach_descrip_update()) != 0)
+		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
+		    rv);
+
+	sfmmu_ctxdoms_update();
+	sfmmu_ctxdoms_unlock();
+
 	/* Get new MD, update CPU mappings/relationships */
 	if (suspend_update_cpu_mappings)
 		update_cpu_mappings();
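
Taken together, these hooks bracket the pause/resume window exactly as the
protocol comment in hat_sfmmu.c prescribes.  A condensed sketch of the control
flow (error handling and unrelated steps elided; do_suspend_hcall is a
hypothetical stand-in for the hypervisor suspend call):

	static long
	suspend_sketch(void)
	{
		long rv;

		sfmmu_ctxdoms_lock();	/* wrap domains, block new contexts */
		mutex_enter(&cpu_lock);
		pause_cpus(NULL);	/* nobody left in sfmmu_alloc_ctx */

		if ((rv = do_suspend_hcall()) != 0) {
			start_cpus();
			mutex_exit(&cpu_lock);
			sfmmu_ctxdoms_unlock();
			return (rv);	/* suspend failed; domains intact */
		}

		sfmmu_ctxdoms_remove();	/* detach old domains from CPUs */
		start_cpus();		/* CPUs see CPU_MMU_CTXP == NULL */
		mutex_exit(&cpu_lock);

		(void) mach_descrip_update();	/* fetch post-migration MD */
		sfmmu_ctxdoms_update();	/* free old, build new domains */
		sfmmu_ctxdoms_unlock();	/* threads take fresh contexts */
		return (0);
	}
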
--- a/usr/src/uts/sun4v/sys/mach_descrip.h	Fri Feb 19 10:41:19 2010 -0700
+++ b/usr/src/uts/sun4v/sys/mach_descrip.h	Fri Feb 19 10:18:21 2010 -0800
@@ -20,15 +20,13 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _MACH_DESCRIP_H
 #define	_MACH_DESCRIP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -85,6 +83,7 @@
 extern int md_alloc_scan_dag(md_t *, mde_cookie_t, char *, char *,
 	    mde_cookie_t **);
 extern void md_free_scan_dag(md_t *, mde_cookie_t **);
+extern uint64_t md_get_current_gen(void);
 
 #ifdef __cplusplus
 }