changeset 2961:8b33bed4151e

PSARC 2006/236 Hashed Cache index support
6409758 Integrate the support for Hashed Cache index mode
6249718 page_freelist_fill() can return a page of the wrong mtype
6478353 page_get_mnode_freelist(szc > 0) should call page_freelist_coalesce() for every bin it tries
6478363 page_get_mnode_cachelist() may ignore PG_MATCH_COLOR flag
6470374 PLCNT_SZ() macro in sun4/vm/vm_dep.h has incorrect loop condition
author dp78419
date Sat, 21 Oct 2006 06:27:59 -0700
parents 0a0e45155fbd
children 28074feb4448
files usr/src/uts/common/os/mem_cage.c usr/src/uts/common/sys/mem_cage.h usr/src/uts/common/vm/page.h usr/src/uts/common/vm/vm_pagelist.c usr/src/uts/i86pc/vm/vm_dep.h usr/src/uts/i86pc/vm/vm_machdep.c usr/src/uts/sfmmu/vm/hat_sfmmu.h usr/src/uts/sun4/vm/vm_dep.c usr/src/uts/sun4/vm/vm_dep.h usr/src/uts/sun4u/vm/mach_vm_dep.c usr/src/uts/sun4v/vm/mach_vm_dep.c
diffstat 11 files changed, 1556 insertions(+), 762 deletions(-)
--- a/usr/src/uts/common/os/mem_cage.c	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/common/os/mem_cage.c	Sat Oct 21 06:27:59 2006 -0700
@@ -47,6 +47,7 @@
 #include <vm/vm_dep.h>
 #include <sys/mem_config.h>
 #include <sys/lgrp.h>
+#include <sys/rwlock.h>
 
 extern pri_t maxclsyspri;
 
@@ -205,7 +206,7 @@
 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
 
-static kmutex_t kcage_range_mutex;	/* proctects kcage_glist elements */
+static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
 
 /*
  * Cage expansion happens within a range.
@@ -272,28 +273,26 @@
  * kcage_set_thresholds()
  */
 
-int
-kcage_range_trylock(void)
-{
-	return (mutex_tryenter(&kcage_range_mutex));
-}
-
+/*
+ * Called from outside this file to add/remove from the list;
+ * therefore, it takes a writer lock.
+ */
 void
 kcage_range_lock(void)
 {
-	mutex_enter(&kcage_range_mutex);
+	rw_enter(&kcage_range_rwlock, RW_WRITER);
 }
 
 void
 kcage_range_unlock(void)
 {
-	mutex_exit(&kcage_range_mutex);
+	rw_exit(&kcage_range_rwlock);
 }
 
 int
 kcage_range_islocked(void)
 {
-	return (MUTEX_HELD(&kcage_range_mutex));
+	return (rw_lock_held(&kcage_range_rwlock));
 }
 
 /*
@@ -318,6 +317,80 @@
 	return (lp->decr);
 }
 
+/*
+ * Called from vm_pagelist.c during coalesce to find kernel cage regions
+ * within an mnode. Looks for the lowest range between lo and hi.
+ *
+ * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
+ * Non-cage memory is defined between kcage_current_glist and list end.
+ *
+ * If incage is set, returns the lowest kcage range. Otherwise returns lowest
+ * non-cage range.
+ *
+ * Returns zero on success and sets nlo, nhi such that:
+ * 	lo <= nlo < nhi <= hi
+ * Returns non-zero if no overlapping range is found.
+ */
+int
+kcage_next_range(int incage, pfn_t lo, pfn_t hi,
+    pfn_t *nlo, pfn_t *nhi)
+{
+	struct kcage_glist *lp;
+	pfn_t tlo = hi;
+	pfn_t thi = hi;
+
+	ASSERT(lo <= hi);
+
+	/*
+	 * Reader lock protects the list, but kcage_get_pfn
+	 * running concurrently may advance kcage_current_glist
+	 * and also update kcage_current_glist->curr. Page
+	 * coalesce can handle this race condition.
+	 */
+	rw_enter(&kcage_range_rwlock, RW_READER);
+
+	for (lp = incage ? kcage_glist : kcage_current_glist;
+		lp != NULL; lp = lp->next) {
+
+		pfn_t klo, khi;
+
+		/* find the range limits in this element */
+		if ((incage && lp->decr) || (!incage && !lp->decr)) {
+			klo = lp->curr;
+			khi = lp->lim;
+		} else {
+			klo = lp->base;
+			khi = lp->curr;
+		}
+
+		/* handle overlap */
+		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
+			tlo = MAX(lo, klo);
+			thi = MIN(hi, khi);
+			if (tlo == lo)
+				break;
+		}
+
+		/* check end of kcage */
+		if (incage && lp == kcage_current_glist) {
+			break;
+		}
+	}
+
+	rw_exit(&kcage_range_rwlock);
+
+	/* return non-zero if no overlapping range found */
+	if (tlo == thi)
+		return (1);
+
+	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
+
+	/* return overlapping range */
+	*nlo = tlo;
+	*nhi = thi;
+	return (0);
+}
+
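To make the contract above concrete, here is a minimal stand-alone sketch
(not part of the patch) of how a caller can enumerate every kernel-cage
subrange of [lo, hi) in ascending pfn order; page_freelist_coalesce()
below uses this pattern on sparc. The process_range() consumer is
hypothetical.

	pfn_t nlo, nhi;
	pfn_t pfn = lo;

	while (pfn < hi &&
	    kcage_next_range(1 /* incage */, pfn, hi, &nlo, &nhi) == 0) {
		/* pfn <= nlo < nhi <= hi holds on each iteration */
		process_range(nlo, nhi);	/* hypothetical consumer */
		pfn = nhi;			/* resume above this range */
	}
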
 int
 kcage_range_init(struct memlist *ml, int decr)
 {
@@ -1296,10 +1369,10 @@
 	}
 
 	/*
-	 * Try to get the range list lock. If the lock is already
+	 * Try to get the range list reader lock. If the lock is already
 	 * held, then don't get stuck here waiting for it.
 	 */
-	if (!kcage_range_trylock())
+	if (!rw_tryenter(&kcage_range_rwlock, RW_READER))
 		return (0);
 
 	KCAGE_STAT_INCR(ke_calls);
@@ -1335,7 +1408,7 @@
 
 		/*
 		 * NORELOC is only set at boot-time or by this routine
-		 * under the kcage_range_mutex lock which is currently
+		 * under the kcage_range_rwlock lock which is currently
 		 * held. This means we can do a fast check here before
 		 * locking the page in kcage_assimilate_page.
 		 */
--- a/usr/src/uts/common/sys/mem_cage.h	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/common/sys/mem_cage.h	Sat Oct 21 06:27:59 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -61,7 +60,6 @@
 extern int kcage_create_throttle(pgcnt_t, int);
 
 /* Third arg controls direction of growth: 0: increasing pfns, 1: decreasing. */
-extern int kcage_range_trylock(void);
 extern void kcage_range_lock(void);
 extern void kcage_range_unlock(void);
 extern int kcage_current_pfn(pfn_t *);
@@ -81,6 +79,10 @@
 /* Called from clock thread in clock.c */
 extern void kcage_tick(void);
 
+/* Called from vm_pagelist.c */
+extern int kcage_next_range(int incage,
+    pfn_t lo, pfn_t hi, pfn_t *nlo, pfn_t *nhi);
+
 #endif /* _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/vm/page.h	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/common/vm/page.h	Sat Oct 21 06:27:59 2006 -0700
@@ -801,13 +801,16 @@
 typedef struct {
 	size_t	hp_size;
 	uint_t	hp_shift;
+	uint_t  hp_colors;
 	pgcnt_t	hp_pgcnt;	/* base pagesize cnt */
 } hw_pagesize_t;
 
 extern hw_pagesize_t	hw_page_array[];
-extern uint_t		page_colors, page_colors_mask;
 extern uint_t		page_coloring_shift;
+extern uint_t		page_colors_mask;
 extern int		cpu_page_colors;
+extern uint_t		colorequiv;
+extern uchar_t		colorequivszc[];
 
 uint_t	page_num_pagesizes(void);
 uint_t	page_num_user_pagesizes(void);
@@ -818,11 +821,31 @@
 int	page_szc(size_t);
 int	page_szc_user_filtered(size_t);
 
-
 /* page_get_replacement page flags */
 #define	PGR_SAMESZC	0x1	/* only look for page size same as orig */
 #define	PGR_NORELOC	0x2	/* allocate a P_NORELOC page */
 
+/*
+ * macros for "masked arithmetic"
+ * The purpose is to step through all combinations of a set of bits while
+ * keeping some other bits fixed. Fixed bits need not be contiguous. The
+ * variable bits need not be contiguous either, or even right aligned. The
+ * trick is to set all fixed bits to 1, then increment, then restore the
+ * fixed bits. If incrementing causes a carry from a low bit position, the
+ * carry propagates thru the fixed bits, because they are temporarily set to 1.
+ *	v is the value
+ *	i is the increment
+ *	eq_mask defines the fixed bits
+ *	mask limits the size of the result
+ */
+#define	ADD_MASKED(v, i, eq_mask, mask) \
+	(((((v) | (eq_mask)) + (i)) & (mask) & ~(eq_mask)) | ((v) & (eq_mask)))
+
+/*
+ * convenience macro which increments by 1
+ */
+#define	INC_MASKED(v, eq_mask, mask) ADD_MASKED(v, 1, eq_mask, mask)
+
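As a worked illustration of the masked arithmetic (a stand-alone sketch,
not part of the patch): with mask 0xf and eq_mask 0x5, bits 0 and 2 stay
fixed while bits 1 and 3 count, so repeated INC_MASKED steps starting at
0x1 visit 0x1, 0x3, 0x9, 0xb and then wrap back to 0x1. This is the same
do/while pattern page_freelist_coalesce() uses below to walk equivalent
color bins.

	#include <stdio.h>

	#define	ADD_MASKED(v, i, eq_mask, mask) \
		(((((v) | (eq_mask)) + (i)) & (mask) & ~(eq_mask)) | \
		    ((v) & (eq_mask)))
	#define	INC_MASKED(v, eq_mask, mask) ADD_MASKED(v, 1, eq_mask, mask)

	int
	main(void)
	{
		unsigned mask = 0xf;		/* 16 bins */
		unsigned eq_mask = 0x5;		/* bits 0 and 2 held fixed */
		unsigned start = 0x1, bin = start;

		do {
			(void) printf("0x%x\n", bin);	/* 0x1 0x3 0x9 0xb */
			bin = INC_MASKED(bin, eq_mask, mask);
		} while (bin != start);
		return (0);
	}
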
 #endif	/* _KERNEL */
 
 /*
@@ -1039,7 +1062,6 @@
 void build_pfn_hash();
 extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum);
 
-
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/vm/vm_pagelist.c	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/common/vm/vm_pagelist.c	Sat Oct 21 06:27:59 2006 -0700
@@ -79,7 +79,16 @@
  * from the local mnode in favor of acquiring the 'correct' page color from
  * a demoted large page or from a remote mnode.
  */
-int	colorequiv;
+uint_t	colorequiv;
+
+/*
+ * color equivalency mask for each page size.
+ * Mask is computed based on cpu L2$ way sizes and colorequiv global.
+ * High 4 bits determine the number of high order bits of the color to ignore.
+ * Low 4 bits determines number of low order bits of color to ignore (it's only
+ * relevant for hashed index based page coloring).
+ */
+uchar_t colorequivszc[MMU_PAGE_SIZES];
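For example (a sketch with assumed values, not part of the patch): with 32
colors at a given page size and colorequivszc[szc] == 0x21, two high-order
and one low-order color bits are ignored. page_list_walk_init() below turns
the encoding into an equivalence mask:

	uint_t  colors = 32;	/* assume 5 color bits at this szc */
	uchar_t ceq = 0x21;	/* high nibble 2, low nibble 1 */
	uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));	/* 4 */
	uint_t  ceq_mask = (ceq_dif - 1) << (ceq & 0xf);	/* 0x6 */
	/* c1 and c2 are equivalent iff ((c1 ^ c2) & ceq_mask) == 0 */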
 
 /*
  * if set, specifies the percentage of large pages that are free from within
@@ -127,7 +136,7 @@
 int pg_lpgcreate_nocage = LPGCREATE;
 
 /*
- * page_freelist_fill pfn flag to signify no hi pfn requirement.
+ * page_freelist_split pfn flag to signify no hi pfn requirement.
  */
 #define	PFNNULL		0
 
@@ -141,17 +150,18 @@
  */
 #define	PC_NO_COLOR	(-1)
 
+/* mtype value for page_promote to use when mtype does not matter */
+#define	PC_MTYPE_ANY	(-1)
+
 /*
  * page counters candidates info
  * See page_ctrs_cands comment below for more details.
  * fields are as follows:
  *	pcc_pages_free:		# pages which freelist coalesce can create
- *	pcc_color_free_len:	number of elements in pcc_color_free array
  *	pcc_color_free:		pointer to page free counts per color
  */
 typedef struct pcc_info {
 	pgcnt_t	pcc_pages_free;
-	int	pcc_color_free_len;
 	pgcnt_t	*pcc_color_free;
 } pcc_info_t;
 
@@ -162,36 +172,33 @@
  * page_freelist_coalesce() searches page_counters only if an appropriate
  * element of page_ctrs_cands array is greater than 0.
  *
- * An extra dimension is used for page_ctrs_cands to spread the elements
- * over a few e$ cache lines to avoid serialization during the array
- * updates.
+ * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
  */
-#pragma	align 64(page_ctrs_cands)
-
-static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
+pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
 
 /*
  * Return in val the total number of free pages which can be created
- * for the given mnode (m) and region size (r)
+ * for the given mnode (m), mrange (g), and region size (r)
  */
-#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
+#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
 	int i;								\
 	val = 0;							\
 	for (i = 0; i < NPC_MUTEX; i++) {				\
-	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
+	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
 	}								\
 }
 
 /*
  * Return in val the total number of free pages which can be created
- * for the given mnode (m), region size (r), and color (c)
+ * for the given mnode (m), mrange (g), region size (r), and color (c)
  */
-#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
+#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
 	int i;								\
 	val = 0;							\
-	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
+	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
 	for (i = 0; i < NPC_MUTEX; i++) {				\
-	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
+	    val +=							\
+		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
 	}								\
 }
 
@@ -205,9 +212,12 @@
 static kmutex_t	*ctr_mutex[NPC_MUTEX];
 
 #define	PP_CTR_LOCK_INDX(pp)						\
-	(((pp)->p_pagenum >>					\
+	(((pp)->p_pagenum >>						\
 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
 
+#define	INVALID_COLOR 0xffffffff
+#define	INVALID_MASK  0xffffffff
+
 /*
  * Local functions prototypes.
  */
@@ -215,20 +225,16 @@
 void page_ctr_add(int, int, page_t *, int);
 void page_ctr_add_internal(int, int, page_t *, int);
 void page_ctr_sub(int, int, page_t *, int);
-uint_t  page_convert_color(uchar_t, uchar_t, uint_t);
+void page_ctr_sub_internal(int, int, page_t *, int);
 void page_freelist_lock(int);
 void page_freelist_unlock(int);
-page_t *page_promote(int, pfn_t, uchar_t, int);
+page_t *page_promote(int, pfn_t, uchar_t, int, int);
 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
-page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
+page_t *page_freelist_split(uchar_t,
+    uint_t, int, int, pfn_t, page_list_walker_t *);
 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
 static int page_trylock_cons(page_t *pp, se_t se);
 
-#define	PNUM_SIZE(szc)							\
-	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
-#define	PNUM_SHIFT(szc)							\
-	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
-
 /*
  * The page_counters array below is used to keep track of free contiguous
  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
@@ -272,7 +278,6 @@
  *	hpm_entries:	entries in hpm_counters
  *	hpm_shift:	shift for pnum/array index conv
  *	hpm_base:	PFN mapped to counter index 0
- *	hpm_color_current_len:	# of elements in hpm_color_current "array" below
  *	hpm_color_current:	last index in counter array for this color at
  *				which we successfully created a large page
  */
@@ -281,8 +286,7 @@
 	size_t		hpm_entries;
 	int		hpm_shift;
 	pfn_t		hpm_base;
-	size_t		hpm_color_current_len;
-	size_t 		*hpm_color_current;
+	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
 } hw_page_map_t;
 
 /*
@@ -291,6 +295,13 @@
 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
 
 /*
+ * Cached value of MNODE_RANGE_CNT(mnode).
+ * This is a function call in x86.
+ */
+static int mnode_nranges[MAX_MEM_NODES];
+static int mnode_maxmrange[MAX_MEM_NODES];
+
+/*
  * The following macros are convenient ways to get access to the individual
  * elements of the page_counters arrays.  They can be used on both
  * the left side and right side of equations.
@@ -310,14 +321,12 @@
 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
 	(page_counters[(rg_szc)][(mnode)].hpm_base)
 
-#define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
-	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)
-
-#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
-	(page_counters[(rg_szc)][(mnode)].hpm_color_current)
-
-#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
-	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])
+#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
+	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
+
+#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
+	(page_counters[(rg_szc)][(mnode)].				\
+	hpm_color_current[(mrange)][(color)])
 
 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
@@ -464,14 +473,32 @@
 {
 	if (szc >= mmu_page_sizes)
 		panic("page_get_shift: out of range %d", szc);
-	return (hw_page_array[szc].hp_shift);
+	return (PAGE_GET_SHIFT(szc));
 }
 
 uint_t
 page_get_pagecolors(uint_t szc)
 {
-	ASSERT(page_colors != 0);
-	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
+	if (szc >= mmu_page_sizes)
+		panic("page_get_pagecolors: out of range %d", szc);
+	return (PAGE_GET_PAGECOLORS(szc));
+}
+
+/*
+ * this assigns the desired equivalent color after a split
+ */
+uint_t
+page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
+    uint_t ncolor, uint_t ceq_mask)
+{
+	ASSERT(nszc > szc);
+	ASSERT(szc < mmu_page_sizes);
+	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
+	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
+
+	color &= ceq_mask;
+	ncolor <<= PAGE_GET_COLOR_SHIFT(szc, nszc);
+	return (color | (ncolor & ~ceq_mask));
 }
 
 /*
@@ -484,6 +511,7 @@
 {
 	int	r;		/* region size */
 	int	mnode;
+	int	nranges;
 	uint_t	ctrs_sz = 0;
 	int 	i;
 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
@@ -493,10 +521,8 @@
 	 * page size in order to allocate memory for any color specific
 	 * arrays.
 	 */
-	colors_per_szc[0] = page_colors;
-	for (i = 1; i < mmu_page_sizes; i++) {
-		colors_per_szc[i] =
-		    page_convert_color(0, i, page_colors - 1) + 1;
+	for (i = 0; i < mmu_page_sizes; i++) {
+		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 	}
 
 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
@@ -508,6 +534,10 @@
 		if (mem_node_config[mnode].exists == 0)
 			continue;
 
+		nranges = MNODE_RANGE_CNT(mnode);
+		mnode_nranges[mnode] = nranges;
+		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
+
 		/*
 		 * determine size needed for page counter arrays with
 		 * base aligned to large page size.
@@ -527,18 +557,31 @@
 			    sizeof (hpmctr_t *));
 
 			/* add in space for hpm_color_current */
-			ctrs_sz += (colors_per_szc[r] *
-			    sizeof (size_t));
+			ctrs_sz += sizeof (size_t) *
+			    colors_per_szc[r] * nranges;
 		}
 	}
 
 	for (r = 1; r < mmu_page_sizes; r++) {
 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
-
-		/* add in space for page_ctrs_cands */
-		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
-		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
-		    sizeof (pgcnt_t);
+	}
+
+	/* add in space for page_ctrs_cands and pcc_color_free */
+	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
+	    mmu_page_sizes * NPC_MUTEX;
+
+	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+
+		if (mem_node_config[mnode].exists == 0)
+			continue;
+
+		nranges = mnode_nranges[mnode];
+		ctrs_sz += sizeof (pcc_info_t) * nranges *
+		    mmu_page_sizes * NPC_MUTEX;
+		for (r = 1; r < mmu_page_sizes; r++) {
+			ctrs_sz += sizeof (pgcnt_t) * nranges *
+			    colors_per_szc[r] * NPC_MUTEX;
+		}
 	}
 
 	/* ctr_mutex */
@@ -559,6 +602,7 @@
 page_ctrs_alloc(caddr_t alloc_base)
 {
 	int	mnode;
+	int	mrange, nranges;
 	int	r;		/* region size */
 	int	i;
 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
@@ -568,10 +612,8 @@
 	 * page size in order to allocate memory for any color specific
 	 * arrays.
 	 */
-	colors_per_szc[0] = page_colors;
-	for (i = 1; i < mmu_page_sizes; i++) {
-		colors_per_szc[i] =
-		    page_convert_color(0, i, page_colors - 1) + 1;
+	for (i = 0; i < mmu_page_sizes; i++) {
+		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 	}
 
 	for (r = 1; r < mmu_page_sizes; r++) {
@@ -579,25 +621,32 @@
 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
 	}
 
-	/* page_ctrs_cands */
-	for (r = 1; r < mmu_page_sizes; r++) {
-		for (i = 0; i < NPC_MUTEX; i++) {
-			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
-			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
-
-		}
-	}
-
-	/* page_ctrs_cands pcc_color_free array */
-	for (r = 1; r < mmu_page_sizes; r++) {
-		for (i = 0; i < NPC_MUTEX; i++) {
+	/* page_ctrs_cands and pcc_color_free array */
+	for (i = 0; i < NPC_MUTEX; i++) {
+		for (r = 1; r < mmu_page_sizes; r++) {
+
+			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
+			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
+
 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
-				page_ctrs_cands[i][r][mnode].pcc_color_free_len
-				    = colors_per_szc[r];
-				page_ctrs_cands[i][r][mnode].pcc_color_free =
-				    (pgcnt_t *)alloc_base;
-				alloc_base += colors_per_szc[r] *
-				    sizeof (pgcnt_t);
+				pcc_info_t *pi;
+
+				if (mem_node_config[mnode].exists == 0)
+					continue;
+
+				nranges = mnode_nranges[mnode];
+
+				pi = (pcc_info_t *)alloc_base;
+				alloc_base += sizeof (pcc_info_t) * nranges;
+				page_ctrs_cands[i][r][mnode] = pi;
+
+				for (mrange = 0; mrange < nranges; mrange++) {
+					pi->pcc_color_free =
+					    (pgcnt_t *)alloc_base;
+					alloc_base += sizeof (pgcnt_t) *
+					    colors_per_szc[r];
+					pi++;
+				}
 			}
 		}
 	}
@@ -617,6 +666,7 @@
 		pfn_t	r_base;
 		pgcnt_t r_align;
 		int	r_shift;
+		int	nranges = mnode_nranges[mnode];
 
 		if (mem_node_config[mnode].exists == 0)
 			continue;
@@ -638,13 +688,26 @@
 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
-			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
-			    colors_per_szc[r];
-			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
-			    (size_t *)alloc_base;
-			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
+			for (mrange = 0; mrange < nranges; mrange++) {
+				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
+				    r, mrange) = (size_t *)alloc_base;
+				alloc_base += sizeof (size_t) *
+				    colors_per_szc[r];
+			}
 			for (i = 0; i < colors_per_szc[r]; i++) {
-				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
+				uint_t color_mask = colors_per_szc[r] - 1;
+				pfn_t  pfnum = r_base;
+				size_t idx;
+				int mrange;
+
+				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
+				    color_mask, color_mask);
+				idx = PNUM_TO_IDX(mnode, r, pfnum);
+				idx = (idx >= r_pgcnt) ? 0 : idx;
+				for (mrange = 0; mrange < nranges; mrange++) {
+					PAGE_COUNTERS_CURRENT_COLOR(mnode,
+					    r, i, mrange) = idx;
+				}
 			}
 			PAGE_COUNTERS_COUNTERS(mnode, r) =
 			    (hpmctr_t *)alloc_base;
@@ -724,12 +787,16 @@
 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
 
-		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
+		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
 			break;
-
-		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
-		page_ctrs_cands[lckidx][r][mnode].
-		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
+		} else {
+			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
+			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
+			    [MTYPE_2_MRANGE(mnode, root_mtype)];
+
+			cand->pcc_pages_free++;
+			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
+		}
 		r++;
 	}
 }
@@ -746,10 +813,9 @@
 }
 
 void
-page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
+page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
 {
 	int		lckidx;
-	kmutex_t	*lock;
 	ssize_t		r;	/* region size */
 	ssize_t		idx;
 	pfn_t		pfnum;
@@ -769,14 +835,12 @@
 	r = pp->p_szc + 1;
 	pfnum = pp->p_pagenum;
 	lckidx = PP_CTR_LOCK_INDX(pp);
-	lock = &ctr_mutex[lckidx][mnode];
 
 	/*
 	 * Decrement the count of free pages for the current
 	 * region. Continue looping up in region size decrementing
 	 * count if the preceeding region was full.
 	 */
-	mutex_enter(lock);
 	while (r < mmu_page_sizes) {
 		idx = PNUM_TO_IDX(mnode, r, pfnum);
 
@@ -785,16 +849,29 @@
 
 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
 			break;
+		} else {
+			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
+			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
+			    [MTYPE_2_MRANGE(mnode, root_mtype)];
+
+			ASSERT(cand->pcc_pages_free != 0);
+			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
+
+			cand->pcc_pages_free--;
+			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
 		}
-		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
-		ASSERT(page_ctrs_cands[lckidx][r][mnode].
-		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
-
-		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
-		page_ctrs_cands[lckidx][r][mnode].
-		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
 		r++;
 	}
+}
+
+void
+page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
+{
+	int		lckidx = PP_CTR_LOCK_INDX(pp);
+	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
+
+	mutex_enter(lock);
+	page_ctr_sub_internal(mnode, mtype, pp, flags);
 	mutex_exit(lock);
 }
 
@@ -802,6 +879,11 @@
  * Adjust page counters following a memory attach, since typically the
  * size of the array needs to change, and the PFN to counter index
  * mapping needs to change.
+ *
+ * It is possible this mnode did not exist at startup. In that case
+ * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
+ * to change (a theoretical possibility on x86), which means pcc_color_free
+ * arrays must be extended.
  */
 uint_t
 page_ctrs_adjust(int mnode)
@@ -815,23 +897,38 @@
 	size_t	old_npgs;
 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
 	size_t	size_cache[MMU_PAGE_SIZES];
-	size_t	*color_cache[MMU_PAGE_SIZES];
-	size_t	*old_color_array;
+	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	size_t	*old_color_array[MAX_MNODE_MRANGES];
 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
+	pcc_info_t **cands_cache;
+	pcc_info_t *old_pi, *pi;
+	pgcnt_t *pgcntp;
+	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
+	int cands_cache_nranges;
+	int old_maxmrange, new_maxmrange;
+	int rc = 0;
 
 	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
 	npgs = roundup(mem_node_config[mnode].physmax,
 	    PC_BASE_ALIGN) - newbase;
 
+	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
+	    MMU_PAGE_SIZES, KM_NOSLEEP);
+	if (cands_cache == NULL)
+		return (ENOMEM);
+
+	/* prepare to free non-null pointers on the way out */
+	cands_cache_nranges = nranges;
+	bzero(ctr_cache, sizeof (ctr_cache));
+	bzero(color_cache, sizeof (color_cache));
+
 	/*
 	 * We need to determine how many page colors there are for each
 	 * page size in order to allocate memory for any color specific
 	 * arrays.
 	 */
-	colors_per_szc[0] = page_colors;
-	for (r = 1; r < mmu_page_sizes; r++) {
-		colors_per_szc[r] =
-		    page_convert_color(0, r, page_colors - 1) + 1;
+	for (r = 0; r < mmu_page_sizes; r++) {
+		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
 	}
 
 	/*
@@ -842,18 +939,15 @@
 	 */
 	for (r = 1; r < mmu_page_sizes; r++) {
 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
-
+		size_cache[r] = pcsz;
 		ctr_cache[r] = kmem_zalloc(pcsz *
 		    sizeof (hpmctr_t), KM_NOSLEEP);
 		if (ctr_cache[r] == NULL) {
-			while (--r >= 1) {
-				kmem_free(ctr_cache[r],
-				    size_cache[r] * sizeof (hpmctr_t));
-			}
-			return (ENOMEM);
+			rc = ENOMEM;
+			goto cleanup;
 		}
-		size_cache[r] = pcsz;
 	}
+
 	/*
 	 * Preallocate all of the new color current arrays as we can't
 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
@@ -861,18 +955,41 @@
 	 * and return failure.
 	 */
 	for (r = 1; r < mmu_page_sizes; r++) {
-		color_cache[r] = kmem_zalloc(sizeof (size_t) *
-		    colors_per_szc[r], KM_NOSLEEP);
-		if (color_cache[r] == NULL) {
-			while (--r >= 1) {
-				kmem_free(color_cache[r],
-				    colors_per_szc[r] * sizeof (size_t));
+		for (mrange = 0; mrange < nranges; mrange++) {
+			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
+			    colors_per_szc[r], KM_NOSLEEP);
+			if (color_cache[r][mrange] == NULL) {
+				rc = ENOMEM;
+				goto cleanup;
 			}
-			for (r = 1; r < mmu_page_sizes; r++) {
-				kmem_free(ctr_cache[r],
-				    size_cache[r] * sizeof (hpmctr_t));
+		}
+	}
+
+	/*
+	 * Preallocate all of the new pcc_info_t arrays as we can't
+	 * hold the page_ctrs_rwlock as a writer and allocate memory.
+	 * If we can't allocate all of the arrays, undo our work so far
+	 * and return failure.
+	 */
+	for (r = 1; r < mmu_page_sizes; r++) {
+		for (i = 0; i < NPC_MUTEX; i++) {
+			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
+			    KM_NOSLEEP);
+			if (pi == NULL) {
+				rc = ENOMEM;
+				goto cleanup;
 			}
-			return (ENOMEM);
+			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
+
+			for (mrange = 0; mrange < nranges; mrange++, pi++) {
+				pgcntp = kmem_zalloc(colors_per_szc[r] *
+				    sizeof (pgcnt_t), KM_NOSLEEP);
+				if (pgcntp == NULL) {
+					rc = ENOMEM;
+					goto cleanup;
+				}
+				pi->pcc_color_free = pgcntp;
+			}
 		}
 	}
 
@@ -882,13 +999,25 @@
 	 */
 	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
 	page_freelist_lock(mnode);
+
+	old_nranges = mnode_nranges[mnode];
+	cands_cache_nranges = old_nranges;
+	mnode_nranges[mnode] = nranges;
+	old_maxmrange = mnode_maxmrange[mnode];
+	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
+	new_maxmrange = mnode_maxmrange[mnode];
+
 	for (r = 1; r < mmu_page_sizes; r++) {
 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
-		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);
+		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
+			old_color_array[mrange] =
+			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
+				r, mrange);
+		}
 
 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
 		new_ctr = ctr_cache[r];
@@ -919,15 +1048,28 @@
 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
-		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
-		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
-		color_cache[r] = NULL;
+		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
+			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
+			    color_cache[r][mrange];
+			color_cache[r][mrange] = NULL;
+		}
 		/*
 		 * for now, just reset on these events as it's probably
 		 * not worthwhile to try and optimize this.
 		 */
 		for (i = 0; i < colors_per_szc[r]; i++) {
-			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
+			uint_t color_mask = colors_per_szc[r] - 1;
+			pfn_t  pfnum = newbase;
+			size_t idx;
+
+			PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
+			    color_mask);
+			idx = PNUM_TO_IDX(mnode, r, pfnum);
+			idx = (idx < pcsz) ? idx : 0;
+			for (mrange = 0; mrange < nranges; mrange++) {
+				PAGE_COUNTERS_CURRENT_COLOR(mnode,
+				    r, i, mrange) = idx;
+			}
 		}
 
 		/* cache info for freeing out of the critical path */
@@ -936,9 +1078,12 @@
 			ctr_cache[r] = old_ctr;
 			size_cache[r] = old_csz;
 		}
-		if ((caddr_t)old_color_array >= kernelheap &&
-		    (caddr_t)old_color_array < ekernelheap) {
-			color_cache[r] = old_color_array;
+		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
+			size_t *tmp = old_color_array[mrange];
+			if ((caddr_t)tmp >= kernelheap &&
+			    (caddr_t)tmp < ekernelheap) {
+				color_cache[r][mrange] = tmp;
+			}
 		}
 		/*
 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
@@ -950,6 +1095,39 @@
 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
 		ASSERT(IDX_TO_PNUM(mnode, r,
 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
+
+		/* pcc_info_t and pcc_color_free */
+		for (i = 0; i < NPC_MUTEX; i++) {
+			pcc_info_t *epi;
+			pcc_info_t *eold_pi;
+
+			pi = cands_cache[i * MMU_PAGE_SIZES + r];
+			old_pi = page_ctrs_cands[i][r][mnode];
+			page_ctrs_cands[i][r][mnode] = pi;
+			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
+
+			/* preserve old pcc_color_free values, if any */
+			if (old_pi == NULL)
+				continue;
+
+			/*
+			 * when/if x86 does DR, must account for
+			 * possible change in range index when
+			 * preserving pcc_info
+			 */
+			epi = &pi[nranges];
+			eold_pi = &old_pi[old_nranges];
+			if (new_maxmrange > old_maxmrange) {
+				pi += new_maxmrange - old_maxmrange;
+			} else if (new_maxmrange < old_maxmrange) {
+				old_pi += old_maxmrange - new_maxmrange;
+			}
+			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
+				pcc_info_t tmp = *pi;
+				*pi = *old_pi;
+				*old_pi = tmp;
+			}
+		}
 	}
 	page_freelist_unlock(mnode);
 	rw_exit(&page_ctrs_rwlock[mnode]);
@@ -957,37 +1135,50 @@
 	/*
 	 * Now that we have dropped the write lock, it is safe to free all
 	 * of the memory we have cached above.
+	 * We come thru here to free memory when pre-alloc fails, and also to
+	 * free old pointers which were recorded while locked.
 	 */
+cleanup:
 	for (r = 1; r < mmu_page_sizes; r++) {
 		if (ctr_cache[r] != NULL) {
 			kmem_free(ctr_cache[r],
 			    size_cache[r] * sizeof (hpmctr_t));
 		}
-		if (color_cache[r] != NULL) {
-			kmem_free(color_cache[r],
-			    colors_per_szc[r] * sizeof (size_t));
+		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
+			if (color_cache[r][mrange] != NULL) {
+				kmem_free(color_cache[r][mrange],
+				    colors_per_szc[r] * sizeof (size_t));
+			}
+		}
+		for (i = 0; i < NPC_MUTEX; i++) {
+			pi = cands_cache[i * MMU_PAGE_SIZES + r];
+			if (pi == NULL)
+				continue;
+			nr = cands_cache_nranges;
+			for (mrange = 0; mrange < nr; mrange++, pi++) {
+				pgcntp = pi->pcc_color_free;
+				if (pgcntp == NULL)
+					continue;
+				if ((caddr_t)pgcntp >= kernelheap &&
+				    (caddr_t)pgcntp < ekernelheap) {
+					kmem_free(pgcntp,
+					    colors_per_szc[r] *
+					    sizeof (pgcnt_t));
+				}
+			}
+			pi = cands_cache[i * MMU_PAGE_SIZES + r];
+			if ((caddr_t)pi >= kernelheap &&
+			    (caddr_t)pi < ekernelheap) {
+				kmem_free(pi, nr * sizeof (pcc_info_t));
+			}
 		}
 	}
-	return (0);
+
+	kmem_free(cands_cache,
+	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
+	return (rc);
 }
 
-/*
- * color contains a valid color index or bin for cur_szc
- */
-uint_t
-page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
-{
-	uint_t shift;
-
-	if (cur_szc > new_szc) {
-		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
-		return (color << shift);
-	} else if (cur_szc < new_szc) {
-		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
-		return (color >> shift);
-	}
-	return (color);
-}
 
 #ifdef DEBUG
 
@@ -1129,7 +1320,7 @@
 			*ppp = (*ppp)->p_next;
 		/*
 		 * Add counters before releasing pcm mutex to avoid a race with
-		 * page_freelist_coalesce and page_freelist_fill.
+		 * page_freelist_coalesce and page_freelist_split.
 		 */
 		page_ctr_add(mnode, mtype, pp, flags);
 		mutex_exit(pcm);
@@ -1201,8 +1392,10 @@
 		pp->p_next->p_prev = pp->p_prev;
 	}
 
-	/* LINTED */
-	PLCNT_DECR(pp, mnode, mtype, 0, flags);
+	/*
+	 * Decrement page counters
+	 */
+	page_ctr_sub_internal(mnode, mtype, pp, flags);
 
 	/*
 	 * Set no reloc for cage initted pages.
@@ -1234,8 +1427,10 @@
 		pp->p_prev->p_next = pp;
 	}
 
-	/* LINTED */
-	PLCNT_INCR(pp, mnode, mtype, 0, flags);
+	/*
+	 * Increment page counters
+	 */
+	page_ctr_add_internal(mnode, mtype, pp, flags);
 
 	/*
 	 * Update cage freemem counter
@@ -1579,7 +1774,7 @@
 
 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
-		(void) page_promote(mnode, pfn, new_szc, PC_FREE);
+		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
 
 	page_freelist_unlock(mnode);
 }
@@ -1640,15 +1835,13 @@
  *	have done so far. Again this is rare.
  */
 page_t *
-page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
+page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
 {
 	page_t		*pp, *pplist, *tpp, *start_pp;
 	pgcnt_t		new_npgs, npgs;
 	uint_t		bin;
 	pgcnt_t		tmpnpgs, pages_left;
-	uint_t		mtype;
 	uint_t		noreloc;
-	uint_t 		i;
 	int 		which_list;
 	ulong_t		index;
 	kmutex_t	*phm;
@@ -1670,17 +1863,19 @@
 	new_npgs = page_get_pagecnt(new_szc);
 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
 
+	/* don't return page of the wrong mtype */
+	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
+		return (NULL);
+
 	/*
 	 * Loop through smaller pages to confirm that all pages
 	 * give the same result for PP_ISNORELOC().
 	 * We can check this reliably here as the protocol for setting
 	 * P_NORELOC requires pages to be taken off the free list first.
 	 */
-	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
-		if (pp == start_pp) {
-			/* First page, set requirement. */
-			noreloc = PP_ISNORELOC(pp);
-		} else if (noreloc != PP_ISNORELOC(pp)) {
+	noreloc = PP_ISNORELOC(start_pp);
+	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
+		if (noreloc != PP_ISNORELOC(pp)) {
 			page_promote_noreloc_err++;
 			page_promote_err++;
 			return (NULL);
@@ -1921,63 +2116,155 @@
 /*
  * Coalesce free pages into a page of the given szc and color if possible.
  * Return the pointer to the page created, otherwise, return NULL.
+ *
+ * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
  */
-static page_t *
-page_freelist_coalesce(int mnode, uchar_t szc, int color)
+page_t *
+page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
+    int mtype, pfn_t pfnhi)
 {
-	int 	r;		/* region size */
-	int 	idx, full, i;
-	pfn_t	pfnum;
-	size_t	len;
-	size_t	buckets_to_check;
-	pgcnt_t	cands;
+	int 	r = szc;		/* region size */
+	int	mrange;
+	uint_t 	full, bin, color_mask, wrap = 0;
+	pfn_t	pfnum, lo, hi;
+	size_t	len, idx, idx0;
+	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
 	page_t	*ret_pp;
-	int	color_stride;
-
-	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);
+#if defined(__sparc)
+	pfn_t pfnum0, nlo, nhi;
+#endif
 
 	if (mpss_coalesce_disable) {
+		ASSERT(szc < MMU_PAGE_SIZES);
+		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
 		return (NULL);
 	}
 
-	r = szc;
-	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
-	if (cands == 0) {
-		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
-		return (NULL);
-	}
-	full = FULL_REGION_CNT(r);
-	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
-	    page_colors;
+	ASSERT(szc < mmu_page_sizes);
+	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
+	ASSERT(ceq_mask <= color_mask);
+	ASSERT(color <= color_mask);
+	color &= ceq_mask;
 
 	/* Prevent page_counters dynamic memory from being freed */
 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
-	len  = PAGE_COUNTERS_ENTRIES(mnode, r);
-	buckets_to_check = len / color_stride;
-	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
-	ASSERT((idx % color_stride) == color);
-	idx += color_stride;
-	if (idx >= len)
-		idx = color;
-	for (i = 0; i < buckets_to_check; i++) {
+
+	mrange = MTYPE_2_MRANGE(mnode, mtype);
+	ASSERT(mrange < mnode_nranges[mnode]);
+	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
+
+	/* get pfn range for mtype */
+	len = PAGE_COUNTERS_ENTRIES(mnode, r);
+#if defined(__sparc)
+	lo = PAGE_COUNTERS_BASE(mnode, r);
+	hi = IDX_TO_PNUM(mnode, r, len);
+#else
+	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
+	hi++;
+#endif
+
+	/* use lower limit if given */
+	if (pfnhi != PFNNULL && pfnhi < hi)
+		hi = pfnhi;
+
+	/* round to szcpgcnt boundaries */
+	lo = P2ROUNDUP(lo, szcpgcnt);
+	hi = hi & ~(szcpgcnt - 1);
+
+	/* set lo to the closest pfn of the right color */
+	if ((PFN_2_COLOR(lo, szc) ^ color) & ceq_mask) {
+		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask);
+	}
+
+	if (hi <= lo) {
+		rw_exit(&page_ctrs_rwlock[mnode]);
+		return (NULL);
+	}
+
+	full = FULL_REGION_CNT(r);
+
+	/* calculate the number of page candidates and initial search index */
+	bin = color;
+	idx0 = (size_t)(-1);
+	do {
+		pgcnt_t acand;
+
+		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
+		if (acand) {
+			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
+			    r, bin, mrange);
+			idx0 = MIN(idx0, idx);
+			cands += acand;
+		}
+		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
+	} while (bin != color);
+
+	if (cands == 0) {
+		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
+		rw_exit(&page_ctrs_rwlock[mnode]);
+		return (NULL);
+	}
+
+	pfnum = IDX_TO_PNUM(mnode, r, idx0);
+	if (pfnum < lo || pfnum >= hi) {
+		pfnum = lo;
+	} else if ((PFN_2_COLOR(pfnum, szc) ^ color) & ceq_mask) {
+		/* pfnum has invalid color; get the closest correct pfn */
+		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
+		    color_mask);
+		pfnum = (pfnum >= hi) ? lo : pfnum;
+	}
+
+	/* set starting index */
+	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
+	ASSERT(idx0 < len);
+
+#if defined(__sparc)
+	pfnum0 = pfnum;		/* page corresponding to idx0 */
+	nhi = 0;		/* search kcage ranges */
+#endif
+
+	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
+
+#if defined(__sparc)
+		/*
+		 * Find lowest intersection of kcage ranges and mnode.
+		 * MTYPE_NORELOC means look in the cage, otherwise outside.
+		 */
+		if (nhi <= pfnum) {
+			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
+			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
+				goto wrapit;
+
+			/* jump to the next page in the range */
+			if (pfnum < nlo) {
+				pfnum = P2ROUNDUP(nlo, szcpgcnt);
+				idx = PNUM_TO_IDX(mnode, r, pfnum);
+				if (idx >= len || pfnum >= hi)
+					goto wrapit;
+				if ((PFN_2_COLOR(pfnum, szc) ^ color) &
+				    ceq_mask)
+					goto next;
+			}
+		}
+#endif
+
+		if (PAGE_COUNTERS(mnode, r, idx) != full)
+			goto next;
+
+		/*
+		 * RFE: For performance maybe we can do something less
+		 *	brutal than locking the entire freelist. So far
+		 * 	this doesn't seem to be a performance problem?
+		 */
+		page_freelist_lock(mnode);
 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
-			pfnum = IDX_TO_PNUM(mnode, r, idx);
-			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
-			    pfnum < mem_node_config[mnode].physmax);
-			/*
-			 * RFE: For performance maybe we can do something less
-			 *	brutal than locking the entire freelist. So far
-			 * 	this doesn't seem to be a performance problem?
-			 */
-			page_freelist_lock(mnode);
-			if (PAGE_COUNTERS(mnode, r, idx) != full) {
-				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
-				goto skip_this_one;
-			}
-			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
+			ret_pp =
+			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
 			if (ret_pp != NULL) {
-				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
-				    idx;
+				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
+				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
+				    PFN_2_COLOR(pfnum, szc), mrange) = idx;
 				page_freelist_unlock(mnode);
 				rw_exit(&page_ctrs_rwlock[mnode]);
 #if defined(__sparc)
@@ -1990,30 +2277,43 @@
 #endif
 				return (ret_pp);
 			}
-skip_this_one:
-			page_freelist_unlock(mnode);
-			/*
-			 * No point looking for another page if we've
-			 * already tried all of the ones that
-			 * page_ctr_cands indicated.  Stash off where we left
-			 * off.
-			 * Note: this is not exact since we don't hold the
-			 * page_freelist_locks before we initially get the
-			 * value of cands for performance reasons, but should
-			 * be a decent approximation.
-			 */
-			if (--cands == 0) {
-				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
-				    idx;
-				break;
-			}
+		} else {
+			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
 		}
-		idx += color_stride;
-		if (idx >= len)
-			idx = color;
+
+		page_freelist_unlock(mnode);
+		/*
+		 * No point looking for another page if we've
+		 * already tried all of the ones that
+		 * page_ctr_cands indicated.  Stash off where we left
+		 * off.
+		 * Note: this is not exact since we don't hold the
+		 * page_freelist_locks before we initially get the
+		 * value of cands for performance reasons, but should
+		 * be a decent approximation.
+		 */
+		if (--cands == 0) {
+			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
+			    idx;
+			break;
+		}
+next:
+		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
+			    color_mask);
+		idx = PNUM_TO_IDX(mnode, r, pfnum);
+		if (idx >= len || pfnum >= hi) {
+wrapit:
+			pfnum = lo;
+			idx = PNUM_TO_IDX(mnode, r, pfnum);
+			wrap++;
+#if defined(__sparc)
+			nhi = 0;	/* search kcage ranges */
+#endif
+		}
 	}
+
 	rw_exit(&page_ctrs_rwlock[mnode]);
-	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
+	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
 	return (NULL);
 }
 
@@ -2043,9 +2343,14 @@
 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 	page_freelist_lock(mnode);
 	for (r = mmu_page_sizes - 1; r > 0; r--) {
-		pgcnt_t cands;
-
-		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
+		pgcnt_t cands = 0;
+		int mrange, nranges = mnode_nranges[mnode];
+
+		for (mrange = 0; mrange < nranges; mrange++) {
+			PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
+			if (cands != 0)
+				break;
+		}
 		if (cands == 0) {
 			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
 			continue;
@@ -2061,7 +2366,8 @@
 				    mem_node_config[mnode].physbase &&
 				    pfnum <
 				    mem_node_config[mnode].physmax);
-				(void) page_promote(mnode, pfnum, r, PC_FREE);
+				(void) page_promote(mnode,
+				    pfnum, r, PC_FREE, PC_MTYPE_ANY);
 			}
 		}
 	}
@@ -2088,26 +2394,37 @@
  *
  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
  */
+
 page_t *
-page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
+page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
+    pfn_t pfnhi, page_list_walker_t *plw)
 {
 	uchar_t nszc = szc + 1;
-	int 	bin;
+	uint_t 	bin, sbin, bin_prev;
 	page_t	*pp, *firstpp;
 	page_t	*ret_pp = NULL;
-
-	ASSERT(szc < mmu_page_sizes);
-
-	VM_STAT_ADD(vmm_vmstats.pff_req[szc]);
+	uint_t  color_mask;
+
+	if (nszc == mmu_page_sizes)
+		return (NULL);
+
+	ASSERT(nszc < mmu_page_sizes);
+	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
+	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
+	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
+	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
+
+	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
 	/*
-	 * First try to break up a larger page to fill
-	 * current size freelist.
+	 * First try to break up a larger page to fill current size freelist.
 	 */
-	while (nszc < mmu_page_sizes) {
+	while (plw->plw_bins[nszc] != 0) {
+
+		ASSERT(nszc < mmu_page_sizes);
+
 		/*
 		 * If page found then demote it.
 		 */
-		bin = page_convert_color(szc, nszc, color);
 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
 			page_freelist_lock(mnode);
 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
@@ -2126,10 +2443,13 @@
 				} while (pp->p_pagenum >= pfnhi);
 			}
 			if (pp) {
+				uint_t ccolor = page_correct_color(szc, nszc,
+				    color, bin, plw->plw_ceq_mask[szc]);
+
 				ASSERT(pp->p_szc == nszc);
-				VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]);
+				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
 				ret_pp = page_demote(mnode, pp->p_pagenum,
-				    pp->p_szc, szc, color, PC_ALLOC);
+				    pp->p_szc, szc, ccolor, PC_ALLOC);
 				if (ret_pp) {
 					page_freelist_unlock(mnode);
 #if defined(__sparc)
@@ -2146,20 +2466,37 @@
 			}
 			page_freelist_unlock(mnode);
 		}
-		nszc++;
-	}
-
-	/*
-	 * Ok that didn't work. Time to coalesce.
-	 */
-	if (szc != 0) {
-		ret_pp = page_freelist_coalesce(mnode, szc, color);
-		VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]);
+
+		/* loop through next size bins */
+		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
+		plw->plw_bins[nszc]--;
+
+		if (bin == sbin) {
+			uchar_t nnszc = nszc + 1;
+
+			/* we are done with this page size - check next */
+			if (plw->plw_bins[nnszc] == 0)
+				/* we have already checked next size bins */
+				break;
+
+			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
+			if (bin_prev != INVALID_COLOR) {
+				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
+				if (!((bin ^ bin_prev) &
+				    plw->plw_ceq_mask[nnszc]))
+					break;
+			}
+			ASSERT(nnszc < mmu_page_sizes);
+			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
+			nszc = nnszc;
+			ASSERT(nszc < mmu_page_sizes);
+		}
 	}
 
 	return (ret_pp);
 }
 
+
 /*
  * Helper routine used only by the freelist code to lock
  * a page. If the page is a large page then it succeeds in
@@ -2206,80 +2543,256 @@
 	return (1);
 }
 
+/*
+ * init context for walking page lists
+ * Called when a page of the given szc is unavailable. Sets markers
+ * for the beginning of the search to detect when search has
+ * completed a full cycle. Sets flags for splitting larger pages
+ * and coalescing smaller pages. Page walking proceeds until a page
+ * of the desired equivalent color is found.
+ */
+void
+page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
+    int use_ceq, page_list_walker_t *plw)
+{
+	uint_t  nszc, ceq_mask, colors;
+	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
+
+	ASSERT(szc < mmu_page_sizes);
+	colors = PAGE_GET_PAGECOLORS(szc);
+
+	plw->plw_colors = colors;
+	plw->plw_color_mask = colors - 1;
+	plw->plw_bin_marker = plw->plw_bin0 = bin;
+	plw->plw_bin_split_prev = bin;
+	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
+
+	/*
+	 * if vac aliasing is possible make sure lower order color
+	 * bits are never ignored
+	 */
+	if (vac_colors > 1)
+		ceq &= 0xf0;
+
+	/*
+	 * calculate the number of non-equivalent colors and
+	 * color equivalency mask
+	 */
+	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
+	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
+	ASSERT(plw->plw_ceq_dif > 0);
+	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
+
+	if (flags & PG_MATCH_COLOR) {
+		if (cpu_page_colors <  0) {
+			/*
+			 * this is a heterogeneous machine with different CPUs
+			 * having different size e$ (not supported for ni2/rock)
+			 */
+			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
+			cpucolors = MAX(cpucolors, 1);
+			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
+			plw->plw_ceq_mask[szc] =
+			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
+		}
+		plw->plw_ceq_dif = 1;
+	}
+
+	/* we can split pages in the freelist, but not the cachelist */
+	if (can_split) {
+	    plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
+
+	    /* calculate next sizes color masks and number of free list bins */
+	    for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
+		plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
+		    plw->plw_ceq_mask[szc]);
+		plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
+	    }
+	    plw->plw_ceq_mask[nszc] = INVALID_MASK;
+	    plw->plw_bins[nszc] = 0;
+
+	} else {
+	    ASSERT(szc == 0);
+	    plw->plw_do_split = 0;
+	    plw->plw_bins[1] = 0;
+	    plw->plw_ceq_mask[1] = INVALID_MASK;
+	}
+}
+
+/*
+ * set mark to flag where next split should occur
+ */
+#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
+	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
+	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
+	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
+	plw->plw_split_next =						     \
+		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
+	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
+		plw->plw_split_next =					     \
+		INC_MASKED(plw->plw_split_next,				     \
+		    neq_mask, plw->plw_color_mask);			     \
+	}								     \
+}
+
+uint_t
+page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
+{
+	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
+	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
+	uchar_t nszc = szc + 1;
+
+	nbin = ADD_MASKED(bin,
+	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
+
+	if (plw->plw_do_split) {
+		plw->plw_bin_split_prev = bin;
+		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
+		plw->plw_do_split = 0;
+	}
+
+	if (szc == 0) {
+		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
+			if (nbin == plw->plw_bin0 &&
+			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
+				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
+				    neq_mask, plw->plw_color_mask);
+				plw->plw_bin_split_prev = plw->plw_bin0;
+			}
+
+			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
+				plw->plw_bin_marker =
+				    nbin = INC_MASKED(nbin, neq_mask,
+					plw->plw_color_mask);
+				plw->plw_bin_split_prev = plw->plw_bin0;
+				/*
+				 * large pages all have the same vac color
+				 * so by now we should be done with next
+				 * size page splitting process
+				 */
+				ASSERT(plw->plw_bins[1] == 0);
+				plw->plw_do_split = 0;
+				return (nbin);
+			}
+
+		} else {
+			uint_t bin_jump = (vac_colors == 1) ?
+			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
+
+			bin_jump &= ~(vac_colors - 1);
+
+			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
+			    plw->plw_color_mask);
+
+			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
+
+				plw->plw_bin_marker = nbin = nbin0;
+
+				if (plw->plw_bins[nszc] != 0) {
+					/*
+					 * check if next page size bin is the
+					 * same as the next page size bin for
+					 * bin0
+					 */
+					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
+					    nbin);
+					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
+					    plw->plw_bin0);
+
+					if ((bin0_nsz ^ nbin_nsz) &
+					    plw->plw_ceq_mask[nszc])
+						plw->plw_do_split = 1;
+				}
+				return (nbin);
+			}
+		}
+	}
+
+	if (plw->plw_bins[nszc] != 0) {
+	    nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
+	    if (!((plw->plw_split_next ^ nbin_nsz) &
+		plw->plw_ceq_mask[nszc]))
+		plw->plw_do_split = 1;
+	}
+
+	return (nbin);
+}
+
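A condensed sketch (not part of the patch; details elided) of the walk
pattern that page_get_mnode_freelist() below follows: try every bin in the
starting equivalence class, then step to the next class with
page_list_walk_next_bin() until plw_ceq_dif classes have been visited.

	page_list_walker_t plw;
	uint_t bin = start_bin, sbin;	/* start_bin: caller's first bin */

	page_list_walk_init(szc, flags, bin, 1 /* can_split */,
	    1 /* use_ceq */, &plw);
	for (plw.plw_count = 0; plw.plw_count < plw.plw_ceq_dif;
	    plw.plw_count++) {
		sbin = bin;
		do {
			/* try PAGE_FREELISTS(mnode, szc, bin, mtype) here */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
		} while (sbin != bin);
		bin = page_list_walk_next_bin(szc, bin, &plw);
	}
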
 page_t *
 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
     uint_t flags)
 {
-	kmutex_t	*pcm;
-	int		i, fill_tried, fill_marker;
-	page_t		*pp, *first_pp;
-	uint_t		bin_marker;
-	int		colors, cpucolors;
-	uchar_t		nszc;
-	uint_t		nszc_color_shift;
-	int		nwaybins = 0, nwaycnt;
+	kmutex_t		*pcm;
+	page_t			*pp, *first_pp;
+	uint_t			sbin;
+	int			plw_initialized;
+	page_list_walker_t	plw;
 
 	ASSERT(szc < mmu_page_sizes);
 
 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
 
 	MTYPE_START(mnode, mtype, flags);
-	if (mtype < 0) {	/* mnode foes not have memory in mtype range */
+	if (mtype < 0) {	/* mnode does not have memory in mtype range */
 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
 		return (NULL);
 	}
-
-	/*
-	 * Set how many physical colors for this page size.
-	 */
-	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
-	    page_colors;
-
-	nszc = MIN(szc + 1, mmu_page_sizes - 1);
-	nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);
-
-	/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
-	cpucolors = cpu_page_colors;
-
-	/*
-	 * adjust cpucolors to possibly check additional 'equivalent' bins
-	 * to try to minimize fragmentation of large pages by delaying calls
-	 * to page_freelist_fill.
-	 */
-	if (colorequiv > 1) {
-		int equivcolors = colors / colorequiv;
-
-		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
-			cpucolors = equivcolors;
-	}
-
-	ASSERT(colors <= page_colors);
-	ASSERT(colors);
-	ASSERT((colors & (colors - 1)) == 0);
-
-	ASSERT(bin < colors);
+try_again:
+
+	plw_initialized = 0;
+	plw.plw_ceq_dif = 1;
 
 	/*
 	 * Only hold one freelist lock at a time, that way we
 	 * can start anywhere and not have to worry about lock
 	 * ordering.
 	 */
-big_try_again:
-	fill_tried = 0;
-	nwaycnt = 0;
-	for (i = 0; i <= colors; i++) {
-try_again:
-		ASSERT(bin < colors);
-		if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
+	for (plw.plw_count = 0;
+	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
+		sbin = bin;
+		do {
+			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
+				goto bin_empty_1;
+
 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
 			mutex_enter(pcm);
 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
-			if (pp != NULL) {
-				/*
-				 * These were set before the page
-				 * was put on the free list,
-				 * they must still be set.
-				 */
+			if (pp == NULL)
+				goto bin_empty_0;
+
+			/*
+			 * These were set before the page
+			 * was put on the free list,
+			 * they must still be set.
+			 */
+			ASSERT(PP_ISFREE(pp));
+			ASSERT(PP_ISAGED(pp));
+			ASSERT(pp->p_vnode == NULL);
+			ASSERT(pp->p_hash == NULL);
+			ASSERT(pp->p_offset == (u_offset_t)-1);
+			ASSERT(pp->p_szc == szc);
+			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
+
+			/*
+			 * Walk down the hash chain.
+			 * 8k pages are linked on p_next
+			 * and p_prev fields. Large pages
+			 * are a contiguous group of
+			 * constituent pages linked together
+			 * on their p_next and p_prev fields.
+			 * The large pages are linked together
+			 * on the hash chain using p_vpnext
+			 * p_vpprev of the base constituent
+			 * page of each large page.
+			 */
+			first_pp = pp;
+			while (!page_trylock_cons(pp, SE_EXCL)) {
+				if (szc == 0) {
+					pp = pp->p_next;
+				} else {
+					pp = pp->p_vpnext;
+				}
+
 				ASSERT(PP_ISFREE(pp));
 				ASSERT(PP_ISAGED(pp));
 				ASSERT(pp->p_vnode == NULL);
@@ -2288,188 +2801,89 @@
 				ASSERT(pp->p_szc == szc);
 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
 
-				/*
-				 * Walk down the hash chain.
-				 * 8k pages are linked on p_next
-				 * and p_prev fields. Large pages
-				 * are a contiguous group of
-				 * constituent pages linked together
-				 * on their p_next and p_prev fields.
-				 * The large pages are linked together
-				 * on the hash chain using p_vpnext
-				 * p_vpprev of the base constituent
-				 * page of each large page.
-				 */
-				first_pp = pp;
-				while (!page_trylock_cons(pp, SE_EXCL)) {
-					if (szc == 0) {
-						pp = pp->p_next;
-					} else {
-						pp = pp->p_vpnext;
-					}
-
-					ASSERT(PP_ISFREE(pp));
-					ASSERT(PP_ISAGED(pp));
-					ASSERT(pp->p_vnode == NULL);
-					ASSERT(pp->p_hash == NULL);
-					ASSERT(pp->p_offset == (u_offset_t)-1);
-					ASSERT(pp->p_szc == szc);
-					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
-							mnode);
-
-					if (pp == first_pp) {
-						pp = NULL;
-						break;
-					}
-				}
-
-				if (pp) {
-					ASSERT(mtype == PP_2_MTYPE(pp));
-					ASSERT(pp->p_szc == szc);
-					if (szc == 0) {
-						page_sub(&PAGE_FREELISTS(mnode,
-						    szc, bin, mtype), pp);
-					} else {
-						page_vpsub(&PAGE_FREELISTS(
-						    mnode, szc, bin, mtype),
-						    pp);
-						CHK_LPG(pp, szc);
-					}
-					page_ctr_sub(mnode, mtype, pp,
-					    PG_FREE_LIST);
-
-					if ((PP_ISFREE(pp) == 0) ||
-					    (PP_ISAGED(pp) == 0))
-						panic("free page is not. pp %p",
-						    (void *)pp);
-					mutex_exit(pcm);
+				if (pp == first_pp)
+					goto bin_empty_0;
+			}
+
+			ASSERT(pp != NULL);
+			ASSERT(mtype == PP_2_MTYPE(pp));
+			ASSERT(pp->p_szc == szc);
+			if (szc == 0) {
+				page_sub(&PAGE_FREELISTS(mnode,
+				    szc, bin, mtype), pp);
+			} else {
+				page_vpsub(&PAGE_FREELISTS(mnode,
+				    szc, bin, mtype), pp);
+				CHK_LPG(pp, szc);
+			}
+			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
+
+			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
+				panic("free page is not. pp %p", (void *)pp);
+			mutex_exit(pcm);
 
 #if defined(__sparc)
-					ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
-					    (flags & PG_NORELOC) == 0);
-
-					if (PP_ISNORELOC(pp)) {
-						pgcnt_t	npgs;
-
-						npgs = page_get_pagecnt(szc);
-						kcage_freemem_sub(npgs);
-					}
+			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
+			    (flags & PG_NORELOC) == 0);
+
+			if (PP_ISNORELOC(pp))
+				kcage_freemem_sub(page_get_pagecnt(szc));
 #endif
-					VM_STAT_ADD(vmm_vmstats.
-					    pgmf_allocok[szc]);
-					return (pp);
-				}
+			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
+			return (pp);
+
+bin_empty_0:
+			mutex_exit(pcm);
+bin_empty_1:
+			if (plw_initialized == 0) {
+				page_list_walk_init(szc, flags, bin, 1, 1,
+				    &plw);
+				plw_initialized = 1;
+				ASSERT(plw.plw_colors <=
+				    PAGE_GET_PAGECOLORS(szc));
+				ASSERT(plw.plw_colors > 0);
+				ASSERT((plw.plw_colors &
+				    (plw.plw_colors - 1)) == 0);
+				ASSERT(bin < plw.plw_colors);
+				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
 			}
-			mutex_exit(pcm);
-		}
+			/* calculate the next bin with equivalent color */
+			bin = ADD_MASKED(bin, plw.plw_bin_step,
+			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
+		} while (sbin != bin);
 
 		/*
-		 * Wow! The initial bin is empty.
-		 * If specific color is needed, check if page color may be
-		 * in other bins. cpucolors is:
-		 *   0	if the colors for this cpu is equal to page_colors.
-		 *	This means that pages with a particular color are in a
-		 *	single bin.
-		 *  -1	if colors of cpus (cheetah+) are heterogenous. Need to
-		 *	first determine the colors for the current cpu.
-		 *  >0	colors of all cpus are homogenous and < page_colors
-		 */
-
-		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
-			if (!nwaybins) {
-				/*
-				 * cpucolors is negative if ecache setsizes
-				 * are heterogenous. determine colors for this
-				 * particular cpu.
-				 */
-				if (cpucolors < 0) {
-					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
-					ASSERT(cpucolors > 0);
-					nwaybins = colors / cpucolors;
-				} else {
-					nwaybins = colors / cpucolors;
-					ASSERT(szc > 0 || nwaybins > 1);
-				}
-				if (nwaybins < 2)
-					cpucolors = 0;
-			}
-
-			if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
-				nwaycnt++;
-				bin = (bin + (colors / nwaybins)) &
-				    (colors - 1);
-				if (nwaycnt < nwaybins) {
-					goto try_again;
-				}
-			}
-			/* back to initial color if fall-thru */
-		}
-
-		/*
-		 * color bins are all empty if color match. Try and satisfy
-		 * the request by breaking up or coalescing pages from
-		 * a different size freelist of the correct color that
-		 * satisfies the ORIGINAL color requested. If that
-		 * fails then try pages of the same size but different
-		 * colors assuming we are not called with
+		 * All color bins for this equivalence class are empty.
+		 * Try to satisfy the request by breaking up or
+		 * coalescing pages from a different size freelist of
+		 * the correct color that satisfies the ORIGINAL color
+		 * requested. If that fails, try pages of the same
+		 * size but different colors, assuming we are not
+		 * called with
 		 */
-		if (!fill_tried) {
-			fill_tried = 1;
-			fill_marker = bin >> nszc_color_shift;
-			pp = page_freelist_fill(szc, bin, mnode, mtype,
-			    PFNNULL);
-			if (pp != NULL) {
-				return (pp);
-			}
-		}
-
-		if (flags & PG_MATCH_COLOR)
-			break;
-
-		/*
-		 * Select next color bin to try.
-		 */
-		if (szc == 0) {
-			/*
-			 * PAGESIZE page case.
-			 */
-			if (i == 0) {
-				bin = (bin + BIN_STEP) & page_colors_mask;
-				bin_marker = bin;
-			} else {
-				bin = (bin + vac_colors) & page_colors_mask;
-				if (bin == bin_marker) {
-					bin = (bin + 1) & page_colors_mask;
-					bin_marker = bin;
-				}
-			}
-		} else {
-			/*
-			 * Large page case.
-			 */
-			bin = (bin + 1) & (colors - 1);
-		}
-		/*
-		 * If bin advanced to the next color bin of the
-		 * next larger pagesize, there is a chance the fill
-		 * could succeed.
-		 */
-		if (fill_marker != (bin >> nszc_color_shift))
-			fill_tried = 0;
+		if (plw.plw_do_split &&
+		    (pp = page_freelist_split(szc, bin, mnode,
+		    mtype, PFNNULL, &plw)) != NULL)
+			return (pp);
+
+		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
+		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
+			return (pp);
+
+		if (plw.plw_ceq_dif > 1)
+			bin = page_list_walk_next_bin(szc, bin, &plw);
 	}
 
 	/* if allowed, cycle through additional mtypes */
 	MTYPE_NEXT(mnode, mtype, flags);
 	if (mtype >= 0)
-		goto big_try_again;
+		goto try_again;
 
 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
 
 	return (NULL);
 }
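The loop above replaces the old bin-hopping with a two-level walk: an inner do-while that sweeps every bin whose color is equivalent under plw_ceq_mask, and an outer loop over the plw_ceq_dif equivalence classes. A minimal user-land sketch of the inner sweep follows; next_equiv_bin() is an assumed stand-in for the ADD_MASKED() macro, whose definition is not shown in this diff:

#include <stdio.h>

typedef unsigned int uint_t;

/*
 * Assumed stand-in for the kernel's ADD_MASKED(): keep the significant
 * color bits in ceq_mask fixed and advance the remaining bits of the
 * bin by 'step', wrapping within color_mask.
 */
static uint_t
next_equiv_bin(uint_t bin, uint_t step, uint_t ceq_mask, uint_t color_mask)
{
	uint_t walk = color_mask & ~ceq_mask;	/* bits free to vary */

	while (step-- != 0) {
		/* add one confined to the 'walk' bit positions */
		bin = (((bin | ~walk) + 1) & walk) | (bin & ceq_mask);
	}
	return (bin);
}

/* sweep every bin equivalent to the start bin exactly once */
int
main(void)
{
	uint_t bin = 0x5, sbin = bin;

	do {
		printf("probe bin %#x\n", bin);	/* kernel: PAGE_FREELISTS */
		bin = next_equiv_bin(bin, 1, 0xc, 0xf);
	} while (bin != sbin);
	return (0);
}

With color_mask 0xf and ceq_mask 0xc, the sweep starting at bin 0x5 visits 0x5, 0x6, 0x7, 0x4 and terminates, matching the sbin != bin exit test above.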
 
-
 /*
  * Returns the count of free pages for 'pp' with size code 'szc'.
  * Note: This function does not return an exact value as the page freelist
@@ -2785,13 +3199,13 @@
 }
 
 /*
- * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
+ * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
  * page with size code 'szc'. Claiming such a page requires acquiring
  * exclusive locks on all constituent pages (page_trylock_contig_pages),
  * relocating pages in use and concatenating these constituent pages into a
  * large page.
  *
- * The page lists do not have such a large page and page_freelist_fill has
+ * The page lists do not have such a large page and page_freelist_split has
  * already failed to demote larger pages and/or coalesce smaller free pages.
  *
  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
@@ -2810,7 +3224,9 @@
 	pgcnt_t szcpgmask = szcpgcnt - 1;
 	pfn_t	randpfn;
 	page_t *pp, *randpp, *endpp;
-	uint_t colors;
+	uint_t colors, ceq_mask;
+	/* LINTED : set but not used in function */
+	uint_t color_mask;
 	pfn_t hi, lo;
 	uint_t skip;
 
@@ -2821,11 +3237,23 @@
 
 	ASSERT(szc < mmu_page_sizes);
 
-	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
-	    page_colors;
+	colors = PAGE_GET_PAGECOLORS(szc);
+	color_mask = colors - 1;
+	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
+		uchar_t ceq = colorequivszc[szc];
+		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
+
+		ASSERT(ceq_dif > 0);
+		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
+	} else {
+		ceq_mask = 0;
+	}
 
 	ASSERT(bin < colors);
 
+	/* clear "non-significant" color bits */
+	bin &= ceq_mask;
+
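The colorequivszc[] byte decoded just above packs two 4-bit fields: the low nibble is a shift and the high nibble counts the color bits collapsed by cache equivalence (the encoding side appears in the page_coloring_init hunks below). A worked decode with assumed values, illustrative only:

#include <stdio.h>

int
main(void)
{
	unsigned int colors = 16;	/* assumed page colors at this szc */
	unsigned char ceq = 2 << 4;	/* as set when cpu_page_colors == 4 */
	unsigned int shift = ceq & 0xf;
	unsigned int ceq_dif = colors >> ((ceq >> 4) + shift);
	unsigned int ceq_mask = (ceq_dif - 1) << shift;

	/* 16 colors collapsed by 2 bits: 4 groups, significant mask 0x3 */
	printf("ceq_dif=%u ceq_mask=%#x\n", ceq_dif, ceq_mask);
	return (0);
}

So with 16 colors and 2 collapsed bits there are 4 equivalence groups, and only the low two color bits of a bin are significant; bins that agree in those bits are interchangeable.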
 	/*
 	 * trim the pfn range to search based on pfnflag. pfnflag is set
 	 * when there have been previous page_get_contig_page failures to
@@ -2889,38 +3317,25 @@
 		 * set lo to point to the pfn for the desired bin. Large
 		 * page sizes may only have a single page color
 		 */
-		if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
-			uint_t	lobin;
-
-			/*
-			 * factor in colorequiv to check additional
-			 * 'equivalent' bins.
-			 */
-			if (colorequiv > 1 && colors > colorequiv)
-				colors = colors / colorequiv;
-
-			/* determine bin that lo currently points to */
-			lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt;
-
-			/*
-			 * set lo to point at appropriate color and set skip
-			 * to arrive at the next szc page of the same color.
-			 */
-			lo += ((bin - lobin) & (colors - 1)) * szcpgcnt;
-
-			skip = colors * szcpgcnt;
-		} else {
-			/* check all pages starting from lo */
-			skip = szcpgcnt;
+		skip = szcpgcnt;
+		if (ceq_mask > 0) {
+			/* set lo to point at appropriate color */
+			PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
+			    color_mask);
+			if (hi <= lo)
+				/* mseg cannot satisfy color request */
+				continue;
 		}
-		if (hi <= lo)
-			/* mseg cannot satisfy color request */
-			continue;
 
 		/* randomly choose a point between lo and hi to begin search */
 
 		randpfn = (pfn_t)GETTICK();
 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
+		if (ceq_mask) {
+			PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, ceq_mask,
+			    color_mask);
+			randpfn = (randpfn >= hi) ? lo : randpfn;
+		}
 		randpp = mseg->pages + (randpfn - mseg->pages_base);
 
 		ASSERT(randpp->p_pagenum == randpfn);
@@ -2932,9 +3347,8 @@
 
 		do {
 			ASSERT(!(pp->p_pagenum & szcpgmask));
-			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
-			    colorequiv > 1 ||
-			    PP_2_BIN(pp) == bin);
+			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
+
 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
 				/* pages unlocked by page_claim on failure */
 				if (page_claim_contig_pages(pp, szc, flags)) {
@@ -2943,7 +3357,15 @@
 				}
 			}
 
-			pp += skip;
+			if (ceq_mask == 0) {
+				pp += skip;
+			} else {
+				pfn_t pfn = pp->p_pagenum;
+
+				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
+				    ceq_mask, color_mask);
+				pp = mseg->pages + (pfn - mseg->pages_base);
+			}
 			if (pp >= endpp) {
 				/* start from the beginning */
 				pp = mseg->pages + (lo - mseg->pages_base);
@@ -3095,11 +3517,9 @@
 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
 
 	/* LINTED */
-	AS_2_BIN(as, seg, vp, vaddr, bin);
-
-	/* bin is for base pagesize color - convert if larger pagesize. */
-	if (szc)
-		bin = page_convert_color(0, szc, bin);
+	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
+
+	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
 
 	/*
 	 * Try to get a local page first, but try remote if we can't
@@ -3229,9 +3649,9 @@
 	}
 
 	/* LINTED */
-	AS_2_BIN(as, seg, vp, vaddr, bin);
-
-	ASSERT(bin <= page_colors_mask);
+	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
+
+	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
 
 	/* LINTED */
 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
@@ -3294,13 +3714,11 @@
 page_t *
 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
 {
-	kmutex_t	*pcm;
-	int		i;
-	page_t		*pp;
-	page_t		*first_pp;
-	uint_t		bin_marker;
-	int		nwaybins, nwaycnt;
-	int		cpucolors;
+	kmutex_t		*pcm;
+	page_t			*pp, *first_pp;
+	uint_t			sbin;
+	int			plw_initialized;
+	page_list_walker_t	plw;
 
 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
 
@@ -3311,19 +3729,10 @@
 		return (NULL);
 	}
 
-	nwaybins = 0;
-	cpucolors = cpu_page_colors;
-	/*
-	 * adjust cpucolors to possibly check additional 'equivalent' bins
-	 * to try to minimize fragmentation of large pages by delaying calls
-	 * to page_freelist_fill.
-	 */
-	if (colorequiv > 1) {
-		int equivcolors = page_colors / colorequiv;
-
-		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
-			cpucolors = equivcolors;
-	}
+try_again:
+
+	plw_initialized = 0;
+	plw.plw_ceq_dif = 1;
 
 	/*
 	 * Only hold one cachelist lock at a time, that way we
@@ -3331,128 +3740,96 @@
 	 * ordering.
 	 */
 
-big_try_again:
-	nwaycnt = 0;
-	for (i = 0; i <= page_colors; i++) {
-		if (PAGE_CACHELISTS(mnode, bin, mtype)) {
+	for (plw.plw_count = 0;
+	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
+		sbin = bin;
+		do {
+
+			if (!PAGE_CACHELISTS(mnode, bin, mtype))
+				goto bin_empty_1;
 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
 			mutex_enter(pcm);
 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
-			if (pp != NULL) {
-				first_pp = pp;
+			if (pp == NULL)
+				goto bin_empty_0;
+
+			first_pp = pp;
+			ASSERT(pp->p_vnode);
+			ASSERT(PP_ISAGED(pp) == 0);
+			ASSERT(pp->p_szc == 0);
+			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
+			while (!page_trylock(pp, SE_EXCL)) {
+				pp = pp->p_next;
+				ASSERT(pp->p_szc == 0);
+				if (pp == first_pp) {
+					/*
+					 * We have searched the complete list!
+					 * And all of them (might only be one)
+					 * are locked. This can happen since
+					 * these pages can also be found via
+					 * the hash list. When found via the
+					 * hash list, they are locked first,
+					 * then removed. We give up to let the
+					 * other thread run.
+					 */
+					pp = NULL;
+					break;
+				}
+				ASSERT(pp->p_vnode);
+				ASSERT(PP_ISFREE(pp));
+				ASSERT(PP_ISAGED(pp) == 0);
+				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
+				    mnode);
+			}
+
+			if (pp) {
+				page_t	**ppp;
+				/*
+				 * Found and locked a page.
+				 * Pull it off the list.
+				 */
+				ASSERT(mtype == PP_2_MTYPE(pp));
+				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
+				page_sub(ppp, pp);
+				/*
+				 * Subtract counters before releasing pcm mutex
+				 * to avoid a race with page_freelist_coalesce
+				 * and page_freelist_split.
+				 */
+				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
+				mutex_exit(pcm);
 				ASSERT(pp->p_vnode);
 				ASSERT(PP_ISAGED(pp) == 0);
-				ASSERT(pp->p_szc == 0);
-				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
-				while (!page_trylock(pp, SE_EXCL)) {
-					pp = pp->p_next;
-					ASSERT(pp->p_szc == 0);
-					if (pp == first_pp) {
-						/*
-						 * We have searched the
-						 * complete list!
-						 * And all of them (might
-						 * only be one) are locked.
-						 * This can happen since
-						 * these pages can also be
-						 * found via the hash list.
-						 * When found via the hash
-						 * list, they are locked
-						 * first, then removed.
-						 * We give up to let the
-						 * other thread run.
-						 */
-						pp = NULL;
-						break;
-					}
-					ASSERT(pp->p_vnode);
-					ASSERT(PP_ISFREE(pp));
-					ASSERT(PP_ISAGED(pp) == 0);
-					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
-							mnode);
+#if defined(__sparc)
+				ASSERT(!kcage_on ||
+				    (flags & PG_NORELOC) == 0 ||
+				    PP_ISNORELOC(pp));
+				if (PP_ISNORELOC(pp)) {
+					kcage_freemem_sub(1);
 				}
-
-				if (pp) {
-					page_t	**ppp;
-					/*
-					 * Found and locked a page.
-					 * Pull it off the list.
-					 */
-					ASSERT(mtype == PP_2_MTYPE(pp));
-					ppp = &PAGE_CACHELISTS(mnode, bin,
-					    mtype);
-					page_sub(ppp, pp);
-					/*
-					 * Subtract counters before releasing
-					 * pcm mutex to avoid a race with
-					 * page_freelist_coalesce and
-					 * page_freelist_fill.
-					 */
-					page_ctr_sub(mnode, mtype, pp,
-					    PG_CACHE_LIST);
-					mutex_exit(pcm);
-					ASSERT(pp->p_vnode);
-					ASSERT(PP_ISAGED(pp) == 0);
-#if defined(__sparc)
-					ASSERT(!kcage_on ||
-					    (flags & PG_NORELOC) == 0 ||
-					    PP_ISNORELOC(pp));
-					if (PP_ISNORELOC(pp)) {
-						kcage_freemem_sub(1);
-					}
 #endif
-					VM_STAT_ADD(vmm_vmstats.
-					    pgmc_allocok);
-					return (pp);
-				}
+				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
+				return (pp);
 			}
+bin_empty_0:
 			mutex_exit(pcm);
-		}
-
-		/*
-		 * Wow! The initial bin is empty or no page in the bin could
-		 * be locked.
-		 *
-		 * If specific color is needed, check if page color may be in
-		 * other bins.
-		 */
-		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
-			if (!nwaybins) {
-				if (cpucolors < 0) {
-					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
-					ASSERT(cpucolors > 0);
-					nwaybins = page_colors / cpucolors;
-					if (nwaybins < 2)
-						cpucolors = 0;
-				} else {
-					nwaybins = page_colors / cpucolors;
-					ASSERT(nwaybins > 1);
-				}
+bin_empty_1:
+			if (plw_initialized == 0) {
+				page_list_walk_init(0, flags, bin, 0, 1, &plw);
+				plw_initialized = 1;
 			}
-
-			if (++nwaycnt >= nwaybins) {
-				break;
-			}
-			bin = (bin + (page_colors / nwaybins)) &
-			    page_colors_mask;
-			continue;
-		}
-
-		if (i == 0) {
-			bin = (bin + BIN_STEP) & page_colors_mask;
-			bin_marker = bin;
-		} else {
-			bin = (bin + vac_colors) & page_colors_mask;
-			if (bin == bin_marker) {
-				bin = (bin + 1) & page_colors_mask;
-				bin_marker = bin;
-			}
-		}
+			/* calculate the next bin with equivalent color */
+			bin = ADD_MASKED(bin, plw.plw_bin_step,
+			    plw.plw_ceq_mask[0], plw.plw_color_mask);
+		} while (sbin != bin);
+
+		if (plw.plw_ceq_dif > 1)
+			bin = page_list_walk_next_bin(0, bin, &plw);
 	}
 
 	MTYPE_NEXT(mnode, mtype, flags);
 	if (mtype >= 0)
-		goto big_try_again;
+		goto try_again;
 
 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
 	return (NULL);
--- a/usr/src/uts/i86pc/vm/vm_dep.h	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/i86pc/vm/vm_dep.h	Sat Oct 21 06:27:59 2006 -0700
@@ -233,10 +233,20 @@
 extern int		mtype_func(int, int, uint_t);
 extern void		mtype_modify_max(pfn_t, long);
 extern int		mnode_pgcnt(int);
+extern int		mnode_range_cnt(int);
 
 #define	NUM_MEM_RANGES	4		/* memory range types */
 
 /*
+ * candidate counters in vm_pagelist.c are indexed by color and range
+ */
+#define	MAX_MNODE_MRANGES	NUM_MEM_RANGES
+#define	MNODE_RANGE_CNT(mnode)	mnode_range_cnt(mnode)
+#define	MNODE_MAX_MRANGE(mnode)	(memrange_num(mem_node_config[mnode].physbase))
+#define	MTYPE_2_MRANGE(mnode, mtype)	\
+	(mnode_maxmrange[mnode] - mnoderanges[mtype].mnr_memrange)
+
+/*
  * Per page size free lists. Allocated dynamically.
  * dimensions [mtype][mmu_page_sizes][colors]
  *
@@ -274,10 +284,51 @@
 extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
 extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
 
+#define	PAGE_GET_COLOR_SHIFT(szc, nszc)                          \
+	    (hw_page_array[(nszc)].hp_shift - hw_page_array[(szc)].hp_shift)
+
+#define	PFN_2_COLOR(pfn, szc)						\
+	(((pfn) & page_colors_mask) >>			                \
+	(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
+
+#define	PNUM_SIZE(szc)							\
+	(hw_page_array[(szc)].hp_pgcnt)
+#define	PNUM_SHIFT(szc)							\
+	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
+#define	PAGE_GET_SHIFT(szc)						\
+	(hw_page_array[(szc)].hp_shift)
+#define	PAGE_GET_PAGECOLORS(szc)					\
+	(hw_page_array[(szc)].hp_colors)
+
+/*
+ * This macro calculates the next sequential pfn with the specified
+ * color, using the color equivalency mask.
+ */
+#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask)        \
+	ASSERT(((color) & ~(ceq_mask)) == 0);                                 \
+	{								      \
+		uint_t	pfn_shift = PAGE_BSZS_SHIFT(szc);                     \
+		pfn_t	spfn = pfn >> pfn_shift;                              \
+		pfn_t	stride = (ceq_mask) + 1;                              \
+		ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);                 \
+		if (((spfn ^ (color)) & (ceq_mask)) == 0) {                   \
+			pfn += stride << pfn_shift;                           \
+		} else {                                                      \
+			pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color);          \
+			pfn = (pfn > spfn ? pfn : pfn + stride) << pfn_shift; \
+		}                                                             \
+	}
+
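A function transcription of PAGE_NEXT_PFN_FOR_COLOR() above may make the two cases easier to follow; this is an illustrative sketch with simplified types, where pfn_shift stands for PAGE_BSZS_SHIFT(szc):

#include <assert.h>
#include <stdio.h>

typedef unsigned long pfn_t;
typedef unsigned int uint_t;

/* same logic as the macro above, without the CPU-specific hook */
static pfn_t
next_pfn_for_color(pfn_t pfn, uint_t pfn_shift, uint_t color, uint_t ceq_mask)
{
	pfn_t spfn = pfn >> pfn_shift;		/* pfn in units of szc pages */
	pfn_t stride = (pfn_t)ceq_mask + 1;

	assert((color & ~ceq_mask) == 0);
	assert((stride & ceq_mask) == 0);	/* mask + 1 is a power of 2 */

	if (((spfn ^ color) & ceq_mask) == 0)	/* already the right color */
		return (pfn + (stride << pfn_shift));

	/* force the color bits, then make sure we moved forward */
	pfn = (spfn & ~(pfn_t)ceq_mask) | color;
	return ((pfn > spfn ? pfn : pfn + stride) << pfn_shift);
}

int
main(void)
{
	/* szc 0: pfn 5 -> 6 (nearest pfn of color 2), pfn 6 -> 10 */
	printf("%lu %lu\n", next_pfn_for_color(5, 0, 2, 0x3),
	    next_pfn_for_color(6, 0, 2, 0x3));
	return (0);
}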
+/* get the color equivalency mask for the next szc */
+#define	PAGE_GET_NSZ_MASK(szc, mask)                                         \
+	((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))
+
+/* get the color of the next szc */
+#define	PAGE_GET_NSZ_COLOR(szc, color)                                       \
+	((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))
+
 /* Find the bin for the given page if it was of size szc */
-#define	PP_2_BIN_SZC(pp, szc)						\
-	(((pp->p_pagenum) & page_colors_mask) >>			\
-	(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
+#define	PP_2_BIN_SZC(pp, szc)	(PFN_2_COLOR(pp->p_pagenum, szc))
 
 #define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, pp->p_szc))
 
@@ -288,6 +339,33 @@
 #define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))
 #define	PFN_BASE(pfnum, szc)	(pfnum & ~(SZCPAGES(szc) - 1))
 
+/*
+ * This structure is used for walking free page lists; it controls
+ * when to split large pages into smaller pages and when to coalesce
+ * smaller pages into larger pages.
+ */
+typedef struct page_list_walker {
+	uint_t	plw_colors;		/* num of colors for szc */
+	uint_t  plw_color_mask;		/* colors-1 */
+	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
+	uint_t  plw_count;		/* loop count */
+	uint_t	plw_bin0;		/* starting bin */
+	uint_t  plw_bin_marker;		/* bin after initial jump */
+	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
+	uint_t  plw_do_split;		/* set if OK to split */
+	uint_t  plw_split_next;		/* next bin to split */
+	uint_t	plw_ceq_dif;		/* number of different color groups */
+					/* to check */
+	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
+	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
+} page_list_walker_t;
+
+void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
+    int can_split, int use_ceq, page_list_walker_t *plw);
+
+uint_t	page_list_walk_next_bin(uchar_t szc, uint_t bin,
+    page_list_walker_t *plw);
+
 extern struct cpu	cpus[];
 #define	CPU0		cpus
 
@@ -494,9 +572,10 @@
  * hash as and addr to get a bin.
  */
 
-#define	AS_2_BIN(as, seg, vp, addr, bin)				\
-	bin = ((((uintptr_t)(addr) >> PAGESHIFT) + ((uintptr_t)(as) >> 4)) \
-	    & page_colors_mask)
+#define	AS_2_BIN(as, seg, vp, addr, bin, szc)				    \
+	bin = (((((uintptr_t)(addr) >> PAGESHIFT) + ((uintptr_t)(as) >> 4)) \
+	    & page_colors_mask) >>					    \
+	    (hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
 
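The AS_2_BIN() hash above is cheap to check by hand; a small sketch with assumed stand-ins for the kernel tunables (PAGESHIFT and the color mask below are illustrative values, not the real configuration):

#include <stdio.h>

#define	PAGESHIFT		12	/* assumed base page shift */
#define	PAGE_COLORS_MASK	0xff	/* assumed: 256 base colors */

int
main(void)
{
	unsigned long addr = 0x7fff12345000UL;	/* faulting address */
	unsigned long as = 0xdeadbee0UL;	/* address space pointer */
	unsigned int bin;

	/* szc 0: the final shift in AS_2_BIN() is zero */
	bin = ((addr >> PAGESHIFT) + (as >> 4)) & PAGE_COLORS_MASK;
	printf("bin=%#x\n", bin);	/* prints bin=0x33 */
	return (0);
}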
 /*
  * cpu private vm data - accessed thru CPU->cpu_vm_data
@@ -575,19 +654,23 @@
 	ulong_t	plsub_cache;
 	ulong_t	plsubpages_szcbig;
 	ulong_t	plsubpages_szc0;
-	ulong_t	pff_req[MMU_PAGE_SIZES];	/* page_freelist_fill */
-	ulong_t	pff_demote[MMU_PAGE_SIZES];
-	ulong_t	pff_coalok[MMU_PAGE_SIZES];
+	ulong_t	pfs_req[MMU_PAGE_SIZES];	/* page_freelist_split */
+	ulong_t	pfs_demote[MMU_PAGE_SIZES];
+	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 	ulong_t	ppr_reloc[MMU_PAGE_SIZES];	/* page_relocate */
 	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
 	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
 	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
 	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
 	ulong_t ppr_relocok[MMU_PAGE_SIZES];
-	ulong_t page_ctrs_coalesce;	/* page coalesce counter */
-	ulong_t page_ctrs_cands_skip;	/* candidates useful */
-	ulong_t page_ctrs_changed;	/* ctrs changed after locking */
-	ulong_t page_ctrs_failed;	/* page_freelist_coalesce failed */
+	/* page coalesce counter */
+	ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	/* candidates useful */
+	ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	/* ctrs changed after locking */
+	ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	/* page_freelist_coalesce failed */
+	ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 	ulong_t page_ctrs_coalesce_all;	/* page coalesce all counter */
 	ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
 	ulong_t	restrict4gcnt;
@@ -600,7 +683,10 @@
 extern size_t page_ctrs_sz(void);
 extern caddr_t page_ctrs_alloc(caddr_t);
 extern void page_ctr_sub(int, int, page_t *, int);
-extern page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
+extern page_t *page_freelist_split(uchar_t,
+    uint_t, int, int, pfn_t, page_list_walker_t *);
+extern page_t *page_freelist_coalesce(int, uchar_t, uint_t, uint_t, int,
+    pfn_t);
 extern uint_t page_get_pagecolors(uint_t);
 
 #ifdef	__cplusplus
--- a/usr/src/uts/i86pc/vm/vm_machdep.c	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c	Sat Oct 21 06:27:59 2006 -0700
@@ -83,7 +83,7 @@
 #include <sys/memnode.h>
 #include <sys/stack.h>
 
-uint_t vac_colors = 0;
+uint_t vac_colors = 1;
 
 int largepagesupport = 0;
 extern uint_t page_create_new;
@@ -953,16 +953,12 @@
 int		mtype4g;
 
 int
-mnode_range_cnt()
+mnode_range_cnt(int mnode)
 {
 	int	mri;
 	int	mnrcnt = 0;
-	int	mnode;
 
-	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
-		if (mem_node_config[mnode].exists == 0)
-			continue;
-
+	if (mem_node_config[mnode].exists != 0) {
 		mri = nranges - 1;
 
 		/* find the memranges index below contained in mnode range */
@@ -983,6 +979,7 @@
 				break;
 		}
 	}
+	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
 	return (mnrcnt);
 }
 
@@ -1128,15 +1125,6 @@
 	if (i == 0)
 		physmax4g = 1;
 
-	/*
-	 * setup pagesize for generic page layer
-	 */
-	for (i = 0; i <= mmu.max_page_level; ++i) {
-		hw_page_array[i].hp_size = LEVEL_SIZE(i);
-		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
-		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
-	}
-
 	ASSERT(ISP2(l2_sz));
 	ASSERT(ISP2(l2_linesz));
 	ASSERT(l2_sz > MMU_PAGESIZE);
@@ -1164,8 +1152,62 @@
 	ASSERT(ISP2(CPUSETSIZE()));
 	page_coloring_shift = lowbit(CPUSETSIZE());
 
+	/* initialize number of colors per page size */
+	for (i = 0; i <= mmu.max_page_level; i++) {
+		hw_page_array[i].hp_size = LEVEL_SIZE(i);
+		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
+		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
+		hw_page_array[i].hp_colors = (page_colors_mask >>
+		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
+		    + 1;
+	}
+
+	/*
+	 * The value of cpu_page_colors determines if additional color bins
+	 * need to be checked for a particular color in the page_get routines.
+	 */
+	if (cpu_page_colors != 0) {
+
+		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
+		ASSERT(a > 0);
+		ASSERT(a < 16);
+
+		for (i = 0; i <= mmu.max_page_level; i++) {
+			if ((colors = hw_page_array[i].hp_colors) <= 1) {
+				colorequivszc[i] = 0;
+				continue;
+			}
+			while ((colors >> a) == 0)
+				a--;
+			ASSERT(a >= 0);
+
+			/* higher 4 bits encode color equiv mask */
+			colorequivszc[i] = (a << 4);
+		}
+	}
+
+	/* factor in colorequiv to check additional 'equivalent' bins. */
+	if (colorequiv > 1) {
+
+		int a = lowbit(colorequiv) - 1;
+		if (a > 15)
+			a = 15;
+
+		for (i = 0; i <= mmu.max_page_level; i++) {
+			if ((colors = hw_page_array[i].hp_colors) <= 1) {
+				continue;
+			}
+			while ((colors >> a) == 0)
+				a--;
+			if ((a << 4) > colorequivszc[i]) {
+				colorequivszc[i] = (a << 4);
+			}
+		}
+	}
+
 	/* size for mnoderanges */
-	mnoderangecnt = mnode_range_cnt();
+	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
+		mnoderangecnt += mnode_range_cnt(i);
 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
 
 	/* size for fpc_mutex and cpc_mutex */
@@ -1255,13 +1297,15 @@
 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
 {
-	kmutex_t	*pcm;
-	int		i;
-	page_t		*pp;
-	page_t		*first_pp;
-	uint64_t	pgaddr;
-	ulong_t		bin;
-	int		mtypestart;
+	kmutex_t		*pcm;
+	int			i;
+	page_t			*pp;
+	page_t			*first_pp;
+	uint64_t		pgaddr;
+	ulong_t			bin;
+	int			mtypestart;
+	int			plw_initialized;
+	page_list_walker_t	plw;
 
 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
 
@@ -1269,7 +1313,6 @@
 	ASSERT(szc == 0);
 	ASSERT(dma_attr != NULL);
 
-
 	MTYPE_START(mnode, mtype, flags);
 	if (mtype < 0) {
 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
@@ -1285,8 +1328,11 @@
 	 * because of BIN_STEP skip
 	 */
 	do {
-		i = 0;
-		while (i <= page_colors) {
+		plw_initialized = 0;
+
+		for (plw.plw_count = 0;
+		    plw.plw_count < page_colors; plw.plw_count++) {
+
 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
 				goto nextfreebin;
 
@@ -1348,16 +1394,24 @@
 			}
 			mutex_exit(pcm);
 nextfreebin:
-			pp = page_freelist_fill(szc, bin, mnode, mtype,
-			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
-			if (pp)
-				return (pp);
+			if (plw_initialized == 0) {
+				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
+				ASSERT(plw.plw_ceq_dif == page_colors);
+				plw_initialized = 1;
+			}
 
-			/* try next bin */
-			bin += (i == 0) ? BIN_STEP : 1;
-			bin &= page_colors_mask;
-			i++;
+			if (plw.plw_do_split) {
+				pp = page_freelist_split(szc, bin, mnode,
+				    mtype,
+				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
+				    &plw);
+				if (pp != NULL)
+					return (pp);
+			}
+
+			bin = page_list_walk_next_bin(szc, bin, &plw);
 		}
+
 		MTYPE_NEXT(mnode, mtype, flags);
 	} while (mtype >= 0);
 
@@ -1475,7 +1529,7 @@
 		lgrp = lgrp_home_lgrp();
 
 	/* LINTED */
-	AS_2_BIN(as, seg, vp, vaddr, bin);
+	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
 
 	/*
 	 * Only hold one freelist or cachelist lock at a time, that way we
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.h	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.h	Sat Oct 21 06:27:59 2006 -0700
@@ -400,7 +400,7 @@
 	uint_t		sfmmu_ismhat:1;	/* hat is dummy ism hatid */
 	uint_t		sfmmu_ctxflushed:1;	/* ctx has been flushed */
 	uchar_t		sfmmu_rmstat;	/* refmod stats refcnt */
-	uchar_t		sfmmu_clrstart;	/* start color bin for page coloring */
+	ushort_t	sfmmu_clrstart;	/* start color bin for page coloring */
 	ushort_t	sfmmu_clrbin;	/* per as phys page coloring bin */
 	ushort_t	sfmmu_flags;	/* flags */
 	struct tsb_info	*sfmmu_tsb;	/* list of per as tsbs */
--- a/usr/src/uts/sun4/vm/vm_dep.c	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/sun4/vm/vm_dep.c	Sat Oct 21 06:27:59 2006 -0700
@@ -47,6 +47,7 @@
 #include <sys/mem_config.h>
 #include <sys/mem_cage.h>
 #include <vm/vm_dep.h>
+#include <vm/page.h>
 #include <sys/platform_module.h>
 
 /*
@@ -76,6 +77,10 @@
 uint_t vac_colors = 0;
 uint_t vac_colors_mask = 0;
 
+/* cpu specific coloring initialization */
+extern void page_coloring_init_cpu();
+#pragma weak page_coloring_init_cpu
+
 /*
  * get the ecache setsize for the current cpu.
  */
@@ -864,9 +869,6 @@
 	return (szcvec);
 }
 
-#define	PNUM_SIZE(size_code)						\
-	(hw_page_array[size_code].hp_size >> hw_page_array[0].hp_shift)
-
 /*
  * Anchored in the table below are counters used to keep track
  * of free contiguous physical memory. Each element of the table contains
@@ -924,7 +926,7 @@
 	 */
 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
 		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
-		alloc_base += (sizeof (page_t *) * page_colors);
+		alloc_base += (sizeof (page_t *) * page_get_pagecolors(0));
 		/*
 		 * Allocate freelists bins for all
 		 * supported page sizes.
@@ -1009,7 +1011,7 @@
 	 * Calculate the size needed by alloc_page_freelists().
 	 */
 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
-		alloc_sz += sizeof (page_t *) * page_colors;
+		alloc_sz += sizeof (page_t *) * page_get_pagecolors(0);
 
 		for (szc = 0; szc < mmu_page_sizes; szc++)
 			alloc_sz += sizeof (page_t *) *
@@ -1044,7 +1046,7 @@
 
 	if (consistent_coloring == 2 || color_start_random) {
 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
-		    page_colors_mask));
+		    (hw_page_array[0].hp_colors - 1)));
 	}
 
 	do {
@@ -1066,10 +1068,13 @@
 void
 page_coloring_init()
 {
-	int	a;
+	int	a, i;
+	uint_t colors;
 
 	if (do_pg_coloring == 0) {
 		page_colors = 1;
+		for (i = 0; i < mmu_page_sizes; i++)
+			hw_page_array[i].hp_colors = 1;
 		return;
 	}
 
@@ -1082,6 +1087,22 @@
 	page_colors = ecache_setsize / MMU_PAGESIZE;
 	page_colors_mask = page_colors - 1;
 
+	vac_colors = vac_size / MMU_PAGESIZE;
+	vac_colors_mask = vac_colors - 1;
+
+	page_coloring_shift = 0;
+	a = ecache_setsize;
+	while (a >>= 1) {
+		page_coloring_shift++;
+	}
+
+	/* initialize number of colors per page size */
+	for (i = 0; i < mmu_page_sizes; i++) {
+		hw_page_array[i].hp_colors = (page_colors_mask >>
+		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
+		    + 1;
+	}
+
 	/*
 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
 	 * cpu_page_colors set to -1 during DR operation or during startup
@@ -1090,16 +1111,50 @@
 	 * The value of cpu_page_colors determines if additional color bins
 	 * need to be checked for a particular color in the page_get routines.
 	 */
-	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
+	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize)) {
+
 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
+		a = lowbit(page_colors) - lowbit(cpu_page_colors);
+		ASSERT(a > 0);
+		ASSERT(a < 16);
+
+		for (i = 0; i < mmu_page_sizes; i++) {
+			if ((colors = hw_page_array[i].hp_colors) <= 1) {
+				colorequivszc[i] = 0;
+				continue;
+			}
+			while ((colors >> a) == 0)
+				a--;
+			ASSERT(a >= 0);
+
+			/* higher 4 bits encode color equiv mask */
+			colorequivszc[i] = (a << 4);
+		}
+	}
 
-	vac_colors = vac_size / MMU_PAGESIZE;
-	vac_colors_mask = vac_colors -1;
+	/* factor in colorequiv to check additional 'equivalent' bins. */
+	if (colorequiv > 1 && &page_coloring_init_cpu == NULL) {
+
+		a = lowbit(colorequiv) - 1;
+
+		if (a > 15)
+			a = 15;
 
-	page_coloring_shift = 0;
-	a = ecache_setsize;
-	while (a >>= 1) {
-		page_coloring_shift++;
+		for (i = 0; i < mmu_page_sizes; i++) {
+			if ((colors = hw_page_array[i].hp_colors) <= 1) {
+				continue;
+			}
+			while ((colors >> a) == 0)
+				a--;
+			if ((a << 4) > colorequivszc[i]) {
+				colorequivszc[i] = (a << 4);
+			}
+		}
+	}
+
+	/* do cpu specific color initialization */
+	if (&page_coloring_init_cpu) {
+		page_coloring_init_cpu();
 	}
 }
 
--- a/usr/src/uts/sun4/vm/vm_dep.h	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/sun4/vm/vm_dep.h	Sat Oct 21 06:27:59 2006 -0700
@@ -55,7 +55,6 @@
 	mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
 
 /* mtype init for page_get_replacement_page */
-
 #define	MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt)			\
 	mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
 
@@ -65,6 +64,14 @@
 	pfnhi = mem_node_config[mnode].physmax;
 
 /*
+ * candidate counters in vm_pagelist.c are indexed by color and range
+ */
+#define	MAX_MNODE_MRANGES		MAX_MEM_TYPES
+#define	MNODE_RANGE_CNT(mnode)		MAX_MNODE_MRANGES
+#define	MNODE_MAX_MRANGE(mnode)		(MAX_MEM_TYPES - 1)
+#define	MTYPE_2_MRANGE(mnode, mtype)	(mtype)
+
+/*
  * Internal PG_ flags.
  */
 #define	PGI_RELOCONLY	0x10000	/* acts in the opposite sense to PG_NORELOC */
@@ -99,10 +106,83 @@
 extern kmutex_t	*fpc_mutex[NPC_MUTEX];
 extern kmutex_t	*cpc_mutex[NPC_MUTEX];
 
+/*
+ * cpu specific color conversion functions
+ */
+extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
+#pragma weak page_get_nsz_color_mask_cpu
+
+extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
+#pragma weak page_get_nsz_color_cpu
+
+extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
+#pragma weak page_get_color_shift_cpu
+
+extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
+    uchar_t, uint_t, uint_t, uint_t);
+#pragma weak page_next_pfn_for_color_cpu
+
+extern uint_t  page_pfn_2_color_cpu(pfn_t, uchar_t);
+#pragma weak page_pfn_2_color_cpu
+
+#define	PAGE_GET_COLOR_SHIFT(szc, nszc)				\
+	((&page_get_color_shift_cpu != NULL) ?			\
+	    page_get_color_shift_cpu(szc, nszc) :		\
+	    (hw_page_array[(nszc)].hp_shift -			\
+		hw_page_array[(szc)].hp_shift))
+
+#define	PFN_2_COLOR(pfn, szc)					\
+	((&page_pfn_2_color_cpu != NULL) ?			\
+	    page_pfn_2_color_cpu(pfn, szc) :			\
+	    ((pfn & (hw_page_array[0].hp_colors - 1)) >>	\
+		(hw_page_array[szc].hp_shift -			\
+		    hw_page_array[0].hp_shift)))
+
+#define	PNUM_SIZE(szc)							\
+	(hw_page_array[(szc)].hp_pgcnt)
+#define	PNUM_SHIFT(szc)							\
+	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
+#define	PAGE_GET_SHIFT(szc)						\
+	(hw_page_array[(szc)].hp_shift)
+#define	PAGE_GET_PAGECOLORS(szc)					\
+	(hw_page_array[(szc)].hp_colors)
+
+/*
+ * This macro calculates the next sequential pfn with the specified
+ * color, using the color equivalency mask.
+ */
+#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask)        \
+	ASSERT(((color) & ~(ceq_mask)) == 0);                                 \
+	if (&page_next_pfn_for_color_cpu == NULL) {                           \
+		uint_t	pfn_shift = PAGE_BSZS_SHIFT(szc);                     \
+		pfn_t	spfn = pfn >> pfn_shift;                              \
+		pfn_t	stride = (ceq_mask) + 1;                              \
+		ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);                 \
+		if (((spfn ^ (color)) & (ceq_mask)) == 0) {                   \
+			pfn += stride << pfn_shift;                           \
+		} else {                                                      \
+			pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color);          \
+			pfn = (pfn > spfn ? pfn : pfn + stride) << pfn_shift; \
+		}                                                             \
+	} else {                                                              \
+		pfn = page_next_pfn_for_color_cpu(pfn, szc, color,	      \
+		    ceq_mask, color_mask);                                    \
+	}
+
+/* get the color equivalency mask for the next szc */
+#define	PAGE_GET_NSZ_MASK(szc, mask)                                         \
+	((&page_get_nsz_color_mask_cpu == NULL) ?                            \
+	    ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) :  \
+	    page_get_nsz_color_mask_cpu(szc, mask))
+
+/* get the color of the next szc */
+#define	PAGE_GET_NSZ_COLOR(szc, color)                                       \
+	((&page_get_nsz_color_cpu == NULL) ?                                 \
+	    ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \
+	    page_get_nsz_color_cpu(szc, color))
+
 /* Find the bin for the given page if it was of size szc */
-#define	PP_2_BIN_SZC(pp, szc)                                           \
-	(((pp->p_pagenum) & page_colors_mask) >>                        \
-	(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
+#define	PP_2_BIN_SZC(pp, szc)	(PFN_2_COLOR(pp->p_pagenum, szc))
 
 #define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, pp->p_szc))
 
@@ -117,6 +197,30 @@
 
 #define	PFN_BASE(pfnum, szc)	(pfnum & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))
 
+/*
+ * This structure is used for walking free page lists; it controls
+ * when to split large pages into smaller pages and when to coalesce
+ * smaller pages into larger pages.
+ */
+typedef struct page_list_walker {
+	uint_t	plw_colors;		/* num of colors for szc */
+	uint_t  plw_color_mask;		/* colors-1 */
+	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
+	uint_t  plw_count;		/* loop count */
+	uint_t	plw_bin0;		/* starting bin */
+	uint_t  plw_bin_marker;		/* bin after initial jump */
+	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
+	uint_t  plw_do_split;		/* set if OK to split */
+	uint_t  plw_split_next;		/* next bin to split */
+	uint_t	plw_ceq_dif;		/* number of different color groups */
+					/* to check */
+	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
+	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
+} page_list_walker_t;
+
+void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
+    int can_split, int use_ceq, page_list_walker_t *plw);
+
 typedef	char	hpmctr_t;
 
 #ifdef DEBUG
@@ -147,7 +251,7 @@
 
 #define	PLCNT_SZ(ctrs_sz) {						\
 	int	szc;							\
-	for (szc = 0; szc <= mmu_page_sizes; szc++) {			\
+	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
 		int	colors = page_get_pagecolors(szc);		\
 		ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *		\
 		    colors * sizeof (pgcnt_t));				\
@@ -285,6 +389,7 @@
  * get the ecache setsize for the current cpu.
  */
 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
+#define	CPUASSOC()	(cpunodes[CPU->cpu_id].ecache_associativity)
 
 extern struct cpu	cpu0;
 #define	CPU0		&cpu0
@@ -337,7 +442,7 @@
  * 1 virtual=paddr
  * 2 bin hopping
  */
-#define	AS_2_BIN(as, seg, vp, addr, bin)				\
+#define	AS_2_BIN(as, seg, vp, addr, bin, szc)				\
 switch (consistent_coloring) {						\
 	default:                                                        \
 		cmn_err(CE_WARN,					\
@@ -346,41 +451,53 @@
 	case 0: {                                                       \
 		uint32_t ndx, new;					\
 		int slew = 0;						\
+		pfn_t pfn;                                              \
                                                                         \
 		if (vp != NULL && IS_SWAPVP(vp) &&			\
-			seg->s_ops == &segvn_ops)			\
+		    seg->s_ops == &segvn_ops)				\
 			slew = as_color_bin(as);			\
                                                                         \
-		bin = (((uintptr_t)addr >> MMU_PAGESHIFT) +		\
+		pfn = ((uintptr_t)addr >> MMU_PAGESHIFT) +		\
 			(((uintptr_t)addr >> page_coloring_shift) <<	\
-			(vac_shift - MMU_PAGESHIFT)) + slew) &		\
-			page_colors_mask;				\
-                                                                        \
+			(vac_shift - MMU_PAGESHIFT));			\
+		if ((szc) == 0 ||					\
+		    (szc == 1 && &page_pfn_2_color_cpu == NULL &&	\
+		    CPUASSOC() > PNUM_SIZE(1))) {			\
+			pfn += slew;					\
+			bin = PFN_2_COLOR(pfn, szc);			\
+		} else {						\
+			bin = PFN_2_COLOR(pfn, szc);			\
+			bin += slew >> (vac_shift - MMU_PAGESHIFT);	\
+			bin &= hw_page_array[(szc)].hp_colors - 1;	\
+		}							\
 		break;                                                  \
 	}                                                               \
 	case 1:                                                         \
-		bin = ((uintptr_t)addr >> MMU_PAGESHIFT) &		\
-			page_colors_mask;				\
+		bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT),   \
+					szc);	                        \
 		break;                                                  \
 	case 2: {                                                       \
 		int cnt = as_color_bin(as);				\
+		uint_t color_mask = page_get_pagecolors(0) - 1;		\
+                                                                        \
 		/* make sure physical color aligns with vac color */	\
 		while ((cnt & vac_colors_mask) !=			\
 		    addr_to_vcolor(addr)) {				\
 			cnt++;						\
 		}                                                       \
-		bin = cnt = cnt & page_colors_mask;			\
+		bin = cnt = cnt & color_mask;			        \
+		bin >>= PAGE_GET_COLOR_SHIFT(0, szc);                   \
 		/* update per as page coloring fields */		\
-		cnt = (cnt + 1) & page_colors_mask;			\
-		if (cnt == (as_color_start(as) & page_colors_mask)) {	\
+		cnt = (cnt + 1) & color_mask;			        \
+		if (cnt == (as_color_start(as) & color_mask)) {	        \
 			cnt = as_color_start(as) = as_color_start(as) + \
 				PGCLR_LOOPFACTOR;			\
 		}                                                       \
-		as_color_bin(as) = cnt & page_colors_mask;		\
+		as_color_bin(as) = cnt & color_mask;		        \
 		break;                                                  \
 	}								\
 }									\
-	ASSERT(bin <= page_colors_mask);
+	ASSERT(bin < page_get_pagecolors(szc));
 
 /*
  * cpu private vm data - accessed thru CPU->cpu_vm_data
@@ -488,9 +605,9 @@
 	ulong_t	plsub_cache;
 	ulong_t	plsubpages_szcbig;
 	ulong_t	plsubpages_szc0;
-	ulong_t	pff_req[MMU_PAGE_SIZES];	/* page_freelist_fill */
-	ulong_t	pff_demote[MMU_PAGE_SIZES];
-	ulong_t	pff_coalok[MMU_PAGE_SIZES];
+	ulong_t	pfs_req[MMU_PAGE_SIZES];	/* page_freelist_split */
+	ulong_t	pfs_demote[MMU_PAGE_SIZES];
+	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 	ulong_t ppr_reloc[MMU_PAGE_SIZES];	/* page_relocate */
 	ulong_t ppr_relocok[MMU_PAGE_SIZES];
 	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
@@ -498,10 +615,14 @@
 	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
 	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
 	ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
-	ulong_t	page_ctrs_coalesce;	/* page coalesce counter */
-	ulong_t	page_ctrs_cands_skip;	/* candidates useful */
-	ulong_t	page_ctrs_changed;	/* ctrs changed after locking */
-	ulong_t	page_ctrs_failed;	/* page_freelist_coalesce failed */
+	/* page coalesce counter */
+	ulong_t	page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	/* candidates useful */
+	ulong_t	page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	/* ctrs changed after locking */
+	ulong_t	page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
+	/* page_freelist_coalesce failed */
+	ulong_t	page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 	ulong_t	page_ctrs_coalesce_all;	/* page coalesce all counter */
 	ulong_t	page_ctrs_cands_skip_all; /* candidates useful for all func */
 };
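When none of the CPU-specific hooks above are present, the color conversions reduce to shifting by the difference of the two page shifts, as in PAGE_GET_NSZ_COLOR(). A worked example with assumed 8K/64K shifts (stand-ins, not the real hw_page_array):

#include <stdio.h>

/* assumed page shifts for illustration: 8K (szc 0) and 64K (szc 1) */
static const unsigned int hp_shift[] = { 13, 16 };

int
main(void)
{
	unsigned int color = 0x2d;	/* a szc 0 color */
	unsigned int nsz_color = color >> (hp_shift[1] - hp_shift[0]);

	/* eight 8K colors fold into each 64K color: 0x2d >> 3 == 0x5 */
	printf("szc 0 color %#x -> szc 1 color %#x\n", color, nsz_color);
	return (0);
}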
--- a/usr/src/uts/sun4u/vm/mach_vm_dep.c	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/sun4u/vm/mach_vm_dep.c	Sat Oct 21 06:27:59 2006 -0700
@@ -81,15 +81,17 @@
 extern int vac_shift;
 
 hw_pagesize_t hw_page_array[] = {
-	{MMU_PAGESIZE, MMU_PAGESHIFT, MMU_PAGESIZE >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE64K, MMU_PAGESHIFT64K, MMU_PAGESIZE64K >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE512K, MMU_PAGESHIFT512K,
+	{MMU_PAGESIZE, MMU_PAGESHIFT, 0, MMU_PAGESIZE >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE64K, MMU_PAGESHIFT64K, 0,
+	    MMU_PAGESIZE64K >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE512K, MMU_PAGESHIFT512K, 0,
 	    MMU_PAGESIZE512K >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE4M, MMU_PAGESHIFT4M, MMU_PAGESIZE4M >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE32M, MMU_PAGESHIFT32M, MMU_PAGESIZE32M >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE256M, MMU_PAGESHIFT256M,
+	{MMU_PAGESIZE4M, MMU_PAGESHIFT4M, 0, MMU_PAGESIZE4M >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE32M, MMU_PAGESHIFT32M, 0,
+	    MMU_PAGESIZE32M >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE256M, MMU_PAGESHIFT256M, 0,
 	    MMU_PAGESIZE256M >> MMU_PAGESHIFT},
-	{0, 0, 0}
+	{0, 0, 0, 0}
 };
 
 /*
--- a/usr/src/uts/sun4v/vm/mach_vm_dep.c	Fri Oct 20 17:13:50 2006 -0700
+++ b/usr/src/uts/sun4v/vm/mach_vm_dep.c	Sat Oct 21 06:27:59 2006 -0700
@@ -77,15 +77,17 @@
 extern int vac_shift;
 
 hw_pagesize_t hw_page_array[] = {
-	{MMU_PAGESIZE, MMU_PAGESHIFT, MMU_PAGESIZE >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE64K, MMU_PAGESHIFT64K, MMU_PAGESIZE64K >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE512K, MMU_PAGESHIFT512K,
+	{MMU_PAGESIZE, MMU_PAGESHIFT, 0, MMU_PAGESIZE >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE64K, MMU_PAGESHIFT64K, 0,
+	    MMU_PAGESIZE64K >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE512K, MMU_PAGESHIFT512K, 0,
 	    MMU_PAGESIZE512K >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE4M, MMU_PAGESHIFT4M, MMU_PAGESIZE4M >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE32M, MMU_PAGESHIFT32M, MMU_PAGESIZE32M >> MMU_PAGESHIFT},
-	{MMU_PAGESIZE256M, MMU_PAGESHIFT256M,
+	{MMU_PAGESIZE4M, MMU_PAGESHIFT4M, 0, MMU_PAGESIZE4M >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE32M, MMU_PAGESHIFT32M, 0,
+	    MMU_PAGESIZE32M >> MMU_PAGESHIFT},
+	{MMU_PAGESIZE256M, MMU_PAGESHIFT256M, 0,
 	    MMU_PAGESIZE256M >> MMU_PAGESHIFT},
-	{0, 0, 0}
+	{0, 0, 0, 0}
 };
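Both hw_page_array initializers in this changeset gain a zero in the third position; together with the hw_page_array[i].hp_colors stores earlier in the diff, that slot is the new per-size color count. A sketch of the implied layout (field order and types inferred from the initializers and uses in this diff, not quoted from the header):

#include <stddef.h>

typedef unsigned int uint_t;
typedef unsigned long pgcnt_t;

/* inferred layout of hw_pagesize_t after this change */
typedef struct hw_pagesize {
	size_t	hp_size;	/* page size in bytes */
	uint_t	hp_shift;	/* log2 of hp_size */
	uint_t	hp_colors;	/* colors at this size; filled in at boot */
	pgcnt_t	hp_pgcnt;	/* base pages per page of this size */
} hw_pagesize_t;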
 
 /*