Mercurial > illumos > illumos-gate
changeset 6695:12d7dd4459fd
6423097 segvn_pagelock() may perform very poorly
6526804 DR delete_memory_thread, AIO, and segvn deadlock
6557794 segspt_dismpagelock() and segspt_shmadvise(MADV_FREE) may deadlock
6557813 seg_ppurge_seg() shouldn't flush all unrelated ISM/DISM segments
6557891 softlocks/pagelocks of anon pages should not decrement availrmem for memory swapped pages
6559612 multiple softlocks on a DISM segment should decrement availrmem just once
6562291 page_mem_avail() is stuck due to availrmem overaccounting and lack of seg_preap() calls
6596555 locked anonymous pages should not have assigned disk swap slots
6639424 hat_sfmmu.c:hat_pagesync() doesn't handle well HAT_SYNC_STOPON_REF and HAT_SYNC_STOPON_MOD flags
6639425 optimize checkpage() optimizations
6662927 page_llock contention during I/O
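Most of the fixes above concern how softlocked and pagelocked anonymous pages are accounted. The changeset stops charging availrmem under freemem_lock for softlocks on swap-backed anon pages, drops the global segvn_pages_locked counter, and instead has segvn and seg_spt track outstanding locks with a per-segment counter updated atomically; the drop to zero is where unmap waiters get woken. The sketch below illustrates only that counting pattern. It is user-land C, and the names seg_counts, seg_softlock and seg_softunlock are illustrative, not taken from the changeset.

/*
 * Sketch of the accounting change: instead of updating a global counter
 * under a global lock on every softlock, each segment keeps its own
 * atomic count of outstanding softlocks, and the transition to zero is
 * the point where waiters may be woken.
 */
#include <stdatomic.h>
#include <stdio.h>

struct seg_counts {
        atomic_long softlockcnt;        /* outstanding softlocks on this segment */
};

static void
seg_softlock(struct seg_counts *s, long pages)
{
        /* one atomic add; no global lock, no availrmem adjustment */
        atomic_fetch_add(&s->softlockcnt, pages);
}

static void
seg_softunlock(struct seg_counts *s, long pages)
{
        if (atomic_fetch_sub(&s->softlockcnt, pages) - pages == 0) {
                /* last outstanding lock is gone; wake any unmap waiters */
                printf("softlockcnt dropped to 0\n");
        }
}

int
main(void)
{
        struct seg_counts s = { 0 };

        seg_softlock(&s, 8);
        seg_softunlock(&s, 8);
        return (0);
}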
--- a/usr/src/uts/common/fs/swapfs/swap_vnops.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/fs/swapfs/swap_vnops.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -167,7 +167,7 @@ int upgrade = 0; SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n", - vp, off, len, 0, 0); + vp, off, len, 0, 0); /* * Until there is a call-back mechanism to cause SEGKP @@ -247,8 +247,10 @@ mutex_enter(ahm); ap = swap_anon(vp, off); - if (ap == NULL) - panic("swap_getapage: null anon"); + if (ap == NULL) { + panic("swap_getapage:" + " null anon"); + } if (ap->an_pvp == pvp && ap->an_poff == poff) { @@ -298,7 +300,7 @@ pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw); } TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE, - "swapfs getapage:pp %p vp %p off %llx", pp, vp, off); + "swapfs getapage:pp %p vp %p off %llx", pp, vp, off); return (err); } @@ -340,7 +342,7 @@ ASSERT(nreloc != NULL); ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */ SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n", - vp, off, len, 0, 0); + vp, off, len, 0, 0); /* * If we are not using a preallocated page then we know one already @@ -384,7 +386,7 @@ pl[1] = NULL; if (page_pptonum(pp) & (page_get_pagecnt(conpp->p_szc) - 1)) - cmn_err(CE_PANIC, "swap_getconpage: no root"); + cmn_err(CE_PANIC, "swap_getconpage: no root"); } return (err); } @@ -415,9 +417,27 @@ "swap_getconpage: swap_getphysname failed!"); } - if (pvp) { - err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ, cr, - NULL); + if (pvp != NULL) { + err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ, + cr, NULL); + if (err == 0) { + struct anon *ap; + kmutex_t *ahm; + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + ap = swap_anon(vp, off); + if (ap == NULL) + panic("swap_getconpage: null anon"); + if (ap->an_pvp != pvp || ap->an_poff != poff) + panic("swap_getconpage: bad anon"); + + swap_phys_free(pvp, poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = NULL; + hat_setmod(pp); + mutex_exit(ahm); + } } else { pagezero(pp, 0, PAGESIZE); } @@ -435,7 +455,7 @@ ASSERT(pp->p_prev == pp); TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE, - "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off); + "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off); pl[0] = pp; pl[1] = NULL; @@ -552,7 +572,7 @@ pp = page_lookup(vp, io_off, SE_EXCL); else pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); + (flags & B_FREE) ? SE_EXCL : SE_SHARED); if (pp == NULL || pvn_getdirty(pp, flags) == 0) io_len = PAGESIZE; @@ -628,8 +648,8 @@ } SWAPFS_PRINT(SWAP_PUTP, - "swap_putapage: pp %p, vp %p, off %llx, flags %x\n", - pp, vp, pp->p_offset, flags, 0); + "swap_putapage: pp %p, vp %p, off %llx, flags %x\n", + pp, vp, pp->p_offset, flags, 0); ASSERT(PAGE_LOCKED(pp)); @@ -683,7 +703,7 @@ doff = off; dlen = PAGESIZE; if (err = swap_newphysname(vp, off, &doff, &dlen, - &pvp, &poff)) { + &pvp, &poff)) { swap_otherfail++; swap_otherpages += btop(klsz); hat_setmod(pp); @@ -715,7 +735,7 @@ } err = VOP_PAGEIO(klvp, pplist, klstart, klsz, - B_WRITE | flags, cr, NULL); + B_WRITE | flags, cr, NULL); if ((flags & B_ASYNC) == 0) pvn_write_done(pp, ((err) ? 
B_ERROR : 0) | B_WRITE | flags); @@ -727,8 +747,8 @@ } out: TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE, - "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx", - vp, klvp, klstart, klsz); + "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx", + vp, klvp, klstart, klsz); if (err && err != ENOMEM) cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err); if (lenp)
--- a/usr/src/uts/common/io/dump.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/io/dump.c	Thu May 22 22:23:49 2008 -0700
@@ -116,13 +116,12 @@
 	 * of these counters.
 	 */
 	dumpsize_in_pages = (physinstalled - obp_pages -
-			availrmem -
-			anon_segkp_pages_locked -
-			k_anoninfo.ani_mem_resv -
-			segvn_pages_locked -
-			pages_locked -
-			pages_claimed -
-			pages_useclaim);
+	    availrmem -
+	    anon_segkp_pages_locked -
+	    k_anoninfo.ani_mem_resv -
+	    pages_locked -
+	    pages_claimed -
+	    pages_useclaim);
 
 	/*
 	 * Protect against vm vagaries.
--- a/usr/src/uts/common/os/kstat_fr.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/os/kstat_fr.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -494,7 +494,7 @@ */ kstat_chain_id = 0; ksp = kstat_create("unix", 0, "kstat_headers", "kstat", KSTAT_TYPE_RAW, - 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE); + 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE); if (ksp) { ksp->ks_lock = &kstat_chain_lock; ksp->ks_update = header_kstat_update; @@ -505,35 +505,35 @@ } ksp = kstat_create("unix", 0, "kstat_types", "kstat", - KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0); + KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0); if (ksp) { int i; kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); for (i = 0; i < KSTAT_NUM_TYPES; i++) { kstat_named_init(&kn[i], kstat_data_type[i].name, - KSTAT_DATA_ULONG); + KSTAT_DATA_ULONG); kn[i].value.ul = i; } kstat_install(ksp); } ksp = kstat_create("unix", 0, "sysinfo", "misc", KSTAT_TYPE_RAW, - sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL); + sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &sysinfo; kstat_install(ksp); } ksp = kstat_create("unix", 0, "vminfo", "vm", KSTAT_TYPE_RAW, - sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL); + sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &vminfo; kstat_install(ksp); } ksp = kstat_create("unix", 0, "segmap", "vm", KSTAT_TYPE_NAMED, - segmapcnt_ndata, KSTAT_FLAG_VIRTUAL); + segmapcnt_ndata, KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) segmapcnt_ptr; ksp->ks_update = segmap_kstat_update; @@ -541,7 +541,7 @@ } ksp = kstat_create("unix", 0, "biostats", "misc", KSTAT_TYPE_NAMED, - biostats_ndata, KSTAT_FLAG_VIRTUAL); + biostats_ndata, KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) biostats_ptr; kstat_install(ksp); @@ -549,7 +549,7 @@ #ifdef VAC ksp = kstat_create("unix", 0, "flushmeter", "hat", KSTAT_TYPE_RAW, - sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL); + sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &flush_cnt; kstat_install(ksp); @@ -557,15 +557,15 @@ #endif /* VAC */ ksp = kstat_create("unix", 0, "var", "misc", KSTAT_TYPE_RAW, - sizeof (struct var), KSTAT_FLAG_VIRTUAL); + sizeof (struct var), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &v; kstat_install(ksp); } ksp = kstat_create("unix", 0, "system_misc", "misc", KSTAT_TYPE_NAMED, - sizeof (system_misc_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); + sizeof (system_misc_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &system_misc_kstat; ksp->ks_update = system_misc_kstat_update; @@ -573,8 +573,8 @@ } ksp = kstat_create("unix", 0, "system_pages", "pages", KSTAT_TYPE_NAMED, - sizeof (system_pages_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); + sizeof (system_pages_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &system_pages_kstat; ksp->ks_update = system_pages_kstat_update; @@ -911,9 +911,9 @@ * user explicit page locking. 
*/ system_pages_kstat.pp_kernel.value.ul = (ulong_t)(physinstalled - - obp_pages - availrmem - k_anoninfo.ani_mem_resv - - anon_segkp_pages_locked - segvn_pages_locked - - pages_locked - pages_claimed - pages_useclaim); + obp_pages - availrmem - k_anoninfo.ani_mem_resv - + anon_segkp_pages_locked - pages_locked - + pages_claimed - pages_useclaim); return (0); } @@ -923,7 +923,7 @@ const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags) { return (kstat_create_zone(ks_module, ks_instance, ks_name, ks_class, - ks_type, ks_ndata, ks_flags, ALL_ZONES)); + ks_type, ks_ndata, ks_flags, ALL_ZONES)); } /* @@ -966,8 +966,8 @@ */ if (ks_type >= KSTAT_NUM_TYPES) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "invalid kstat type %d", - ks_module, ks_instance, ks_name, ks_type); + "invalid kstat type %d", + ks_module, ks_instance, ks_name, ks_type); return (NULL); } @@ -978,8 +978,8 @@ if ((ks_flags & KSTAT_FLAG_PERSISTENT) && (ks_flags & KSTAT_FLAG_VIRTUAL)) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "cannot create persistent virtual kstat", - ks_module, ks_instance, ks_name); + "cannot create persistent virtual kstat", + ks_module, ks_instance, ks_name); return (NULL); } @@ -990,8 +990,8 @@ if ((ks_flags & KSTAT_FLAG_VAR_SIZE) && !(ks_flags & KSTAT_FLAG_VIRTUAL)) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "cannot create variable-size physical kstat", - ks_module, ks_instance, ks_name); + "cannot create variable-size physical kstat", + ks_module, ks_instance, ks_name); return (NULL); } @@ -1001,10 +1001,10 @@ if (ks_ndata < kstat_data_type[ks_type].min_ndata || ks_ndata > kstat_data_type[ks_type].max_ndata) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "ks_ndata=%d out of range [%d, %d]", - ks_module, ks_instance, ks_name, (int)ks_ndata, - kstat_data_type[ks_type].min_ndata, - kstat_data_type[ks_type].max_ndata); + "ks_ndata=%d out of range [%d, %d]", + ks_module, ks_instance, ks_name, (int)ks_ndata, + kstat_data_type[ks_type].min_ndata, + kstat_data_type[ks_type].max_ndata); return (NULL); } @@ -1036,8 +1036,8 @@ */ kstat_rele(ksp); cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "invalid reactivation of dormant kstat", - ks_module, ks_instance, ks_name); + "invalid reactivation of dormant kstat", + ks_module, ks_instance, ks_name); return (NULL); } /* @@ -1056,8 +1056,8 @@ e = kstat_alloc(ks_flags & KSTAT_FLAG_VIRTUAL ? 0 : ks_data_size); if (e == NULL) { cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): " - "insufficient kernel memory", - ks_module, ks_instance, ks_name); + "insufficient kernel memory", + ks_module, ks_instance, ks_name); return (NULL); }
--- a/usr/src/uts/common/os/mem_cage.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/mem_cage.c	Thu May 22 22:23:49 2008 -0700
@@ -1271,6 +1271,11 @@
 				}
 			}
 		}
+
+		if (NOMEMWAIT() && freemem < minfree) {
+			return (KCT_CRIT);
+		}
+
 	}
 	return (KCT_NONCRIT);
 }
--- a/usr/src/uts/common/os/schedctl.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/schedctl.c	Thu May 22 22:23:49 2008 -0700
@@ -676,6 +676,7 @@
 	 * we have to free everything rather than letting as_free
 	 * do the work.
 	 */
+	anonmap_purge(amp);
 	anon_free(amp->ahp, 0, PAGESIZE);
 	ANON_LOCK_EXIT(&amp->a_rwlock);
 	anonmap_free(amp);
--- a/usr/src/uts/common/os/shm.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/shm.c	Thu May 22 22:23:49 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -718,9 +718,8 @@
 
 	if (error = shmem_lock(sp, sp->shm_amp)) {
 		ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
-		cmn_err(CE_NOTE,
-		    "shmctl - couldn't lock %ld pages into "
-		    "memory", sp->shm_amp->size);
+		cmn_err(CE_NOTE, "shmctl - couldn't lock %ld"
+		    " pages into memory", sp->shm_amp->size);
 		ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
 		error = ENOMEM;
 		sp->shm_lkcnt--;
@@ -1253,13 +1252,14 @@
 	 * Free up the anon_map.
 	 */
 	lgrp_shm_policy_fini(amp, NULL);
+	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+	anonmap_purge(amp);
 	if (amp->a_szc != 0) {
-		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 		anon_shmap_free_pages(amp, 0, amp->size);
-		ANON_LOCK_EXIT(&amp->a_rwlock);
 	} else {
 		anon_free(amp->ahp, 0, amp->size);
 	}
+	ANON_LOCK_EXIT(&amp->a_rwlock);
 	anon_unresv_zone(amp->swresv, zone);
 	anonmap_free(amp);
 }
--- a/usr/src/uts/common/os/vm_pageout.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/vm_pageout.c	Thu May 22 22:23:49 2008 -0700
@@ -531,7 +531,7 @@
 	if (freemem < lotsfree + needfree + kmem_reapahead)
 		kmem_reap();
 
-	if (freemem < lotsfree + needfree + seg_preapahead)
+	if (freemem < lotsfree + needfree)
 		seg_preap();
 
 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
@@ -957,9 +957,10 @@
 	 *
 	 * NOTE: These optimizations assume that reads are atomic.
 	 */
-top:
-	if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) ||
-	    hat_page_checkshare(pp, po_share) || PAGE_LOCKED(pp)) {
+
+	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
+	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
+	    hat_page_checkshare(pp, po_share)) {
 		return (-1);
 	}
--- a/usr/src/uts/common/vm/anon.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/anon.h Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -139,7 +139,6 @@ * Declaration for the Global counters to accurately * track the kernel foot print in memory. */ -extern pgcnt_t segvn_pages_locked; extern pgcnt_t pages_locked; extern pgcnt_t pages_claimed; extern pgcnt_t pages_useclaim; @@ -278,7 +277,7 @@ * 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE * the amp->szc could be anything in [0, page_num_pagesizes() - 1]. */ -struct anon_map { +typedef struct anon_map { krwlock_t a_rwlock; /* protect anon_map and anon array */ size_t size; /* size in bytes mapped by the anon array */ struct anon_hdr *ahp; /* anon array header pointer, containing */ @@ -288,7 +287,13 @@ ushort_t a_szc; /* max szc among shared processes */ void *locality; /* lgroup locality info */ struct kshmid *a_sp; /* kshmid if amp backs sysV, or NULL */ -}; + int a_purgewait; /* somebody waits for slocks to go away */ + kcondvar_t a_purgecv; /* cv for waiting for slocks to go away */ + kmutex_t a_purgemtx; /* mutex for anonmap_purge() */ + spgcnt_t a_softlockcnt; /* number of pages locked in pcache */ + kmutex_t a_pmtx; /* protects amp's pcache list */ + pcache_link_t a_phead; /* head of amp's pcache list */ +} amp_t; #ifdef _KERNEL @@ -303,6 +308,9 @@ #define ANON_LOCK_ENTER(lock, type) rw_enter((lock), (type)) #define ANON_LOCK_EXIT(lock) rw_exit((lock)) +#define ANON_LOCK_HELD(lock) RW_LOCK_HELD((lock)) +#define ANON_READ_HELD(lock) RW_READ_HELD((lock)) +#define ANON_WRITE_HELD(lock) RW_WRITE_HELD((lock)) #define ANON_ARRAY_HASH(amp, idx)\ ((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\ @@ -334,9 +342,9 @@ /* * Swap slots currently available for reservation */ -#define CURRENT_TOTAL_AVAILABLE_SWAP \ +#define CURRENT_TOTAL_AVAILABLE_SWAP \ ((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + \ - MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) struct k_anoninfo { pgcnt_t ani_max; /* total reservable slots on phys */ @@ -392,6 +400,8 @@ extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t, int); extern void anonmap_free(struct anon_map *); +extern void anonmap_purge(struct anon_map *); +extern void anon_swap_free(struct anon *, struct page *); extern void anon_decref(struct anon *); extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *); extern pgcnt_t anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
--- a/usr/src/uts/common/vm/as.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/as.h Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -131,23 +131,26 @@ #define AS_CLAIMGAP 0x40 #define AS_UNMAPWAIT 0x20 #define AS_NEEDSPURGE 0x10 /* mostly for seg_nf, see as_purge() */ +#define AS_NOUNMAPWAIT 0x02 #define AS_BUSY 0x01 /* needed by XHAT framework */ #define AS_ISPGLCK(as) ((as)->a_flags & AS_PAGLCK) #define AS_ISCLAIMGAP(as) ((as)->a_flags & AS_CLAIMGAP) #define AS_ISUNMAPWAIT(as) ((as)->a_flags & AS_UNMAPWAIT) #define AS_ISBUSY(as) ((as)->a_flags & AS_BUSY) - +#define AS_ISNOUNMAPWAIT(as) ((as)->a_flags & AS_NOUNMAPWAIT) #define AS_SETPGLCK(as) ((as)->a_flags |= AS_PAGLCK) #define AS_SETCLAIMGAP(as) ((as)->a_flags |= AS_CLAIMGAP) #define AS_SETUNMAPWAIT(as) ((as)->a_flags |= AS_UNMAPWAIT) #define AS_SETBUSY(as) ((as)->a_flags |= AS_BUSY) +#define AS_SETNOUNMAPWAIT(as) ((as)->a_flags |= AS_NOUNMAPWAIT) #define AS_CLRPGLCK(as) ((as)->a_flags &= ~AS_PAGLCK) #define AS_CLRCLAIMGAP(as) ((as)->a_flags &= ~AS_CLAIMGAP) #define AS_CLRUNMAPWAIT(as) ((as)->a_flags &= ~AS_UNMAPWAIT) #define AS_CLRBUSY(as) ((as)->a_flags &= ~AS_BUSY) +#define AS_CLRNOUNMAPWAIT(as) ((as)->a_flags &= ~AS_NOUNMAPWAIT) #define AS_TYPE_64BIT(as) \ (((as)->a_userlimit > (caddr_t)UINT32_MAX) ? 1 : 0) @@ -281,8 +284,6 @@ size_t size, enum seg_rw rw); void as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, enum seg_rw rw); -void as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, - size_t size, enum seg_rw rw); int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, boolean_t wait); int as_set_default_lpsize(struct as *as, caddr_t addr, size_t size);
--- a/usr/src/uts/common/vm/seg.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg.h Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,6 +55,8 @@ * VM - Segments. */ +struct anon_map; + /* * kstat statistics for segment advise */ @@ -93,7 +95,12 @@ * write locked. */ -struct seg { +typedef struct pcache_link { + struct pcache_link *p_lnext; + struct pcache_link *p_lprev; +} pcache_link_t; + +typedef struct seg { caddr_t s_base; /* base virtual address */ size_t s_size; /* size in bytes */ uint_t s_szc; /* max page size code */ @@ -102,7 +109,9 @@ avl_node_t s_tree; /* AVL tree links to segs in this as */ struct seg_ops *s_ops; /* ops vector: see below */ void *s_data; /* private data for instance */ -}; + kmutex_t s_pmtx; /* protects seg's pcache list */ + pcache_link_t s_phead; /* head of seg's pcache list */ +} seg_t; #define S_PURGE (0x01) /* seg should be purged in as_gap() */ @@ -136,6 +145,7 @@ }; #ifdef _KERNEL + /* * Generic segment operations */ @@ -149,28 +159,41 @@ /* * functions for pagelock cache support */ -extern void seg_ppurge(struct seg *seg); -extern void seg_ppurge_seg(int (*callback)()); -extern void seg_pinactive(struct seg *seg, caddr_t addr, size_t len, - struct page **pp, enum seg_rw rw, int (*callback)()); -extern int seg_pinsert_check(struct seg *seg, size_t len, uint_t flags); -extern int seg_pinsert(struct seg *seg, caddr_t addr, size_t len, - struct page **pp, enum seg_rw rw, uint_t flags, - int (*callback)()); -extern struct page **seg_plookup(struct seg *seg, caddr_t addr, - size_t len, enum seg_rw rw); +typedef int (*seg_preclaim_cbfunc_t)(void *, caddr_t, size_t, + struct page **, enum seg_rw, int); + +extern struct page **seg_plookup(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, enum seg_rw rw, uint_t flags); +extern void seg_pinactive(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, struct page **pp, enum seg_rw rw, + uint_t flags, seg_preclaim_cbfunc_t callback); + +extern void seg_ppurge(struct seg *seg, struct anon_map *amp, + uint_t flags); +extern void seg_ppurge_wiredpp(struct page **pp); + +extern int seg_pinsert_check(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, uint_t flags); +extern int seg_pinsert(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, size_t wlen, struct page **pp, enum seg_rw rw, + uint_t flags, seg_preclaim_cbfunc_t callback); + extern void seg_pasync_thread(void); extern void seg_preap(void); extern int seg_p_disable(void); extern void seg_p_enable(void); -extern int seg_preapahead; -extern segadvstat_t segadvstat; +extern segadvstat_t segadvstat; + /* - * Flags for pagelock cache support + * Flags for pagelock cache support. + * Flags argument is passed as uint_t to pcache routines. upper 16 bits of + * the flags argument are reserved for alignment page shift when SEGP_PSHIFT + * is set. */ -#define SEGP_ASYNC_FLUSH 0x1 /* flushed by async thread */ -#define SEGP_FORCE_WIRED 0x2 /* skip check against seg_pwindow */ +#define SEGP_FORCE_WIRED 0x1 /* skip check against seg_pwindow */ +#define SEGP_AMP 0x2 /* anon map's pcache entry */ +#define SEGP_PSHIFT 0x4 /* addr pgsz shift for hash function */ /* * Return values for seg_pinsert and seg_pinsert_check functions.
--- a/usr/src/uts/common/vm/seg_enum.h	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/vm/seg_enum.h	Thu May 22 22:23:49 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -62,8 +61,7 @@
 */
 enum lock_type {
	L_PAGELOCK,		/* lock pages */
-	L_PAGEUNLOCK,		/* unlock pages */
-	L_PAGERECLAIM		/* reclaim pages */
+	L_PAGEUNLOCK		/* unlock pages */
 };
 
 /*
--- a/usr/src/uts/common/vm/seg_kmem.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/vm/seg_kmem.c	Thu May 22 22:23:49 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -674,9 +674,6 @@
 	    BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 		return (SEGOP_PAGELOCK(segkp, addr, len, ppp, type, rw));
 
-	if (type == L_PAGERECLAIM)
-		return (ENOTSUP);
-
 	npages = btopr(len);
 	nb = sizeof (page_t *) * npages;
--- a/usr/src/uts/common/vm/seg_spt.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg_spt.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -174,8 +174,8 @@ }; static void segspt_purge(struct seg *seg); -static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **, - enum seg_rw); +static int segspt_reclaim(void *, caddr_t, size_t, struct page **, + enum seg_rw, int); static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, page_t **ppa); @@ -833,6 +833,7 @@ uint_t szc; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); /* * We want to lock/unlock the entire ISM segment. Therefore, @@ -857,8 +858,8 @@ if (type == L_PAGEUNLOCK) { ASSERT(sptd->spt_ppa != NULL); - seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); /* * If someone is blocked while unmapping, we purge @@ -868,17 +869,16 @@ * raw async i/o is still in progress or where a thread * exits on data fault in a multithreaded application. */ - if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { + if ((sptd->spt_flags & DISM_PPA_CHANGED) || + (AS_ISUNMAPWAIT(seg->s_as) && + shmd->shm_softlockcnt > 0)) { segspt_purge(seg); } return (0); - } else if (type == L_PAGERECLAIM) { - ASSERT(sptd->spt_ppa != NULL); - (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot); - return (0); } + /* The L_PAGELOCK case ... */ + if (sptd->spt_flags & DISM_PPA_CHANGED) { segspt_purge(seg); /* @@ -893,17 +893,17 @@ * First try to find pages in segment page cache, without * holding the segment lock. */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa != NULL); ASSERT(sptd->spt_ppa == pplist); ppa = sptd->spt_ppa; for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { - seg_pinactive(seg, seg->s_base, + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, ppa, - sptd->spt_prot, segspt_reclaim); + S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } @@ -923,13 +923,12 @@ return (0); } - /* The L_PAGELOCK case... 
*/ mutex_enter(&sptd->spt_lock); /* * try to find pages in segment page cache with mutex */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa != NULL); ASSERT(sptd->spt_ppa == pplist); @@ -937,9 +936,9 @@ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { mutex_exit(&sptd->spt_lock); - seg_pinactive(seg, seg->s_base, + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, ppa, - sptd->spt_prot, segspt_reclaim); + S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } @@ -959,8 +958,8 @@ *ppp = &(sptd->spt_ppa[pg_idx]); return (0); } - if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == - SEGP_FAIL) { + if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, + SEGP_FORCE_WIRED) == SEGP_FAIL) { mutex_exit(&sptd->spt_lock); *ppp = NULL; return (ENOTSUP); @@ -1038,16 +1037,18 @@ } ANON_LOCK_EXIT(&->a_rwlock); - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + claim_availrmem) { + if (claim_availrmem) { + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + claim_availrmem) { + mutex_exit(&freemem_lock); + ret = ENOTSUP; + claim_availrmem = 0; + goto insert_fail; + } else { + availrmem -= claim_availrmem; + } mutex_exit(&freemem_lock); - ret = FC_MAKE_ERR(ENOMEM); - claim_availrmem = 0; - goto insert_fail; - } else { - availrmem -= claim_availrmem; } - mutex_exit(&freemem_lock); sptd->spt_ppa = pl; } else { @@ -1059,8 +1060,8 @@ ASSERT(pl != NULL); - ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, - pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH, + ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); if (ret == SEGP_FAIL) { /* @@ -1089,8 +1090,9 @@ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { mutex_exit(&sptd->spt_lock); - seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, - pl, sptd->spt_prot, segspt_reclaim); + seg_pinactive(seg, NULL, seg->s_base, + sptd->spt_amp->size, + pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } @@ -1113,7 +1115,7 @@ * to the requested addr, i.e. pg_idx. */ *ppp = &(sptd->spt_ppa[pg_idx]); - return (ret); + return (0); insert_fail: /* @@ -1125,9 +1127,11 @@ mutex_exit(&sptd->spt_lock); if (pl_built) { - mutex_enter(&freemem_lock); - availrmem += claim_availrmem; - mutex_exit(&freemem_lock); + if (claim_availrmem) { + mutex_enter(&freemem_lock); + availrmem += claim_availrmem; + mutex_exit(&freemem_lock); + } /* * We created pl and we need to destroy it. @@ -1184,6 +1188,8 @@ u_offset_t off; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); + /* * We want to lock/unlock the entire ISM segment. 
Therefore, @@ -1213,8 +1219,8 @@ ASSERT(sptd->spt_ppa != NULL); - seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); /* * If someone is blocked while unmapping, we purge @@ -1228,20 +1234,16 @@ segspt_purge(seg); } return (0); - } else if (type == L_PAGERECLAIM) { - ASSERT(sptd->spt_ppa != NULL); + } - (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot); - return (0); - } + /* The L_PAGELOCK case... */ /* * First try to find pages in segment page cache, without * holding the segment lock. */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa == pplist); ASSERT(sptd->spt_ppa[page_index]); @@ -1254,14 +1256,13 @@ return (0); } - /* The L_PAGELOCK case... */ mutex_enter(&sptd->spt_lock); /* * try to find pages in segment page cache */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa == pplist); /* @@ -1274,8 +1275,8 @@ return (0); } - if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == - SEGP_FAIL) { + if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, + SEGP_FORCE_WIRED) == SEGP_FAIL) { mutex_exit(&sptd->spt_lock); *ppp = NULL; return (ENOTSUP); @@ -1338,8 +1339,9 @@ ASSERT(pl != NULL); - ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, - pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim); + ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, + segspt_reclaim); if (ret == SEGP_FAIL) { /* * seg_pinsert failed. We return @@ -1375,7 +1377,7 @@ * to the requested addr, i.e. page_index. */ *ppp = &(sptd->spt_ppa[page_index]); - return (ret); + return (0); insert_fail: /* @@ -1419,13 +1421,14 @@ static void segspt_purge(struct seg *seg) { - seg_ppurge(seg); + seg_ppurge(seg, NULL, SEGP_FORCE_WIRED); } static int -segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, - enum seg_rw rw) +segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw, int async) { + struct seg *seg = (struct seg *)ptag; struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg; struct spt_data *sptd; @@ -1442,6 +1445,8 @@ ASSERT(sptd->spt_pcachecnt != 0); ASSERT(sptd->spt_ppa == pplist); ASSERT(npages == btopr(sptd->spt_amp->size)); + ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + /* * Acquire the lock on the dummy seg and destroy the * ppa array IF this is the last pcachecnt. @@ -1462,7 +1467,7 @@ free_availrmem++; page_unlock(pplist[i]); } - if (sptd->spt_flags & SHM_PAGEABLE) { + if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) { mutex_enter(&freemem_lock); availrmem += free_availrmem; mutex_exit(&freemem_lock); @@ -1482,14 +1487,41 @@ done = 1; } mutex_exit(&sptd->spt_lock); + + /* + * If we are pcache async thread or called via seg_ppurge_wiredpp() we + * may not hold AS lock (in this case async argument is not 0). This + * means if softlockcnt drops to 0 after the decrement below address + * space may get freed. 
We can't allow it since after softlock + * derement to 0 we still need to access as structure for possible + * wakeup of unmap waiters. To prevent the disappearance of as we take + * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes + * this mutex as a barrier to make sure this routine completes before + * segment is freed. + * + * The second complication we have to deal with in async case is a + * possibility of missed wake up of unmap wait thread. When we don't + * hold as lock here we may take a_contents lock before unmap wait + * thread that was first to see softlockcnt was still not 0. As a + * result we'll fail to wake up an unmap wait thread. To avoid this + * race we set nounmapwait flag in as structure if we drop softlockcnt + * to 0 if async is not 0. unmapwait thread + * will not block if this flag is set. + */ + if (async) + mutex_enter(&shmd->shm_segfree_syncmtx); + /* * Now decrement softlockcnt. */ + ASSERT(shmd->shm_softlockcnt > 0); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1); if (shmd->shm_softlockcnt <= 0) { - if (AS_ISUNMAPWAIT(seg->s_as)) { + if (async || AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); + if (async) + AS_SETNOUNMAPWAIT(seg->s_as); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); @@ -1497,6 +1529,10 @@ mutex_exit(&seg->s_as->a_contents); } } + + if (async) + mutex_exit(&shmd->shm_segfree_syncmtx); + return (done); } @@ -1604,6 +1640,7 @@ softlock_decrement: npages = btopr(len); + ASSERT(shmd->shm_softlockcnt >= npages); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); if (shmd->shm_softlockcnt == 0) { /* @@ -1646,6 +1683,8 @@ (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, NULL, 0, seg->s_size); + mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); + seg->s_data = (void *)shmd; seg->s_ops = &segspt_shmops; seg->s_szc = shmd->shm_sptseg->s_szc; @@ -1741,6 +1780,15 @@ kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); shmd->shm_vpage = NULL; } + + /* + * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's + * still working with this segment without holding as lock. 
+ */ + ASSERT(shmd->shm_softlockcnt == 0); + mutex_enter(&shmd->shm_segfree_syncmtx); + mutex_destroy(&shmd->shm_segfree_syncmtx); + kmem_free(shmd, sizeof (*shmd)); } @@ -1834,14 +1882,6 @@ case F_SOFTLOCK: - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + npages) { - mutex_exit(&freemem_lock); - return (FC_MAKE_ERR(ENOMEM)); - } else { - availrmem -= npages; - } - mutex_exit(&freemem_lock); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); /* * Fall through to the F_INVAL case to load up the hat layer @@ -1858,9 +1898,6 @@ err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); if (err != 0) { if (type == F_SOFTLOCK) { - mutex_enter(&freemem_lock); - availrmem += npages; - mutex_exit(&freemem_lock); atomic_add_long((ulong_t *)( &(shmd->shm_softlockcnt)), -npages); } @@ -1934,10 +1971,6 @@ case F_SOFTUNLOCK: - mutex_enter(&freemem_lock); - availrmem += npages; - mutex_exit(&freemem_lock); - /* * This is a bit ugly, we pass in the real seg pointer, * but the segspt_addr is the virtual address within the @@ -2616,6 +2649,7 @@ int kernel; anon_sync_obj_t cookie; rctl_qty_t unlocked = 0; + page_t **ppa; amp = sptd->spt_amp; mutex_enter(&sptd->spt_lock); @@ -2661,12 +2695,15 @@ } } ANON_LOCK_EXIT(&->a_rwlock); - if (sptd->spt_ppa != NULL) + if ((ppa = sptd->spt_ppa) != NULL) sptd->spt_flags |= DISM_PPA_CHANGED; mutex_exit(&sptd->spt_lock); rctl_decr_locked_mem(NULL, proj, unlocked, 0); mutex_exit(&sp->shm_mlock); + + if (ppa != NULL) + seg_ppurge_wiredpp(ppa); } return (sts); } @@ -2748,6 +2785,7 @@ ushort_t gen; clock_t end_lbolt; int writer; + page_t **ppa; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); @@ -2759,7 +2797,7 @@ pg_idx = seg_page(seg, addr); mutex_enter(&sptd->spt_lock); - if (sptd->spt_ppa == NULL) { + if ((ppa = sptd->spt_ppa) == NULL) { mutex_exit(&sptd->spt_lock); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); anon_disclaim(amp, pg_idx, len); @@ -2775,7 +2813,7 @@ /* * Purge all DISM cached pages */ - seg_ppurge_seg(segspt_reclaim); + seg_ppurge_wiredpp(ppa); /* * Drop the AS_LOCK so that other threads can grab it
--- a/usr/src/uts/common/vm/seg_spt.h	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/vm/seg_spt.h	Thu May 22 22:23:49 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -74,7 +74,7 @@
 typedef struct shm_data {
 	struct as	*shm_sptas;
 	struct anon_map	*shm_amp;
-	size_t		shm_softlockcnt; /* # outstanding lock operations */
+	spgcnt_t	shm_softlockcnt; /* # outstanding lock operations */
 	struct seg	*shm_sptseg;	/* pointer to spt segment */
 	char		*shm_vpage;	/* indicating locked pages */
 	spgcnt_t	shm_lckpgs;	/* # of locked pages per attached seg */
@@ -82,6 +82,7 @@
 	 * Memory allocation policy after shmat()
 	 */
 	lgrp_mem_policy_info_t	shm_policy_info;
+	kmutex_t	shm_segfree_syncmtx; /* barrier lock for segspt_shmfree() */
 } shm_data_t;
 
 #define	DISM_PG_LOCKED		0x1	/* DISM page is locked */
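The shm_segfree_syncmtx added above (and the analogous segfree_syncmtx in segvn_data later in this changeset) is used purely as a barrier: the asynchronous pcache reclaim path holds it across its final accesses to the segment data, and the segment-free path, which only runs once softlockcnt has dropped to zero, acquires and releases it before freeing, so a reclaim still running without the address-space lock cannot race with the free. A minimal user-land sketch of that barrier idiom follows; it assumes POSIX threads, and the names obj, reclaim_async and obj_free are illustrative rather than taken from the kernel sources.

/*
 * Barrier-mutex idiom (sketch): the async worker holds the mutex across
 * its last accesses to the object; the destructor takes and releases the
 * same mutex, so any worker already past the zero-count check must finish
 * before the object is freed.
 */
#include <pthread.h>
#include <stdlib.h>

struct obj {
        pthread_mutex_t segfree_syncmtx;        /* barrier for async reclaim */
        long            softlockcnt;            /* outstanding work items */
};

void
reclaim_async(struct obj *o)
{
        pthread_mutex_lock(&o->segfree_syncmtx);
        o->softlockcnt--;                       /* final access to the object */
        pthread_mutex_unlock(&o->segfree_syncmtx);
}

void
obj_free(struct obj *o)
{
        /* wait out any reclaim that is still inside the barrier */
        pthread_mutex_lock(&o->segfree_syncmtx);
        pthread_mutex_unlock(&o->segfree_syncmtx);
        pthread_mutex_destroy(&o->segfree_syncmtx);
        free(o);
}

int
main(void)
{
        struct obj *o = malloc(sizeof (*o));

        pthread_mutex_init(&o->segfree_syncmtx, NULL);
        o->softlockcnt = 1;
        reclaim_async(o);
        obj_free(o);
        return (0);
}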
--- a/usr/src/uts/common/vm/seg_vn.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg_vn.c Thu May 22 22:23:49 2008 -0700 @@ -162,6 +162,11 @@ size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ +size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */ +size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */ +uint_t segvn_pglock_comb_bshift; +size_t segvn_pglock_comb_palign; + static int segvn_concat(struct seg *, struct seg *, int); static int segvn_extend_prev(struct seg *, struct seg *, struct segvn_crargs *, size_t); @@ -180,13 +185,15 @@ caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, u_offset_t, struct vpage *, page_t **, uint_t, - enum fault_type, enum seg_rw, int, int); + enum fault_type, enum seg_rw, int); static void segvn_vpage(struct seg *); static size_t segvn_count_swap_by_vpages(struct seg *); static void segvn_purge(struct seg *seg); -static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, - enum seg_rw); +static int segvn_reclaim(void *, caddr_t, size_t, struct page **, + enum seg_rw, int); +static int shamp_reclaim(void *, caddr_t, size_t, struct page **, + enum seg_rw, int); static int sameprot(struct seg *, caddr_t, size_t); @@ -199,9 +206,6 @@ static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t, size_t, void *, u_offset_t); -static int segvn_slock_anonpages(page_t *, int); -static void segvn_sunlock_anonpages(page_t *, int); - static struct kmem_cache *segvn_cache; static struct kmem_cache **segvn_szc_cache; @@ -212,7 +216,7 @@ ulong_t fullszcpages[10]; ulong_t relocatepages[3]; ulong_t fltanpages[17]; - ulong_t pagelock[3]; + ulong_t pagelock[2]; ulong_t demoterange[3]; } segvnvmstats; #endif /* VM_STATS */ @@ -240,7 +244,7 @@ struct segvn_data *svd = buf; rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); - mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); svd->svn_trnext = svd->svn_trprev = NULL; return (0); } @@ -252,7 +256,7 @@ struct segvn_data *svd = buf; rw_destroy(&svd->lock); - mutex_destroy(&svd->segp_slock); + mutex_destroy(&svd->segfree_syncmtx); } /*ARGSUSED*/ @@ -467,6 +471,13 @@ NULL, 0, &p0, TS_RUN, minclsyspri); } #endif + + if (!ISP2(segvn_pglock_comb_balign) || + segvn_pglock_comb_balign < PAGESIZE) { + segvn_pglock_comb_balign = 1UL << 16; /* 64K */ + } + segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1; + segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign); } #define SEGVN_PAGEIO ((void *)0x1) @@ -786,6 +797,8 @@ svd->pageadvice = 0; svd->flags = (ushort_t)a->flags; svd->softlockcnt = 0; + svd->softlockcnt_sbase = 0; + svd->softlockcnt_send = 0; svd->rcookie = HAT_INVALID_REGION_COOKIE; svd->pageswap = 0; @@ -991,7 +1004,7 @@ (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || incompat(type) || incompat(cred) || incompat(flags) || seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || - (svd2->softlockcnt > 0)) + (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0) return (-1); #undef incompat @@ -1232,7 +1245,7 @@ if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || (!svd1->pageprot && (svd1->prot != a->prot)) || svd1->type != a->type || svd1->flags != a->flags || - seg1->s_szc != a->szc) + seg1->s_szc != a->szc || svd1->softlockcnt_send > 0) return (-1); /* vp == NULL implies zfod, offset doesn't matter */ @@ -1353,7 +1366,7 @@ if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 
(!svd2->pageprot && (svd2->prot != a->prot)) || svd2->type != a->type || svd2->flags != a->flags || - seg2->s_szc != a->szc) + seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0) return (-1); /* vp == NULL implies zfod, offset doesn't matter */ if (svd2->vp != NULL && @@ -1498,6 +1511,8 @@ newsvd->pageswap = svd->pageswap; newsvd->flags = svd->flags; newsvd->softlockcnt = 0; + newsvd->softlockcnt_sbase = 0; + newsvd->softlockcnt_send = 0; newsvd->policy_info = svd->policy_info; newsvd->rcookie = HAT_INVALID_REGION_COOKIE; @@ -1797,6 +1812,15 @@ retry: if (svd->softlockcnt > 0) { ASSERT(svd->tr_state == SEGVN_TR_OFF); + + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + return (EAGAIN); + } + /* * since we do have the writers lock nobody can fill * the cache during the purge. The flush either succeeds @@ -1946,6 +1970,16 @@ ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + if (svd->type == MAP_SHARED) { + ASSERT(amp->refcnt == 1); + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + } + /* * Free up now unused parts of anon_map array. */ if (amp->a_szc == seg->s_szc) { @@ -2040,6 +2074,18 @@ * Free up now unused parts of anon_map array. */ ulong_t an_idx = svd->anon_index + npages; + + /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + if (svd->type == MAP_SHARED) { + ASSERT(amp->refcnt == 1); + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + } + if (amp->a_szc == seg->s_szc) { if (seg->s_szc != 0) { anon_free_pages(amp->ahp, @@ -2123,6 +2169,8 @@ nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); nsvd->swresv = 0; nsvd->softlockcnt = 0; + nsvd->softlockcnt_sbase = 0; + nsvd->softlockcnt_send = 0; ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); if (svd->vp != NULL) { @@ -2173,6 +2221,18 @@ * Free up now unused parts of anon_map array. */ ulong_t an_idx = svd->anon_index + opages; + + /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + if (svd->type == MAP_SHARED) { + ASSERT(amp->refcnt == 1); + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + } + if (amp->a_szc == seg->s_szc) { if (seg->s_szc != 0) { anon_free_pages(amp->ahp, an_idx, len, @@ -2316,6 +2376,15 @@ seg->s_size); } } else { + + /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + /* * Shared - anon_free the entire * anon_map's worth of stuff and @@ -2380,155 +2449,19 @@ svd->pageswap = 0; svd->cred = NULL; + /* + * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's + * still working with this segment without holding as lock (in case + * it's called by pcache async thread). 
+ */ + ASSERT(svd->softlockcnt == 0); + mutex_enter(&svd->segfree_syncmtx); + mutex_exit(&svd->segfree_syncmtx); + seg->s_data = NULL; kmem_cache_free(segvn_cache, svd); } -#ifdef DEBUG -uint32_t segvn_slock_mtbf = 0; -#endif - -ulong_t segvn_lpglck_limit = 0; - -/* - * Support routines used by segvn_pagelock() and softlock faults for anonymous - * pages to implement availrmem accounting in a way that makes sure the - * same memory is accounted just once for all softlock/pagelock purposes. - * This prevents a bug when availrmem is quickly incorrectly exhausted from - * several pagelocks to different parts of the same large page since each - * pagelock has to decrement availrmem by the size of the entire large - * page. Note those pages are not COW shared until softunlock/pageunlock so - * we don't need to use cow style accounting here. We also need to make sure - * the entire large page is accounted even if softlock range is less than the - * entire large page because large anon pages can't be demoted when any of - * constituent pages is locked. The caller calls this routine for every page_t - * it locks. The very first page in the range may not be the root page of a - * large page. For all other pages it's guaranteed we are going to visit the - * root of a particular large page before any other constituent page as we are - * locking sequential pages belonging to the same anon map. So we do all the - * locking when the root is encountered except for the very first page. Since - * softlocking is not supported (except S_READ_NOCOW special case) for vmpss - * segments and since vnode pages can be demoted without locking all - * constituent pages vnode pages don't come here. Unlocking relies on the - * fact that pagesize can't change whenever any of constituent large pages is - * locked at least SE_SHARED. This allows unlocking code to find the right - * root and decrement availrmem by the same amount it was incremented when the - * page was locked. - */ -static int -segvn_slock_anonpages(page_t *pp, int first) -{ - pgcnt_t pages; - pfn_t pfn; - uchar_t szc = pp->p_szc; - - ASSERT(PAGE_LOCKED(pp)); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - - /* - * pagesize won't change as long as any constituent page is locked. - */ - pages = page_get_pagecnt(pp->p_szc); - pfn = page_pptonum(pp); - - if (!first) { - if (!IS_P2ALIGNED(pfn, pages)) { -#ifdef DEBUG - pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; - pfn = page_pptonum(pp); - ASSERT(IS_P2ALIGNED(pfn, pages)); - ASSERT(pp->p_szc == szc); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - ASSERT(pp->p_slckcnt != 0); -#endif /* DEBUG */ - return (1); - } - } else if (!IS_P2ALIGNED(pfn, pages)) { - pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; -#ifdef DEBUG - pfn = page_pptonum(pp); - ASSERT(IS_P2ALIGNED(pfn, pages)); - ASSERT(pp->p_szc == szc); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); -#endif /* DEBUG */ - } - -#ifdef DEBUG - if (segvn_slock_mtbf && !(gethrtime() % segvn_slock_mtbf)) { - return (0); - } -#endif /* DEBUG */ - - /* - * pp is a root page. - * We haven't locked this large page yet. 
- */ - page_struct_lock(pp); - if (pp->p_slckcnt != 0) { - if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { - pp->p_slckcnt++; - page_struct_unlock(pp); - return (1); - } - page_struct_unlock(pp); - segvn_lpglck_limit++; - return (0); - } - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + pages) { - mutex_exit(&freemem_lock); - page_struct_unlock(pp); - return (0); - } - pp->p_slckcnt++; - availrmem -= pages; - mutex_exit(&freemem_lock); - page_struct_unlock(pp); - return (1); -} - -static void -segvn_sunlock_anonpages(page_t *pp, int first) -{ - pgcnt_t pages; - pfn_t pfn; - - ASSERT(PAGE_LOCKED(pp)); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - - /* - * pagesize won't change as long as any constituent page is locked. - */ - pages = page_get_pagecnt(pp->p_szc); - pfn = page_pptonum(pp); - - if (!first) { - if (!IS_P2ALIGNED(pfn, pages)) { - return; - } - } else if (!IS_P2ALIGNED(pfn, pages)) { - pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; -#ifdef DEBUG - pfn = page_pptonum(pp); - ASSERT(IS_P2ALIGNED(pfn, pages)); -#endif /* DEBUG */ - } - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - ASSERT(pp->p_slckcnt != 0); - page_struct_lock(pp); - if (--pp->p_slckcnt == 0) { - mutex_enter(&freemem_lock); - availrmem += pages; - mutex_exit(&freemem_lock); - } - page_struct_unlock(pp); -} - /* * Do a F_SOFTUNLOCK call over the range requested. The range must have * already been F_SOFTLOCK'ed. @@ -2601,19 +2534,10 @@ } TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); - if (svd->vp == NULL) { - segvn_sunlock_anonpages(pp, adr == addr); - } page_unlock(pp); } - mutex_enter(&freemem_lock); /* for availrmem */ - if (svd->vp != NULL) { - availrmem += btop(len); - } - segvn_pages_locked -= btop(len); - svd->softlockcnt -= btop(len); - mutex_exit(&freemem_lock); - if (svd->softlockcnt == 0) { + ASSERT(svd->softlockcnt >= btop(len)); + if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) { /* * All SOFTLOCKS are gone. Wakeup any waiting * unmappers so they can try again to unmap. 
@@ -2691,8 +2615,7 @@ uint_t vpprot, /* access allowed to object pages */ enum fault_type type, /* type of fault */ enum seg_rw rw, /* type of access at fault */ - int brkcow, /* we may need to break cow */ - int first) /* first page for this fault if 1 */ + int brkcow) /* we may need to break cow */ { struct segvn_data *svd = (struct segvn_data *)seg->s_data; page_t *pp, **ppp; @@ -2749,17 +2672,8 @@ prot = svd->prot; } - if (type == F_SOFTLOCK && svd->vp != NULL) { - mutex_enter(&freemem_lock); - if (availrmem <= tune.t_minarmem) { - mutex_exit(&freemem_lock); - return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ - } else { - availrmem--; - svd->softlockcnt++; - segvn_pages_locked++; - } - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long((ulong_t *)&svd->softlockcnt, 1); } /* @@ -2809,19 +2723,6 @@ if (lgrp_optimizations()) page_migrate(seg, addr, &pp, 1); - if (type == F_SOFTLOCK) { - if (!segvn_slock_anonpages(pp, first)) { - page_unlock(pp); - err = ENOMEM; - goto out; - } else { - mutex_enter(&freemem_lock); - svd->softlockcnt++; - segvn_pages_locked++; - mutex_exit(&freemem_lock); - } - } - if (enable_mbit_wa) { if (rw == S_WRITE) hat_setmod(pp); @@ -2981,23 +2882,6 @@ if (lgrp_optimizations()) page_migrate(seg, addr, &opp, 1); - if (type == F_SOFTLOCK && svd->vp == NULL) { - - ASSERT(opp->p_szc == 0 || - (svd->type == MAP_SHARED && - amp != NULL && amp->a_szc != 0)); - - if (!segvn_slock_anonpages(opp, first)) { - page_unlock(opp); - err = ENOMEM; - goto out; - } else { - mutex_enter(&freemem_lock); - svd->softlockcnt++; - segvn_pages_locked++; - mutex_exit(&freemem_lock); - } - } if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { if (rw == S_WRITE) hat_setmod(opp); @@ -3124,18 +3008,6 @@ page_migrate(seg, addr, &pp, 1); ASSERT(pp->p_szc == 0); - if (type == F_SOFTLOCK && svd->vp == NULL) { - if (!segvn_slock_anonpages(pp, first)) { - page_unlock(pp); - err = ENOMEM; - goto out; - } else { - mutex_enter(&freemem_lock); - svd->softlockcnt++; - segvn_pages_locked++; - mutex_exit(&freemem_lock); - } - } ASSERT(!IS_VMODSORT(pp->p_vnode)); if (enable_mbit_wa) { @@ -3158,12 +3030,8 @@ if (anon_lock) anon_array_exit(&cookie); - if (type == F_SOFTLOCK && svd->vp != NULL) { - mutex_enter(&freemem_lock); - availrmem++; - segvn_pages_locked--; - svd->softlockcnt--; - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long((ulong_t *)&svd->softlockcnt, -1); } return (FC_MAKE_ERR(err)); } @@ -3819,13 +3687,10 @@ int segvn_anypgsz = 0; -#define SEGVN_RESTORE_SOFTLOCK(type, pages) \ - if ((type) == F_SOFTLOCK) { \ - mutex_enter(&freemem_lock); \ - availrmem += (pages); \ - segvn_pages_locked -= (pages); \ - svd->softlockcnt -= (pages); \ - mutex_exit(&freemem_lock); \ +#define SEGVN_RESTORE_SOFTLOCK_VP(type, pages) \ + if ((type) == F_SOFTLOCK) { \ + atomic_add_long((ulong_t *)&(svd)->softlockcnt, \ + -(pages)); \ } #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ @@ -4032,17 +3897,8 @@ } } if (type == F_SOFTLOCK) { - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + pages) { - mutex_exit(&freemem_lock); - err = FC_MAKE_ERR(ENOMEM); - goto out; - } else { - availrmem -= pages; - segvn_pages_locked += pages; - svd->softlockcnt += pages; - } - mutex_exit(&freemem_lock); + atomic_add_long((ulong_t *)&svd->softlockcnt, + pages); } pplist = NULL; @@ -4123,7 +3979,7 @@ page_free_replacement_page(pplist); page_create_putback(pages); } - SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_RESTORE_SOFTLOCK_VP(type, pages); if (a + pgsz <= 
eaddr) { SEGVN_VMSTAT_FLTVNPAGES(19); err = FC_MAKE_ERR(ierr); @@ -4179,7 +4035,7 @@ page_free_replacement_page(pplist); page_create_putback(pages); } - SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_RESTORE_SOFTLOCK_VP(type, pages); if (szc < seg->s_szc) { SEGVN_VMSTAT_FLTVNPAGES(26); /* @@ -4226,7 +4082,7 @@ SEGVN_VMSTAT_FLTVNPAGES(28); anon_array_exit(&an_cookie); ANON_LOCK_EXIT(&->a_rwlock); - SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_RESTORE_SOFTLOCK_VP(type, pages); err = FC_MAKE_ERR(ierr); goto out; } @@ -4724,9 +4580,7 @@ ulong_t i; int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; anon_sync_obj_t cookie; - int first = 1; int adjszc_chk; - int purged = 0; int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0; ASSERT(szc != 0); @@ -4794,18 +4648,9 @@ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); } - if (type == F_SOFTLOCK && svd->vp != NULL) { - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + pages) { - mutex_exit(&freemem_lock); - err = FC_MAKE_ERR(ENOMEM); - goto error; - } else { - availrmem -= pages; - segvn_pages_locked += pages; - svd->softlockcnt += pages; - } - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long((ulong_t *)&svd->softlockcnt, + pages); } anon_array_enter(amp, aindx, &cookie); ppa_szc = (uint_t)-1; @@ -4815,13 +4660,10 @@ if (ierr != 0) { anon_array_exit(&cookie); VM_STAT_ADD(segvnvmstats.fltanpages[4]); - if (type == F_SOFTLOCK && svd->vp != NULL) { - VM_STAT_ADD(segvnvmstats.fltanpages[5]); - mutex_enter(&freemem_lock); - availrmem += pages; - segvn_pages_locked -= pages; - svd->softlockcnt -= pages; - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long( + (ulong_t *)&svd->softlockcnt, + -pages); } if (ierr > 0) { VM_STAT_ADD(segvnvmstats.fltanpages[6]); @@ -4845,41 +4687,6 @@ page_migrate(seg, a, ppa, pages); ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); - if (type == F_SOFTLOCK && svd->vp == NULL) { - /* - * If all pages in ppa array belong to the same - * large page call segvn_slock_anonpages() - * just for ppa[0]. - */ - for (i = 0; i < pages; i++) { - if (!segvn_slock_anonpages(ppa[i], - i == 0 && first)) { - ulong_t j; - for (j = 0; j < i; j++) { - segvn_sunlock_anonpages( - ppa[j], j == 0 && - first); - page_unlock(ppa[j]); - } - for (j = i; j < pages; j++) { - page_unlock(ppa[j]); - } - anon_array_exit(&cookie); - err = FC_MAKE_ERR(ENOMEM); - goto error; - } - if (i == 0 && ppa[0]->p_szc >= szc) { - ASSERT(!(page_pptonum(ppa[0]) & - (pages - 1))); - break; - } - } - first = 0; - mutex_enter(&freemem_lock); - svd->softlockcnt += pages; - segvn_pages_locked += pages; - mutex_exit(&freemem_lock); - } if (segtype == MAP_SHARED) { vpprot |= PROT_WRITE; @@ -4920,17 +4727,6 @@ * have relocated locked pages. */ ASSERT(ierr == -1 || ierr == -2); - /* - * For the very first relocation failure try to purge this - * segment's cache so that the relocator can obtain an - * exclusive lock on pages we want to relocate. 
- */ - if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && - svd->softlockcnt != 0) { - purged = 1; - segvn_purge(seg); - continue; - } if (segvn_anypgsz) { ASSERT(ierr == -2 || szc != 0); @@ -5613,7 +5409,7 @@ */ for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, - type, rw, brkcow, a == addr); + type, rw, brkcow); if (err) { if (amp != NULL) ANON_LOCK_EXIT(&->a_rwlock); @@ -5826,6 +5622,16 @@ */ if (svd->softlockcnt > 0) { ASSERT(svd->tr_state == SEGVN_TR_OFF); + + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + /* * Since we do have the segvn writers lock nobody can fill * the cache with entries belonging to this seg during @@ -6084,15 +5890,17 @@ if ((VPP_PROT(svp) ^ prot) & PROT_WRITE) { if (prot & PROT_WRITE) { - if (!page_addclaim(pp)) { - page_unlock(pp); - break; - } + if (!page_addclaim( + pp)) { + page_unlock(pp); + break; + } } else { - if (!page_subclaim(pp)) { - page_unlock(pp); - break; - } + if (!page_subclaim( + pp)) { + page_unlock(pp); + break; + } } } page_unlock(pp); @@ -6257,6 +6065,15 @@ */ if (svd->softlockcnt > 0) { ASSERT(svd->tr_state == SEGVN_TR_OFF); + + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + return (EAGAIN); + } + /* * Since we do have the segvn writers lock nobody can fill * the cache with entries belonging to this seg during @@ -6339,6 +6156,13 @@ } nsvd = (struct segvn_data *)nseg->s_data; if (nsvd->softlockcnt > 0) { + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (nsvd->type == MAP_SHARED) { + return (EAGAIN); + } segvn_purge(nseg); if (nsvd->softlockcnt > 0) { return (EAGAIN); @@ -6698,6 +6522,8 @@ } ASSERT(svd->softlockcnt == 0); + ASSERT(svd->softlockcnt_sbase == 0); + ASSERT(svd->softlockcnt_send == 0); crhold(svd->cred); if (svd->vpage != NULL) { @@ -7336,11 +7162,20 @@ if (svd->softlockcnt > 0) { /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + + /* * flush all pages from seg cache * otherwise we may deadlock in swap_putpage * for B_INVAL page (4175402). * - * Even if we grab segvn WRITER's lock or segp_slock + * Even if we grab segvn WRITER's lock * here, there might be another thread which could've * successfully performed lookup/insert just before * we acquired the lock here. So, grabbing either @@ -7354,6 +7189,18 @@ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); return (EAGAIN); } + } else if (svd->type == MAP_SHARED && svd->amp != NULL && + svd->amp->a_softlockcnt > 0) { + /* + * Try to purge this amp's entries from pcache. It will + * succeed only if other segments that share the amp have no + * outstanding softlock's. 
+ */ + segvn_purge(seg); + if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } } vpp = svd->vpage; @@ -7904,8 +7751,10 @@ vp = svd->vp; off = offset; } - anon_array_exit(&cookie); - ANON_LOCK_EXIT(&->a_rwlock); + if (op != MC_LOCK || ap == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } } else { vp = svd->vp; off = offset; @@ -7933,6 +7782,11 @@ (uint_t *)NULL, pl, PAGESIZE, seg, addr, S_OTHER, svd->cred, NULL); + if (error && ap != NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + /* * If the error is EDEADLK then we must bounce * up and drop all vm subsystem locks and then @@ -8004,6 +7858,13 @@ ASSERT(!VPP_ISPPLOCK(vpp)); ret = page_pp_lock(pp, claim, 0); + if (ap != NULL) { + if (ap->an_pvp != NULL) { + anon_swap_free(ap, pp); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } if (ret == 0) { /* locking page failed */ page_unlock(pp); @@ -8146,6 +8007,14 @@ */ if (svd->softlockcnt > 0) { /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + /* * Since we do have the segvn writers lock * nobody can fill the cache with entries * belonging to this seg during the purge. @@ -8164,6 +8033,14 @@ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); return (EAGAIN); } + } else if (svd->type == MAP_SHARED && svd->amp != NULL && + svd->amp->a_softlockcnt > 0) { + /* + * Try to purge this amp's entries from pcache. It + * will succeed only if other segments that share the + * amp have no outstanding softlock's. + */ + segvn_purge(seg); } } @@ -8182,6 +8059,8 @@ return (0); } + segvn_purge(seg); + page = seg_page(seg, addr); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); anon_disclaim(amp, svd->anon_index + page, len); @@ -8623,59 +8502,289 @@ ANON_LOCK_EXIT(&->a_rwlock); } +#ifdef DEBUG +static uint32_t segvn_pglock_mtbf = 0; +#endif + +#define PCACHE_SHWLIST ((page_t *)-2) +#define NOPCACHE_SHWLIST ((page_t *)-1) + /* - * lock/unlock anon pages over a given range. Return shadow list + * Lock/Unlock anon pages over a given range. Return shadow list. This routine + * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages + * to avoid the overhead of per page locking, unlocking for subsequent IOs to + * the same parts of the segment. Currently shadow list creation is only + * supported for pure anon segments. MAP_PRIVATE segment pcache entries are + * tagged with segment pointer, starting virtual address and length. This + * approach for MAP_SHARED segments may add many pcache entries for the same + * set of pages and lead to long hash chains that decrease pcache lookup + * performance. To avoid this issue for shared segments shared anon map and + * starting anon index are used for pcache entry tagging. This allows all + * segments to share pcache entries for the same anon range and reduces pcache + * chain's length as well as memory overhead from duplicate shadow lists and + * pcache entries. + * + * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd + * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock + * part of softlockcnt accounting is done differently for private and shared + * segments. In private segment case softlock is only incremented when a new + * shadow list is created but not when an existing one is found via + * seg_plookup(). 
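/*
 * Illustrative user-space sketch (not illumos code) of the two pcache
 * tagging schemes described above: MAP_PRIVATE entries are keyed by
 * (segment, virtual address) while MAP_SHARED entries are keyed by
 * (anon map, anon byte offset), so every segment sharing the amp
 * resolves to the same pcache entry.  Struct and function names below
 * are hypothetical stand-ins, not kernel definitions.
 */
#include <stdint.h>

#define	PAGESHIFT	12
#define	MAP_PRIVATE	1
#define	MAP_SHARED	2

struct amp_tag { int unused; };
struct seg_tag { uintptr_t s_base; };
struct svd_tag { int type; struct amp_tag *amp; unsigned long anon_index; };

static void
pcache_tag(struct seg_tag *seg, struct svd_tag *svd, uintptr_t lpgaddr,
    void **ptag, uintptr_t *paddr)
{
	if (svd->type == MAP_SHARED) {
		*ptag = svd->amp;
		*paddr = (lpgaddr - seg->s_base) +
		    ((uintptr_t)svd->anon_index << PAGESHIFT);
	} else {
		*ptag = (void *)seg;
		*paddr = lpgaddr;
	}
}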
pcache entries have reference count incremented/decremented + * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0 + * reference count can be purged (and purging is needed before segment can be + * freed). When a private segment pcache entry is purged segvn_reclaim() will + * decrement softlockcnt. Since in private segment case each of its pcache + * entries only belongs to this segment we can expect that when + * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this + * segment purge will succeed and softlockcnt will drop to 0. In shared + * segment case reference count in pcache entry counts active locks from many + * different segments so we can't expect segment purging to succeed even when + * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this + * segment. To be able to determine when there're no pending pagelocks in + * shared segment case we don't rely on purging to make softlockcnt drop to 0 + * but instead softlockcnt is incremented and decremented for every + * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow + * list was created or an existing one was found. When softlockcnt drops to 0 + * this segment no longer has any claims for pcached shadow lists and the + * segment can be freed even if there're still active pcache entries + * shared by this segment anon map. Shared segment pcache entries belong to + * anon map and are typically removed when anon map is freed after all + * processes destroy the segments that use this anon map. */ static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, enum lock_type type, enum seg_rw rw) { struct segvn_data *svd = (struct segvn_data *)seg->s_data; - size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); + size_t np; + pgcnt_t adjustpages; + pgcnt_t npages; ulong_t anon_index; - uint_t protchk; + uint_t protchk = (rw == S_READ) ? PROT_READ : PROT_WRITE; uint_t error; struct anon_map *amp; + pgcnt_t anpgcnt; struct page **pplist, **pl, *pp; caddr_t a; size_t page; caddr_t lpgaddr, lpgeaddr; - pgcnt_t szc0_npages = 0; + anon_sync_obj_t cookie; + int anlock; + struct anon_map *pamp; + caddr_t paddr; + seg_preclaim_cbfunc_t preclaim_callback; + size_t pgsz; + int use_pcache; + size_t wlen; + uint_t pflags = 0; + int sftlck_sbase = 0; + int sftlck_send = 0; + +#ifdef DEBUG + if (type == L_PAGELOCK && segvn_pglock_mtbf) { + hrtime_t ts = gethrtime(); + if ((ts % segvn_pglock_mtbf) == 0) { + return (ENOTSUP); + } + if ((ts % segvn_pglock_mtbf) == 1) { + return (EFAULT); + } + } +#endif TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, "segvn_pagelock: start seg %p addr %p", seg, addr); ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); - if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { + ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + /* + * for now we only support pagelock to anon memory. We would have to + * check protections for vnode objects and call into the vnode driver. + * That's too much for a fast path. Let the fault entry point handle + * it. 
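/*
 * Sketch of the DEBUG-only mean-time-between-failures injection shown in
 * the #ifdef DEBUG block above, recast as a self-contained user-space
 * helper (hypothetical names; assumes POSIX clock_gettime()).  With a
 * non-zero mtbf roughly two out of every mtbf calls fail artificially,
 * which forces callers to exercise their slow-path fallbacks.
 */
#include <errno.h>
#include <stdint.h>
#include <time.h>

static uint32_t pglock_mtbf;		/* 0 disables injection */

static int
pglock_inject_error(void)
{
	struct timespec ts;
	uint64_t now;

	if (pglock_mtbf == 0)
		return (0);
	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	now = (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
	if (now % pglock_mtbf == 0)
		return (ENOTSUP);	/* caller falls back to the fault path */
	if (now % pglock_mtbf == 1)
		return (EFAULT);
	return (0);
}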
+ */ + if (svd->vp != NULL) { + if (type == L_PAGELOCK) { + error = ENOTSUP; + goto out; + } + panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL"); + } + if ((amp = svd->amp) == NULL) { + if (type == L_PAGELOCK) { + error = EFAULT; + goto out; + } + panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL"); + } + if (rw != S_READ && rw != S_WRITE) { + if (type == L_PAGELOCK) { + error = ENOTSUP; + goto out; + } + panic("segvn_pagelock(L_PAGEUNLOCK): bad rw"); + } + + if (seg->s_szc != 0) { /* * We are adjusting the pagelock region to the large page size * boundary because the unlocked part of a large page cannot * be freed anyway unless all constituent pages of a large - * page are locked. Therefore this adjustment allows us to - * decrement availrmem by the right value (note we don't want - * to just decrement availrem by the large page size without - * adjusting addr and len because then we may end up - * decrementing availrmem by large page size for every - * constituent page locked by a new as_pagelock call). - * as_pageunlock caller must always match as_pagelock call's - * addr and len. + * page are locked. Bigger regions reduce pcache chain length + * and improve lookup performance. The tradeoff is that the + * very first segvn_pagelock() call for a given page is more + * expensive if only 1 page_t is needed for IO. This is only + * an issue if pcache entry doesn't get reused by several + * subsequent calls. We optimize here for the case when pcache + * is heavily used by repeated IOs to the same address range. * * Note segment's page size cannot change while we are holding * as lock. And then it cannot change while softlockcnt is * not 0. This will allow us to correctly recalculate large - * page size region for the matching pageunlock/reclaim call. + * page size region for the matching pageunlock/reclaim call + * since as_pageunlock() caller must always match + * as_pagelock() call's addr and len. * - * for pageunlock *ppp points to the pointer of page_t that + * For pageunlock *ppp points to the pointer of page_t that * corresponds to the real unadjusted start address. Similar * for pagelock *ppp must point to the pointer of page_t that * corresponds to the real unadjusted start address. */ - size_t pgsz = page_get_pagesize(seg->s_szc); + pgsz = page_get_pagesize(seg->s_szc); CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); - adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; + adjustpages = btop((uintptr_t)(addr - lpgaddr)); + } else if (len < segvn_pglock_comb_thrshld) { + lpgaddr = addr; + lpgeaddr = addr + len; + adjustpages = 0; + pgsz = PAGESIZE; + } else { + /* + * Align the address range of large enough requests to allow + * combining of different shadow lists into 1 to reduce memory + * overhead from potentially overlapping large shadow lists + * (worst case is we have a 1MB IO into buffers with start + * addresses separated by 4K). Alignment is only possible if + * padded chunks have sufficient access permissions. Note + * permissions won't change between L_PAGELOCK and + * L_PAGEUNLOCK calls since non 0 softlockcnt will force + * segvn_setprot() to wait until softlockcnt drops to 0. This + * allows us to determine in L_PAGEUNLOCK the same range we + * computed in L_PAGELOCK. + * + * If alignment is limited by segment ends set + * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when + * these flags are set bump softlockcnt_sbase/softlockcnt_send + * per segment counters. 
In L_PAGEUNLOCK case decrease + * softlockcnt_sbase/softlockcnt_send counters if + * sftlck_sbase/sftlck_send flags are set. When + * softlockcnt_sbase/softlockcnt_send are non 0 + * segvn_concat()/segvn_extend_prev()/segvn_extend_next() + * won't merge the segments. This restriction combined with + * restriction on segment unmapping and splitting for segments + * that have non 0 softlockcnt allows L_PAGEUNLOCK to + * correctly determine the same range that was previously + * locked by matching L_PAGELOCK. + */ + pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16); + pgsz = PAGESIZE; + if (svd->type == MAP_PRIVATE) { + lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr, + segvn_pglock_comb_balign); + if (lpgaddr < seg->s_base) { + lpgaddr = seg->s_base; + sftlck_sbase = 1; + } + } else { + ulong_t aix = svd->anon_index + seg_page(seg, addr); + ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign); + if (aaix < svd->anon_index) { + lpgaddr = seg->s_base; + sftlck_sbase = 1; + } else { + lpgaddr = addr - ptob(aix - aaix); + ASSERT(lpgaddr >= seg->s_base); + } + } + if (svd->pageprot && lpgaddr != addr) { + struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)]; + struct vpage *evp = &svd->vpage[seg_page(seg, addr)]; + while (vp < evp) { + if ((VPP_PROT(vp) & protchk) == 0) { + break; + } + vp++; + } + if (vp < evp) { + lpgaddr = addr; + pflags = 0; + } + } + lpgeaddr = addr + len; + if (pflags) { + if (svd->type == MAP_PRIVATE) { + lpgeaddr = (caddr_t)P2ROUNDUP( + (uintptr_t)lpgeaddr, + segvn_pglock_comb_balign); + } else { + ulong_t aix = svd->anon_index + + seg_page(seg, lpgeaddr); + ulong_t aaix = P2ROUNDUP(aix, + segvn_pglock_comb_palign); + if (aaix < aix) { + lpgeaddr = 0; + } else { + lpgeaddr += ptob(aaix - aix); + } + } + if (lpgeaddr == 0 || + lpgeaddr > seg->s_base + seg->s_size) { + lpgeaddr = seg->s_base + seg->s_size; + sftlck_send = 1; + } + } + if (svd->pageprot && lpgeaddr != addr + len) { + struct vpage *vp; + struct vpage *evp; + + vp = &svd->vpage[seg_page(seg, addr + len)]; + evp = &svd->vpage[seg_page(seg, lpgeaddr)]; + + while (vp < evp) { + if ((VPP_PROT(vp) & protchk) == 0) { + break; + } + vp++; + } + if (vp < evp) { + lpgeaddr = addr + len; + } + } + adjustpages = btop((uintptr_t)(addr - lpgaddr)); + } + + /* + * For MAP_SHARED segments we create pcache entries tagged by amp and + * anon index so that we can share pcache entries with other segments + * that map this amp. For private segments pcache entries are tagged + * with segment and virtual address. + */ + if (svd->type == MAP_SHARED) { + pamp = amp; + paddr = (caddr_t)((lpgaddr - seg->s_base) + + ptob(svd->anon_index)); + preclaim_callback = shamp_reclaim; + } else { + pamp = NULL; + paddr = lpgaddr; + preclaim_callback = segvn_reclaim; } if (type == L_PAGEUNLOCK) { + VM_STAT_ADD(segvnvmstats.pagelock[0]); /* * update hat ref bits for /proc. We need to make sure @@ -8694,13 +8803,50 @@ } } } - SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); - if (seg->s_szc != 0) { - VM_STAT_ADD(segvnvmstats.pagelock[0]); - seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, - *ppp - adjustpages, rw, segvn_reclaim); + + /* + * Check the shadow list entry after the last page used in + * this IO request. If it's NOPCACHE_SHWLIST the shadow list + * was not inserted into pcache and is not large page + * adjusted. In this case call reclaim callback directly and + * don't adjust the shadow list start and size for large + * pages. 
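/*
 * Sketch (user-space, hypothetical names) of the sentinel convention
 * described above: the shadow list is allocated with one extra slot and
 * the last slot records whether the list was large-page adjusted and
 * inserted into pcache (PCACHE_SHWLIST) or built as-is for a one-shot
 * reclaim (NOPCACHE_SHWLIST), so the unlock side can tell which address
 * range to use without any extra bookkeeping structure.
 */
#include <stdlib.h>

typedef struct page page_t;

#define	PCACHE_SHWLIST		((page_t *)-2)
#define	NOPCACHE_SHWLIST	((page_t *)-1)

static page_t **
shadow_list_alloc(size_t npages, int use_pcache)
{
	page_t **pl = calloc(npages + 1, sizeof (page_t *));

	if (pl != NULL)
		pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST;
	return (pl);
}

static int
shadow_list_was_cached(page_t **pl, size_t npages)
{
	return (pl[npages] == PCACHE_SHWLIST);
}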
+ */ + npages = btop(len); + if ((*ppp)[npages] == NOPCACHE_SHWLIST) { + void *ptag; + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + ptag = (void *)pamp; + paddr = (caddr_t)((addr - seg->s_base) + + ptob(svd->anon_index)); + } else { + ptag = (void *)seg; + paddr = addr; + } + (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0); } else { - seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); + ASSERT((*ppp)[npages] == PCACHE_SHWLIST || + IS_SWAPFSVP((*ppp)[npages]->p_vnode)); + len = lpgeaddr - lpgaddr; + npages = btop(len); + seg_pinactive(seg, pamp, paddr, len, + *ppp - adjustpages, rw, pflags, preclaim_callback); + } + + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + ASSERT(svd->softlockcnt >= npages); + atomic_add_long((ulong_t *)&svd->softlockcnt, -npages); + } + + if (sftlck_sbase) { + ASSERT(svd->softlockcnt_sbase > 0); + atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, -1); + } + if (sftlck_send) { + ASSERT(svd->softlockcnt_send > 0); + atomic_add_long((ulong_t *)&svd->softlockcnt_send, -1); } /* @@ -8711,77 +8857,97 @@ * raw async i/o is still in progress or where a thread * exits on data fault in a multithreaded application. */ - if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { - /* - * Even if we grab segvn WRITER's lock or segp_slock - * here, there might be another thread which could've - * successfully performed lookup/insert just before - * we acquired the lock here. So, grabbing either - * lock here is of not much use. Until we devise - * a strategy at upper layers to solve the - * synchronization issues completely, we expect - * applications to handle this appropriately. - */ - segvn_purge(seg); + if (AS_ISUNMAPWAIT(seg->s_as)) { + if (svd->softlockcnt == 0) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } else if (pamp == NULL) { + /* + * softlockcnt is not 0 and this is a + * MAP_PRIVATE segment. Try to purge its + * pcache entries to reduce softlockcnt. + * If it drops to 0 segvn_reclaim() + * will wake up a thread waiting on + * unmapwait flag. + * + * We don't purge MAP_SHARED segments with non + * 0 softlockcnt since IO is still in progress + * for such segments. + */ + ASSERT(svd->type == MAP_PRIVATE); + segvn_purge(seg); + } } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, "segvn_pagelock: unlock seg %p addr %p", seg, addr); return (0); - } else if (type == L_PAGERECLAIM) { - VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); - SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); - (void) segvn_reclaim(seg, addr, len, *ppp, rw); - SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, - "segvn_pagelock: reclaim seg %p addr %p", seg, addr); - return (0); - } - - if (seg->s_szc != 0) { - VM_STAT_ADD(segvnvmstats.pagelock[2]); - addr = lpgaddr; - len = lpgeaddr - lpgaddr; - npages = (len >> PAGESHIFT); - } - - /* - * for now we only support pagelock to anon memory. We've to check - * protections for vnode objects and call into the vnode driver. - * That's too much for a fast path. Let the fault entry point handle it. - */ - if (svd->vp != NULL) { - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, - "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); - *ppp = NULL; - return (ENOTSUP); - } - - /* - * if anonmap is not yet created, let the fault entry point populate it - * with anon ptrs. 
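/*
 * Worked user-space example of the P2ALIGN/P2ROUNDUP arithmetic that
 * segvn_pagelock() uses above to widen small requests to a common
 * combining boundary so that overlapping IOs share one pcache entry.
 * The boundary and names here are hypothetical; clamping to the segment
 * bounds corresponds to the sftlck_sbase/sftlck_send cases in the real
 * code.
 */
#include <stddef.h>
#include <stdint.h>

#define	P2ALIGN(x, a)	((x) & ~((uintptr_t)(a) - 1))
#define	P2ROUNDUP(x, a)	(-(-(uintptr_t)(x) & -(uintptr_t)(a)))

static void
combine_range(uintptr_t addr, size_t len, uintptr_t seg_base, size_t seg_size,
    uintptr_t comb_align, uintptr_t *lpgaddr, uintptr_t *lpgeaddr)
{
	*lpgaddr = P2ALIGN(addr, comb_align);
	if (*lpgaddr < seg_base)
		*lpgaddr = seg_base;			/* sftlck_sbase case */

	*lpgeaddr = P2ROUNDUP(addr + len, comb_align);
	if (*lpgeaddr == 0 || *lpgeaddr > seg_base + seg_size)
		*lpgeaddr = seg_base + seg_size;	/* sftlck_send case */
}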
- */ - if ((amp = svd->amp) == NULL) { - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, - "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); - *ppp = NULL; - return (EFAULT); - } - - SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); - - /* - * we acquire segp_slock to prevent duplicate entries - * in seg_pcache - */ - mutex_enter(&svd->segp_slock); + } + + /* The L_PAGELOCK case ... */ + + VM_STAT_ADD(segvnvmstats.pagelock[1]); + + /* + * For MAP_SHARED segments we have to check protections before + * seg_plookup() since pcache entries may be shared by many segments + * with potentially different page protections. + */ + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + if (svd->pageprot == 0) { + if ((svd->prot & protchk) == 0) { + error = EACCES; + goto out; + } + } else { + /* + * check page protections + */ + caddr_t ea; + + if (seg->s_szc) { + a = lpgaddr; + ea = lpgeaddr; + } else { + a = addr; + ea = addr + len; + } + for (; a < ea; a += pgsz) { + struct vpage *vp; + + ASSERT(seg->s_szc == 0 || + sameprot(seg, a, pgsz)); + vp = &svd->vpage[seg_page(seg, a)]; + if ((VPP_PROT(vp) & protchk) == 0) { + error = EACCES; + goto out; + } + } + } + } /* * try to find pages in segment page cache */ - pplist = seg_plookup(seg, addr, len, rw); + pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags); if (pplist != NULL) { - mutex_exit(&svd->segp_slock); + if (pamp != NULL) { + npages = btop((uintptr_t)(lpgeaddr - lpgaddr)); + ASSERT(svd->type == MAP_SHARED); + atomic_add_long((ulong_t *)&svd->softlockcnt, + npages); + } + if (sftlck_sbase) { + atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1); + } + if (sftlck_send) { + atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1); + } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); *ppp = pplist + adjustpages; TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, @@ -8789,145 +8955,211 @@ return (0); } - if (rw == S_READ) { - protchk = PROT_READ; - } else { - protchk = PROT_WRITE; - } - - if (svd->pageprot == 0) { - if ((svd->prot & protchk) == 0) { - mutex_exit(&svd->segp_slock); - error = EFAULT; - goto out; - } - } else { - /* - * check page protections - */ - for (a = addr; a < addr + len; a += PAGESIZE) { - struct vpage *vp; - - vp = &svd->vpage[seg_page(seg, a)]; - if ((VPP_PROT(vp) & protchk) == 0) { - mutex_exit(&svd->segp_slock); - error = EFAULT; + /* + * For MAP_SHARED segments we already verified above that segment + * protections allow this pagelock operation. + */ + if (pamp == NULL) { + ASSERT(svd->type == MAP_PRIVATE); + if (svd->pageprot == 0) { + if ((svd->prot & protchk) == 0) { + error = EACCES; goto out; } - } - } - - /* - * Avoid per page overhead of segvn_slock_anonpages() for small - * pages. For large pages segvn_slock_anonpages() only does real - * work once per large page. The tradeoff is that we may decrement - * availrmem more than once for the same page but this is ok - * for small pages. 
- */ - if (seg->s_szc == 0) { - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + npages) { - mutex_exit(&freemem_lock); - mutex_exit(&svd->segp_slock); - error = ENOMEM; - goto out; - } - availrmem -= npages; - mutex_exit(&freemem_lock); - } - - pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); + if (svd->prot & PROT_WRITE) { + wlen = lpgeaddr - lpgaddr; + } else { + wlen = 0; + ASSERT(rw == S_READ); + } + } else { + int wcont = 1; + /* + * check page protections + */ + for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) { + struct vpage *vp; + + ASSERT(seg->s_szc == 0 || + sameprot(seg, a, pgsz)); + vp = &svd->vpage[seg_page(seg, a)]; + if ((VPP_PROT(vp) & protchk) == 0) { + error = EACCES; + goto out; + } + if (wcont && (VPP_PROT(vp) & PROT_WRITE)) { + wlen += pgsz; + } else { + wcont = 0; + ASSERT(rw == S_READ); + } + } + } + ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr); + ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr); + } + + /* + * Only build large page adjusted shadow list if we expect to insert + * it into pcache. For large enough pages it's a big overhead to + * create a shadow list of the entire large page. But this overhead + * should be amortized over repeated pcache hits on subsequent reuse + * of this shadow list (IO into any range within this shadow list will + * find it in pcache since we large page align the request for pcache + * lookups). pcache performance is improved with bigger shadow lists + * as it reduces the time to pcache the entire big segment and reduces + * pcache chain length. + */ + if (seg_pinsert_check(seg, pamp, paddr, + lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) { + addr = lpgaddr; + len = lpgeaddr - lpgaddr; + use_pcache = 1; + } else { + use_pcache = 0; + /* + * Since this entry will not be inserted into the pcache, we + * will not do any adjustments to the starting address or + * size of the memory to be locked. + */ + adjustpages = 0; + } + npages = btop(len); + + pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP); pl = pplist; *ppp = pplist + adjustpages; + /* + * If use_pcache is 0 this shadow list is not large page adjusted. + * Record this info in the last entry of shadow array so that + * L_PAGEUNLOCK can determine if it should large page adjust the + * address range to find the real range that was locked. + */ + pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST; page = seg_page(seg, addr); anon_index = svd->anon_index + page; + anlock = 0; ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + ASSERT(amp->a_szc >= seg->s_szc); + anpgcnt = page_get_pagecnt(amp->a_szc); for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { struct anon *ap; struct vnode *vp; u_offset_t off; - anon_sync_obj_t cookie; - - anon_array_enter(amp, anon_index, &cookie); + + /* + * Lock and unlock anon array only once per large page. + * anon_array_enter() locks the root anon slot according to + * a_szc which can't change while anon map is locked. We lock + * anon the first time through this loop and each time we + * reach anon index that corresponds to a root of a large + * page. + */ + if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) { + ASSERT(anlock == 0); + anon_array_enter(amp, anon_index, &cookie); + anlock = 1; + } ap = anon_get_ptr(amp->ahp, anon_index); - if (ap == NULL) { + + /* + * We must never use seg_pcache for COW pages + * because we might end up with original page still + * lying in seg_pcache even after private page is + * created. 
This leads to data corruption as + * aio_write refers to the page still in cache + * while all other accesses refer to the private + * page. + */ + if (ap == NULL || ap->an_refcnt != 1) { + struct vpage *vpage; + + if (seg->s_szc) { + error = EFAULT; + break; + } + if (svd->vpage != NULL) { + vpage = &svd->vpage[seg_page(seg, a)]; + } else { + vpage = NULL; + } + ASSERT(anlock); anon_array_exit(&cookie); - break; - } else { - /* - * We must never use seg_pcache for COW pages - * because we might end up with original page still - * lying in seg_pcache even after private page is - * created. This leads to data corruption as - * aio_write refers to the page still in cache - * while all other accesses refer to the private - * page. - */ - if (ap->an_refcnt != 1) { - anon_array_exit(&cookie); + anlock = 0; + pp = NULL; + error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0, + vpage, &pp, 0, F_INVAL, rw, 1); + if (error) { + error = fc_decode(error); + break; + } + anon_array_enter(amp, anon_index, &cookie); + anlock = 1; + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL || ap->an_refcnt != 1) { + error = EFAULT; break; } } swap_xlate(ap, &vp, &off); - anon_array_exit(&cookie); - pp = page_lookup_nowait(vp, off, SE_SHARED); if (pp == NULL) { + error = EFAULT; break; } - if (seg->s_szc != 0 || pp->p_szc != 0) { - if (!segvn_slock_anonpages(pp, a == addr)) { - page_unlock(pp); - break; - } - } else { - szc0_npages++; + if (ap->an_pvp != NULL) { + anon_swap_free(ap, pp); + } + /* + * Unlock anon if this is the last slot in a large page. + */ + if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) { + ASSERT(anlock); + anon_array_exit(&cookie); + anlock = 0; } *pplist++ = pp; } + if (anlock) { /* Ensure the lock is dropped */ + anon_array_exit(&cookie); + } ANON_LOCK_EXIT(&->a_rwlock); - ASSERT(npages >= szc0_npages); - if (a >= addr + len) { - mutex_enter(&freemem_lock); - if (seg->s_szc == 0 && npages != szc0_npages) { - ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); - availrmem += (npages - szc0_npages); - } - svd->softlockcnt += npages; - segvn_pages_locked += npages; - mutex_exit(&freemem_lock); - (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, - segvn_reclaim); - mutex_exit(&svd->segp_slock); + atomic_add_long((ulong_t *)&svd->softlockcnt, npages); + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + atomic_add_long((ulong_t *)&pamp->a_softlockcnt, + npages); + wlen = len; + } + if (sftlck_sbase) { + atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1); + } + if (sftlck_send) { + atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1); + } + if (use_pcache) { + (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl, + rw, pflags, preclaim_callback); + } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, "segvn_pagelock: cache fill seg %p addr %p", seg, addr); return (0); } - mutex_exit(&svd->segp_slock); - if (seg->s_szc == 0) { - mutex_enter(&freemem_lock); - availrmem += npages; - mutex_exit(&freemem_lock); - } - error = EFAULT; pplist = pl; np = ((uintptr_t)(a - addr)) >> PAGESHIFT; while (np > (uint_t)0) { ASSERT(PAGE_LOCKED(*pplist)); - if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { - segvn_sunlock_anonpages(*pplist, pplist == pl); - } page_unlock(*pplist); np--; pplist++; } - kmem_free(pl, sizeof (page_t *) * npages); + kmem_free(pl, sizeof (page_t *) * (npages + 1)); out: SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); *ppp = NULL; @@ -8942,34 +9174,55 @@ static void segvn_purge(struct seg *seg) { - seg_ppurge(seg); + struct 
segvn_data *svd = (struct segvn_data *)seg->s_data; + + /* + * pcache is only used by pure anon segments. + */ + if (svd->amp == NULL || svd->vp != NULL) { + return; + } + + /* + * For MAP_SHARED segments non 0 segment's softlockcnt means + * active IO is still in progress via this segment. So we only + * purge MAP_SHARED segments when their softlockcnt is 0. + */ + if (svd->type == MAP_PRIVATE) { + if (svd->softlockcnt) { + seg_ppurge(seg, NULL, 0); + } + } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) { + seg_ppurge(seg, svd->amp, 0); + } } +/* + * If async argument is not 0 we are called from pcache async thread and don't + * hold AS lock. + */ + +/*ARGSUSED*/ static int -segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, - enum seg_rw rw) +segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw, int async) { + struct seg *seg = (struct seg *)ptag; struct segvn_data *svd = (struct segvn_data *)seg->s_data; pgcnt_t np, npages; struct page **pl; - pgcnt_t szc0_npages = 0; - -#ifdef lint - addr = addr; -#endif - - npages = np = (len >> PAGESHIFT); + + npages = np = btop(len); ASSERT(npages); - pl = pplist; - if (seg->s_szc != 0) { - size_t pgsz = page_get_pagesize(seg->s_szc); - if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { - panic("segvn_reclaim: unaligned addr or len"); - /*NOTREACHED*/ - } - } ASSERT(svd->vp == NULL && svd->amp != NULL); + ASSERT(svd->softlockcnt >= npages); + ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + pl = pplist; + + ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); + ASSERT(!async || pl[np] == PCACHE_SHWLIST); while (np > (uint_t)0) { if (rw == S_WRITE) { @@ -8977,27 +9230,41 @@ } else { hat_setref(*pplist); } - if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { - segvn_sunlock_anonpages(*pplist, pplist == pl); - } else { - szc0_npages++; - } page_unlock(*pplist); np--; pplist++; } - kmem_free(pl, sizeof (page_t *) * npages); - - mutex_enter(&freemem_lock); - segvn_pages_locked -= npages; - svd->softlockcnt -= npages; - if (szc0_npages != 0) { - availrmem += szc0_npages; - } - mutex_exit(&freemem_lock); - if (svd->softlockcnt <= 0) { - if (AS_ISUNMAPWAIT(seg->s_as)) { + + kmem_free(pl, sizeof (page_t *) * (npages + 1)); + + /* + * If we are pcache async thread we don't hold AS lock. This means if + * softlockcnt drops to 0 after the decrement below address space may + * get freed. We can't allow it since after softlock derement to 0 we + * still need to access as structure for possible wakeup of unmap + * waiters. To prevent the disappearance of as we take this segment + * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to + * make sure this routine completes before segment is freed. + * + * The second complication we have to deal with in async case is a + * possibility of missed wake up of unmap wait thread. When we don't + * hold as lock here we may take a_contents lock before unmap wait + * thread that was first to see softlockcnt was still not 0. As a + * result we'll fail to wake up an unmap wait thread. To avoid this + * race we set nounmapwait flag in as structure if we drop softlockcnt + * to 0 when we were called by pcache async thread. unmapwait thread + * will not block if this flag is set. 
+ */ + if (async) { + mutex_enter(&svd->segfree_syncmtx); + } + + if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) { + if (async || AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); + if (async) { + AS_SETNOUNMAPWAIT(seg->s_as); + } if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); @@ -9005,8 +9272,59 @@ mutex_exit(&seg->s_as->a_contents); } } + + if (async) { + mutex_exit(&svd->segfree_syncmtx); + } return (0); } + +/*ARGSUSED*/ +static int +shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw, int async) +{ + amp_t *amp = (amp_t *)ptag; + pgcnt_t np, npages; + struct page **pl; + + npages = np = btop(len); + ASSERT(npages); + ASSERT(amp->a_softlockcnt >= npages); + + pl = pplist; + + ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); + ASSERT(!async || pl[np] == PCACHE_SHWLIST); + + while (np > (uint_t)0) { + if (rw == S_WRITE) { + hat_setrefmod(*pplist); + } else { + hat_setref(*pplist); + } + page_unlock(*pplist); + np--; + pplist++; + } + + kmem_free(pl, sizeof (page_t *) * (npages + 1)); + + /* + * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt + * drops to 0. anon map can't be freed until a_softlockcnt drops to 0 + * and anonmap_purge() acquires a_purgemtx. + */ + mutex_enter(&->a_purgemtx); + if (!atomic_add_long_nv((ulong_t *)&->a_softlockcnt, -npages) && + amp->a_purgewait) { + amp->a_purgewait = 0; + cv_broadcast(&->a_purgecv); + } + mutex_exit(&->a_purgemtx); + return (0); +} + /* * get a memory ID for an addr in a given segment *
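A minimal user-space sketch of the reclaim-side handshake used by
segvn_reclaim()/shamp_reclaim() above: the lock count is dropped
atomically, and only the thread whose decrement reaches zero takes the
mutex and broadcasts to any thread waiting for all locked pages to be
released.  This illustrates the pattern under pthreads with hypothetical
names; it is not the kernel locking itself.

#include <pthread.h>
#include <stdatomic.h>

static atomic_long softlockcnt;
static pthread_mutex_t contents_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unmap_cv = PTHREAD_COND_INITIALIZER;
static int unmap_waiting;

/* Called once per released shadow list; npages pages were just unlocked. */
static void
reclaim_release(long npages)
{
	if (atomic_fetch_sub(&softlockcnt, npages) == npages) {
		pthread_mutex_lock(&contents_mtx);
		if (unmap_waiting) {
			unmap_waiting = 0;
			pthread_cond_broadcast(&unmap_cv);
		}
		pthread_mutex_unlock(&contents_mtx);
	}
}

/* A caller that wants to unmap blocks here until the count drains to 0. */
static void
wait_for_unlock(void)
{
	pthread_mutex_lock(&contents_mtx);
	while (atomic_load(&softlockcnt) != 0) {
		unmap_waiting = 1;
		pthread_cond_wait(&unmap_cv, &contents_mtx);
	}
	pthread_mutex_unlock(&contents_mtx);
}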
--- a/usr/src/uts/common/vm/seg_vn.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg_vn.h Thu May 22 22:23:49 2008 -0700 @@ -86,7 +86,7 @@ */ typedef struct segvn_data { krwlock_t lock; /* protect segvn_data and vpage array */ - kmutex_t segp_slock; /* serialize insertions into seg_pcache */ + kmutex_t segfree_syncmtx; /* barrier lock for segvn_free() */ uchar_t pageprot; /* true if per page protections present */ uchar_t prot; /* current segment prot if pageprot == 0 */ uchar_t maxprot; /* maximum segment protections */ @@ -101,7 +101,7 @@ uchar_t advice; /* madvise flags for segment */ uchar_t pageadvice; /* true if per page advice set */ ushort_t flags; /* flags - from sys/mman.h */ - ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */ + spgcnt_t softlockcnt; /* # of pages SOFTLOCKED in seg */ lgrp_mem_policy_info_t policy_info; /* memory allocation policy */ hat_region_cookie_t rcookie; /* region for hat calls */ lgrp_mem_policy_info_t tr_policy_info; /* memory allocation for TR */ @@ -110,6 +110,8 @@ struct segvn_data *svn_trprev; /* textrepl list prev link */ int tr_state; /* TR (text replication) state */ uchar_t pageswap; /* true if per page swap accounting is set */ + spgcnt_t softlockcnt_sbase; /* # of softlocks for seg start addr */ + spgcnt_t softlockcnt_send; /* # of softlocks for seg end addr */ } segvn_data_t; #ifdef _KERNEL
--- a/usr/src/uts/common/vm/vm_anon.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_anon.c Thu May 22 22:23:49 2008 -0700 @@ -106,6 +106,7 @@ #include <sys/sysmacros.h> #include <sys/bitmap.h> #include <sys/vmsystm.h> +#include <sys/tuneable.h> #include <sys/debug.h> #include <sys/fs/swapnode.h> #include <sys/tnf_probe.h> @@ -156,7 +157,6 @@ } anonvmstats; #endif /* VM_STATS */ - /*ARGSUSED*/ static int anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) @@ -164,6 +164,9 @@ struct anon_map *amp = buf; rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); + cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); + mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -174,6 +177,9 @@ struct anon_map *amp = buf; rw_destroy(&->a_rwlock); + cv_destroy(&->a_purgecv); + mutex_destroy(&->a_pmtx); + mutex_destroy(&->a_purgemtx); } kmutex_t anonhash_lock[AH_LOCK_SIZE]; @@ -870,6 +876,7 @@ mutex_enter(&anoninfo_lock); ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + /* * If some of this reservation belonged to swapfs * give it back to availrmem. @@ -944,6 +951,48 @@ } /* + * Called for pages locked in memory via softlock/pagelock/mlock to make sure + * such pages don't consume any physical swap resources needed for swapping + * unlocked pages. + */ +void +anon_swap_free(struct anon *ap, page_t *pp) +{ + kmutex_t *ahm; + + ASSERT(ap != NULL); + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(pp->p_vnode != NULL); + ASSERT(IS_SWAPFSVP(pp->p_vnode)); + ASSERT(ap->an_refcnt != 0); + ASSERT(pp->p_vnode == ap->an_vp); + ASSERT(pp->p_offset == ap->an_off); + + if (ap->an_pvp == NULL) + return; + + page_io_lock(pp); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + + ASSERT(ap->an_refcnt != 0); + ASSERT(pp->p_vnode == ap->an_vp); + ASSERT(pp->p_offset == ap->an_off); + + if (ap->an_pvp != NULL) { + swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + mutex_exit(ahm); + hat_setmod(pp); + } else { + mutex_exit(ahm); + } + page_io_unlock(pp); +} + +/* * Decrement the reference count of an anon page. * If reference count goes to zero, free it and * its associated page (if any). @@ -3154,7 +3203,7 @@ ulong_t sidx_aligned; ulong_t eidx_aligned; - ASSERT(RW_WRITE_HELD(&->a_rwlock)); + ASSERT(ANON_WRITE_HELD(&->a_rwlock)); ASSERT(amp->refcnt <= 1); ASSERT(amp->a_szc > 0); ASSERT(eidx <= ahp->size); @@ -3205,6 +3254,53 @@ } /* + * This routine should be called with amp's writer lock when there're no other + * users of amp. All pcache entries of this amp must have been already + * inactivated. We must not drop a_rwlock here to prevent new users from + * attaching to this amp. + */ +void +anonmap_purge(struct anon_map *amp) +{ + ASSERT(ANON_WRITE_HELD(&->a_rwlock)); + ASSERT(amp->refcnt <= 1); + + if (amp->a_softlockcnt != 0) { + seg_ppurge(NULL, amp, 0); + } + + /* + * Since all pcache entries were already inactive before this routine + * was called seg_ppurge() couldn't return while there're still + * entries that can be found via the list anchored at a_phead. So we + * can assert this list is empty now. a_softlockcnt may be still non 0 + * if asynchronous thread that manages pcache already removed pcache + * entries but hasn't unlocked the pages yet. If a_softlockcnt is non + * 0 we just wait on a_purgecv for shamp_reclaim() to finish. 
Even if + * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map + * before shamp_reclaim() is done with it. a_purgemtx also taken by + * shamp_reclaim() while a_softlockcnt was still not 0 acts as a + * barrier that prevents anonmap_purge() to complete while + * shamp_reclaim() may still be referencing this amp. + */ + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); + + mutex_enter(&->a_purgemtx); + while (amp->a_softlockcnt != 0) { + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); + amp->a_purgewait = 1; + cv_wait(&->a_purgecv, &->a_purgemtx); + } + mutex_exit(&->a_purgemtx); + + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); + ASSERT(amp->a_softlockcnt == 0); +} + +/* * Allocate and initialize an anon_map structure for seg * associating the given swap reservation with the new anon_map. */ @@ -3232,14 +3328,22 @@ amp->locality = 0; amp->a_szc = 0; amp->a_sp = NULL; + amp->a_softlockcnt = 0; + amp->a_purgewait = 0; + amp->a_phead.p_lnext = &->a_phead; + amp->a_phead.p_lprev = &->a_phead; + return (amp); } void anonmap_free(struct anon_map *amp) { - ASSERT(amp->ahp); + ASSERT(amp->ahp != NULL); ASSERT(amp->refcnt == 0); + ASSERT(amp->a_softlockcnt == 0); + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); lgrp_shm_policy_fini(amp, NULL); anon_release(amp->ahp, btopr(amp->size));
--- a/usr/src/uts/common/vm/vm_as.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_as.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -715,12 +715,13 @@ int err; next = AS_SEGNEXT(as, seg); +retry: err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); if (err == EAGAIN) { mutex_enter(&as->a_contents); if (as->a_callbacks) { AS_LOCK_EXIT(as, &as->a_lock); - } else { + } else if (!AS_ISNOUNMAPWAIT(as)) { /* * Memory is currently locked. Wait for a * cv_signal that it has been unlocked, then @@ -732,6 +733,20 @@ AS_LOCK_EXIT(as, &as->a_lock); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto top; @@ -1193,6 +1208,7 @@ ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; +retry: error = SEGOP_SETPROT(seg, raddr, ssize, prot); if (error == IE_NOMEM) { @@ -1254,13 +1270,27 @@ seg->s_base, seg->s_size))) { AS_LOCK_EXIT(as, &as->a_lock); as_execute_callback(as, cb, AS_SETPROT_EVENT); - } else { + } else if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as, &as->a_lock); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto setprot_top; @@ -1385,6 +1415,7 @@ */ seg_next = AS_SEGNEXT(as, seg); +retry: err = SEGOP_UNMAP(seg, raddr, ssize); if (err == EAGAIN) { /* @@ -1419,25 +1450,37 @@ * either there were no callbacks for this event * or they were already in progress. */ - as_setwatch(as); mutex_enter(&as->a_contents); if (as->a_callbacks && (cb = as_find_callback(as, AS_UNMAP_EVENT, seg->s_base, seg->s_size))) { AS_LOCK_EXIT(as, &as->a_lock); as_execute_callback(as, cb, AS_UNMAP_EVENT); - } else { + } else if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as, &as->a_lock); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. 
+ */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto top; } else if (err == IE_RETRY) { - as_setwatch(as); AS_LOCK_EXIT(as, &as->a_lock); goto top; } else if (err) { @@ -2539,6 +2582,167 @@ } /* + * Pagelock pages from a range that spans more than 1 segment. Obtain shadow + * lists from each segment and copy them to one contiguous shadow list (plist) + * as expected by the caller. Save pointers to per segment shadow lists at + * the tail of plist so that they can be used during as_pageunlock(). + */ +static int +as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, + caddr_t addr, size_t size, enum seg_rw rw) +{ + caddr_t sv_addr = addr; + size_t sv_size = size; + struct seg *sv_seg = seg; + ulong_t segcnt = 1; + ulong_t cnt; + size_t ssize; + pgcnt_t npages = btop(size); + page_t **plist; + page_t **pl; + int error; + caddr_t eaddr; + faultcode_t fault_err = 0; + pgcnt_t pl_off; + extern struct seg_ops segspt_shmops; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + ASSERT(seg != NULL); + ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); + ASSERT(addr + size > seg->s_base + seg->s_size); + ASSERT(IS_P2ALIGNED(size, PAGESIZE)); + ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); + + /* + * Count the number of segments covered by the range we are about to + * lock. The segment count is used to size the shadow list we return + * back to the caller. + */ + for (; size != 0; size -= ssize, addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || addr != seg->s_base) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EFAULT); + } + /* + * Do a quick check if subsequent segments + * will most likely support pagelock. + */ + if (seg->s_ops == &segvn_ops) { + vnode_t *vp; + + if (SEGOP_GETVP(seg, addr, &vp) != 0 || + vp != NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + goto slow; + } + } else if (seg->s_ops != &segspt_shmops) { + AS_LOCK_EXIT(as, &as->a_lock); + goto slow; + } + segcnt++; + } + if (addr + size > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = size; + } + } + ASSERT(segcnt > 1); + + plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); + + addr = sv_addr; + size = sv_size; + seg = sv_seg; + + for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL && addr == seg->s_base); + cnt++; + ASSERT(cnt < segcnt); + } + if (addr + size > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = size; + } + pl = &plist[npages + cnt]; + error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, + L_PAGELOCK, rw); + if (error) { + break; + } + ASSERT(plist[npages + cnt] != NULL); + ASSERT(pl_off + btop(ssize) <= npages); + bcopy(plist[npages + cnt], &plist[pl_off], + btop(ssize) * sizeof (page_t *)); + pl_off += btop(ssize); + } + + if (size == 0) { + AS_LOCK_EXIT(as, &as->a_lock); + ASSERT(cnt == segcnt - 1); + *ppp = plist; + return (0); + } + + /* + * one of pagelock calls failed. The error type is in error variable. + * Unlock what we've locked so far and retry with F_SOFTLOCK if error + * type is either EFAULT or ENOTSUP. Otherwise just return the error + * back to the caller. 
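/*
 * Sketch of the shadow-list layout built by as_pagelock_segs() above
 * (user-space, hypothetical types): one allocation holds the npages page
 * pointers the caller sees, followed by one slot per covered segment
 * that remembers where that segment's own shadow list starts, so the
 * matching unlock can hand each piece back to the right segment driver.
 *
 *   plist[0 .. npages-1]               flat list returned to the caller
 *   plist[npages .. npages+segcnt-1]   per-segment shadow list pointers
 */
#include <stdlib.h>
#include <string.h>

typedef struct page page_t;

static page_t **
plist_alloc(size_t npages, size_t segcnt)
{
	return (calloc(npages + segcnt, sizeof (page_t *)));
}

/* Record segment cnt's shadow list and copy its pages into the flat part. */
static void
plist_fill(page_t **plist, size_t npages, size_t cnt, size_t pl_off,
    page_t **seg_pl, size_t seg_npages)
{
	plist[npages + cnt] = (page_t *)seg_pl;
	memcpy(&plist[pl_off], seg_pl, seg_npages * sizeof (page_t *));
}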
+ */ + + eaddr = addr; + seg = sv_seg; + + for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL && addr == seg->s_base); + cnt++; + ASSERT(cnt < segcnt); + } + if (eaddr > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = eaddr - addr; + } + pl = &plist[npages + cnt]; + ASSERT(*pl != NULL); + (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, + L_PAGEUNLOCK, rw); + } + + AS_LOCK_EXIT(as, &as->a_lock); + + kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); + + if (error != ENOTSUP && error != EFAULT) { + return (error); + } + +slow: + /* + * If we are here because pagelock failed due to the need to cow fault + * in the pages we want to lock F_SOFTLOCK will do this job and in + * next as_pagelock() call for this address range pagelock will + * hopefully succeed. + */ + fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); + if (fault_err != 0) { + return (fc_decode(fault_err)); + } + *ppp = NULL; + + return (0); +} + +/* * lock pages in a given address space. Return shadow list. If * the list is NULL, the MMU mapping is also locked. */ @@ -2547,12 +2751,10 @@ size_t size, enum seg_rw rw) { size_t rsize; - caddr_t base; caddr_t raddr; faultcode_t fault_err; struct seg *seg; - int res; - int prefaulted = 0; + int err; TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, "as_pagelock_start: addr %p size %ld", addr, size); @@ -2560,17 +2762,25 @@ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; -top: + /* * if the request crosses two segments let * as_fault handle it. */ AS_LOCK_ENTER(as, &as->a_lock, RW_READER); - seg = as_findseg(as, addr, 0); - if ((seg == NULL) || ((base = seg->s_base) > addr) || - (addr + size) > base + seg->s_size) { + + seg = as_segat(as, raddr); + if (seg == NULL) { AS_LOCK_EXIT(as, &as->a_lock); - goto slow; + return (EFAULT); + } + ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); + if (raddr + rsize > seg->s_base + seg->s_size) { + return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); + } + if (raddr + rsize <= raddr) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EFAULT); } TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, @@ -2579,46 +2789,22 @@ /* * try to lock pages and pass back shadow list */ - res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); + err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); + AS_LOCK_EXIT(as, &as->a_lock); - if (res == 0) { - return (0); - } else if (res == ENOTSUP || prefaulted) { - /* - * (1) segment driver doesn't support PAGELOCK fastpath, or - * (2) we've already tried fast path unsuccessfully after - * faulting in the addr range below; system might be - * thrashing or there may not be enough availrmem. - */ - goto slow; + + if (err == 0 || (err != ENOTSUP && err != EFAULT)) { + return (err); } - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START, - "as_fault_start: addr %p size %ld", addr, size); - /* - * we might get here because of some COW fault or non - * existing page. Let as_fault deal with it. Just load - * the page, don't lock the MMU mapping. 
- */ - fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw); - if (fault_err != 0) { - return (fc_decode(fault_err)); - } - - prefaulted = 1; - - /* - * try fast path again; since we've dropped a_lock, - * we need to try the dance from the start to see if - * the addr range is still valid. - */ - goto top; -slow: - /* - * load the page and lock the MMU mapping. + * Use F_SOFTLOCK to lock the pages because pagelock failed either due + * to no pagelock support for this segment or pages need to be cow + * faulted in. If fault is needed F_SOFTLOCK will do this job for + * this as_pagelock() call and in the next as_pagelock() call for the + * same address range pagelock call will hopefull succeed. */ fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); if (fault_err != 0) { @@ -2631,6 +2817,52 @@ } /* + * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow + * lists from the end of plist and call pageunlock interface for each segment. + * Drop as lock and free plist. + */ +static void +as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, + struct page **plist, enum seg_rw rw) +{ + ulong_t cnt; + caddr_t eaddr = addr + size; + pgcnt_t npages = btop(size); + size_t ssize; + page_t **pl; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + ASSERT(seg != NULL); + ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); + ASSERT(addr + size > seg->s_base + seg->s_size); + ASSERT(IS_P2ALIGNED(size, PAGESIZE)); + ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); + ASSERT(plist != NULL); + + for (cnt = 0; addr < eaddr; addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL && addr == seg->s_base); + cnt++; + } + if (eaddr > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = eaddr - addr; + } + pl = &plist[npages + cnt]; + ASSERT(*pl != NULL); + (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, + L_PAGEUNLOCK, rw); + } + ASSERT(cnt > 0); + AS_LOCK_EXIT(as, &as->a_lock); + + cnt++; + kmem_free(plist, (npages + cnt) * sizeof (page_t *)); +} + +/* * unlock pages in a given address range */ void @@ -2652,44 +2884,29 @@ (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); return; } - raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); - rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - - (size_t)raddr; - AS_LOCK_ENTER(as, &as->a_lock, RW_READER); - seg = as_findseg(as, addr, 0); - ASSERT(seg); - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, - "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); - SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); - AS_LOCK_EXIT(as, &as->a_lock); - TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); -} - -/* - * reclaim cached pages in a given address range - */ -void -as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, - size_t size, enum seg_rw rw) -{ - struct seg *seg; - size_t rsize; - caddr_t raddr; - - ASSERT(AS_READ_HELD(as, &as->a_lock)); - ASSERT(pp != NULL); raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; - seg = as_findseg(as, addr, 0); - ASSERT(seg); - SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, raddr); + ASSERT(seg != NULL); + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, + "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); + + ASSERT(raddr >= seg->s_base && raddr < 
seg->s_base + seg->s_size); + if (raddr + rsize <= seg->s_base + seg->s_size) { + SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); + } else { + as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); + return; + } + AS_LOCK_EXIT(as, &as->a_lock); + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); } -#define MAXPAGEFLIP 4 -#define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE - int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, boolean_t wait) @@ -2735,6 +2952,7 @@ ssize = rsize; } +retry: error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); if (error == IE_NOMEM) { @@ -2778,13 +2996,29 @@ * as_unmap, as_setprot or as_free would do. */ mutex_enter(&as->a_contents); - if (AS_ISUNMAPWAIT(as) == 0) { - cv_broadcast(&as->a_cv); - } - AS_SETUNMAPWAIT(as); - AS_LOCK_EXIT(as, &as->a_lock); - while (AS_ISUNMAPWAIT(as)) { - cv_wait(&as->a_cv, &as->a_contents); + if (!AS_ISNOUNMAPWAIT(as)) { + if (AS_ISUNMAPWAIT(as) == 0) { + cv_broadcast(&as->a_cv); + } + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) { + cv_wait(&as->a_cv, &as->a_contents); + } + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto setpgsz_top; @@ -2809,6 +3043,8 @@ size_t ssize; int error; + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + seg = as_segat(as, raddr); if (seg == NULL) { panic("as_iset3_default_lpsize: no seg"); @@ -2864,6 +3100,8 @@ int error; int retry; + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (;;) { error = as_iset3_default_lpsize(as, addr, size, szc, &retry); if (error == EINVAL && retry) { @@ -3150,16 +3388,30 @@ error = EINVAL; } else if (error == EAGAIN) { mutex_enter(&as->a_contents); - if (AS_ISUNMAPWAIT(as) == 0) { - cv_broadcast(&as->a_cv); + if (!AS_ISNOUNMAPWAIT(as)) { + if (AS_ISUNMAPWAIT(as) == 0) { + cv_broadcast(&as->a_cv); + } + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) { + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this case + * clean nounmapwait flag and retry since softlockcnt + * in this segment may be already 0. We don't drop as + * writer lock so our number of retries without + * sleeping should be very small. See segvn_reclaim() + * for more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); } - AS_SETUNMAPWAIT(as); - AS_LOCK_EXIT(as, &as->a_lock); - while (AS_ISUNMAPWAIT(as)) { - cv_wait(&as->a_cv, &as->a_contents); - } - mutex_exit(&as->a_contents); - AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); goto again; }
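The EAGAIN handling that this change threads through as_free(),
as_setprot(), as_unmap() and as_setpagesize() follows one pattern, shown
here as a stand-alone sketch with hypothetical names: if the segment
reports EAGAIN while the address space has the no-unmapwait flag set, the
caller clears the flag and retries instead of sleeping, because a racing
reclaim may already have dropped the softlock count to zero.

#include <errno.h>
#include <stdbool.h>

struct as_state {
	bool	nounmapwait;	/* set by async reclaim, see segvn_reclaim() */
};

static int
seg_op(void)
{
	return (0);		/* stand-in for SEGOP_UNMAP() and friends */
}

static void
wait_for_unmapwait(void)
{
	/* stand-in for the cv_wait() loop on a_cv */
}

static int
retry_seg_op(struct as_state *as)
{
	int err;

	for (;;) {
		err = seg_op();
		if (err != EAGAIN)
			return (err);
		if (as->nounmapwait) {
			/* reclaim raced with us; clear the flag and retry */
			as->nounmapwait = false;
			continue;
		}
		wait_for_unmapwait();
	}
}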
--- a/usr/src/uts/common/vm/vm_page.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_page.c Thu May 22 22:23:49 2008 -0700 @@ -106,9 +106,6 @@ * These new counters will track the pages locked through segvn and * by explicit user locking. * - * segvn_pages_locked : This keeps track on a global basis how many pages - * are currently locked because of I/O. - * * pages_locked : How many pages are locked because of user specified * locking through mlock or plock. * @@ -117,10 +114,9 @@ * * All these globals are protected by the same lock which protects availrmem. */ -pgcnt_t segvn_pages_locked; -pgcnt_t pages_locked; -pgcnt_t pages_useclaim; -pgcnt_t pages_claimed; +pgcnt_t pages_locked = 0; +pgcnt_t pages_useclaim = 0; +pgcnt_t pages_claimed = 0; /* @@ -5878,7 +5874,6 @@ deficit = tune.t_minarmem + npages + epages - availrmem; mutex_exit(&freemem_lock); page_needfree(deficit); - seg_preap(); kmem_reap(); delay(hz); page_needfree(-(spgcnt_t)deficit); @@ -6285,7 +6280,7 @@ static kmutex_t pc_thread_mutex; static clock_t pc_thread_shortwait; static clock_t pc_thread_longwait; -static int pc_thread_ism_retry; +static int pc_thread_retry; struct page_capture_callback pc_cb[PC_NUM_CALLBACKS]; @@ -6782,17 +6777,13 @@ ASSERT(pp != NULL); - /* only physmem currently has restrictions */ - if (!(flags & CAPTURE_PHYSMEM)) { - return (0); - } - #if defined(__sparc) if (pp->p_vnode == &prom_ppages) { return (EPERM); } - if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE)) { + if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) && + (flags & CAPTURE_PHYSMEM)) { return (ENOENT); } @@ -6805,6 +6796,11 @@ } #endif /* __sparc */ + /* only physmem currently has the restrictions checked below */ + if (!(flags & CAPTURE_PHYSMEM)) { + return (0); + } + if (availrmem < swapfs_minfree) { /* * We won't try to capture this page as we are @@ -7187,7 +7183,7 @@ pc_thread_shortwait = 23 * hz; pc_thread_longwait = 1201 * hz; - pc_thread_ism_retry = 3; + pc_thread_retry = 3; mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL); cv_init(&pc_cv, NULL, CV_DEFAULT, NULL); pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0, @@ -7358,7 +7354,6 @@ static void page_capture_handle_outstanding(void) { - extern size_t spt_used; int ntry; if (!page_retire_pend_count()) { @@ -7380,34 +7375,23 @@ * we reap prior to attempting to capture. */ kmem_reap(); - /* - * When ISM is in use, we need to disable and - * purge the seg_pcache, and initiate aio - * cleanup in order to release page locks and - * subsquently retire pages in need of - * retirement. - */ - if (spt_used) { - /* disable and purge seg_pcache */ - (void) seg_p_disable(); - for (ntry = 0; ntry < pc_thread_ism_retry; ntry++) { - if (!page_retire_pend_count()) - break; - if (do_aio_cleanup()) { - /* - * allow the apps cleanup threads - * to run - */ - delay(pc_thread_shortwait); - } - page_capture_async(); + + /* disable and purge seg_pcache */ + (void) seg_p_disable(); + for (ntry = 0; ntry < pc_thread_retry; ntry++) { + if (!page_retire_pend_count()) + break; + if (do_aio_cleanup()) { + /* + * allow the apps cleanup threads + * to run + */ + delay(pc_thread_shortwait); } - /* reenable seg_pcache */ - seg_p_enable(); - } else { - seg_preap(); page_capture_async(); } + /* reenable seg_pcache */ + seg_p_enable(); } }
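After this change the page capture thread always disables and purges the
segment pcache before retrying, instead of special-casing ISM.  The
resulting control flow reduces to the loop sketched below in user-space C;
the helpers are hypothetical stand-ins for page_retire_pend_count(),
do_aio_cleanup(), page_capture_async() and seg_p_disable()/seg_p_enable().

#include <stdbool.h>

static int  retire_pending(void)  { return (0); }
static bool aio_cleanup(void)     { return (false); }
static void capture_async(void)   { }
static void pcache_disable(void)  { }
static void pcache_enable(void)   { }
static void short_delay(void)     { }

static void
handle_outstanding_retire(int max_retries)
{
	int ntry;

	pcache_disable();	/* drop cached shadow lists that pin pages */
	for (ntry = 0; ntry < max_retries; ntry++) {
		if (retire_pending() == 0)
			break;
		if (aio_cleanup())
			short_delay();	/* give app cleanup threads a chance */
		capture_async();
	}
	pcache_enable();
}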
--- a/usr/src/uts/common/vm/vm_seg.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_seg.c Thu May 22 22:23:49 2008 -0700 @@ -48,8 +48,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> +#include <sys/sysmacros.h> #include <sys/vmsystm.h> +#include <sys/tuneable.h> #include <sys/debug.h> +#include <sys/fs/swapnode.h> #include <sys/cmn_err.h> #include <sys/callb.h> #include <sys/mem_config.h> @@ -61,6 +64,8 @@ #include <vm/seg_kmem.h> #include <vm/seg_spt.h> #include <vm/seg_vn.h> +#include <vm/anon.h> + /* * kstats for segment advise */ @@ -72,472 +77,1188 @@ kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); -/* #define PDEBUG */ -#if defined(PDEBUG) || defined(lint) || defined(__lint) -int pdebug = 0; -#else -#define pdebug 0 -#endif /* PDEBUG */ - -#define PPRINTF if (pdebug) printf -#define PPRINT(x) PPRINTF(x) -#define PPRINT1(x, a) PPRINTF(x, a) -#define PPRINT2(x, a, b) PPRINTF(x, a, b) -#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c) -#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d) -#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e) - -#define P_HASHMASK (p_hashsize - 1) -#define P_BASESHIFT 6 - /* * entry in the segment page cache */ struct seg_pcache { - struct seg_pcache *p_hnext; /* list for hashed blocks */ - struct seg_pcache *p_hprev; - int p_active; /* active count */ - int p_ref; /* ref bit */ - size_t p_len; /* segment length */ - caddr_t p_addr; /* base address */ - struct seg *p_seg; /* segment */ - struct page **p_pp; /* pp shadow list */ - enum seg_rw p_rw; /* rw */ - uint_t p_flags; /* bit flags */ - int (*p_callback)(struct seg *, caddr_t, size_t, - struct page **, enum seg_rw); + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + pcache_link_t p_plink; /* per segment/amp list */ + void *p_htag0; /* segment/amp pointer */ + caddr_t p_addr; /* base address/anon_idx */ + size_t p_len; /* total bytes */ + size_t p_wlen; /* writtable bytes at p_addr */ + struct page **p_pp; /* pp shadow list */ + seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */ + clock_t p_lbolt; /* lbolt from last use */ + struct seg_phash *p_hashp; /* our pcache hash bucket */ + uint_t p_active; /* active count */ + uchar_t p_write; /* true if S_WRITE */ + uchar_t p_ref; /* reference byte */ + ushort_t p_flags; /* bit flags */ }; struct seg_phash { - struct seg_pcache *p_hnext; /* list for hashed blocks */ - struct seg_pcache *p_hprev; - int p_qlen; /* Q length */ - kmutex_t p_hmutex; /* protects hash bucket */ + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + kmutex_t p_hmutex; /* protects hash bucket */ + pcache_link_t p_halink[2]; /* active bucket linkages */ +}; + +struct seg_phash_wired { + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + kmutex_t p_hmutex; /* protects hash bucket */ }; -static int seg_preap_time = 20; /* reclaim every 20 secs */ -static int seg_pmaxqlen = 5; /* max Q length in hash list */ -static int seg_ppcount = 5; /* max # of purges per reclaim interval */ -static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */ -static pgcnt_t seg_pwindow; /* max # of pages that can be cached */ -static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */ -static pgcnt_t seg_plocked_window; /* # pages from window */ -int seg_preapahead; +/* + * A parameter to control a maximum number of bytes 
that can be + * purged from pcache at a time. + */ +#define P_MAX_APURGE_BYTES (1024 * 1024 * 1024) + +/* + * log2(fraction of pcache to reclaim at a time). + */ +#define P_SHRINK_SHFT (5) + +/* + * The following variables can be tuned via /etc/system. + */ + +int segpcache_enabled = 1; /* if 1, shadow lists are cached */ +pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */ +ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */ +ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */ +int segpcache_reap_sec = 1; /* reap check rate in secs */ +clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */ +int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */ +clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */ +int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */ +pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */ -static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */ +static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */ +static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */ +static kcondvar_t seg_pasync_cv; + +#pragma align 64(pctrl1) +#pragma align 64(pctrl2) +#pragma align 64(pctrl3) -static int seg_pupdate_active = 1; /* background reclaim thread */ -static clock_t seg_preap_interval; /* reap interval in ticks */ +/* + * Keep frequently used variables together in one cache line. + */ +static struct p_ctrl1 { + uint_t p_disabled; /* if not 0, caching temporarily off */ + pgcnt_t p_maxwin; /* max # of pages that can be cached */ + size_t p_hashwin_sz; /* # of non wired buckets */ + struct seg_phash *p_htabwin; /* hash table for non wired entries */ + size_t p_hashwired_sz; /* # of wired buckets */ + struct seg_phash_wired *p_htabwired; /* hash table for wired entries */ + kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */ +#ifdef _LP64 + ulong_t pad[1]; +#endif /* _LP64 */ +} pctrl1; + +static struct p_ctrl2 { + kmutex_t p_mem_mtx; /* protects window counter and p_halinks */ + pgcnt_t p_locked_win; /* # pages from window */ + pgcnt_t p_locked; /* # of pages cached by pagelock */ + uchar_t p_ahcur; /* current active links for insert/delete */ + uchar_t p_athr_on; /* async reclaim thread is running. 
*/ + pcache_link_t p_ahhead[2]; /* active buckets linkages */ +} pctrl2; -static kmutex_t seg_pcache; /* protects the whole pagelock cache */ -static kmutex_t seg_pmem; /* protects window counter */ -static ksema_t seg_pasync_sem; /* sema for reclaim thread */ -static struct seg_phash *p_hashtab; -static int p_hashsize = 0; +static struct p_ctrl3 { + clock_t p_pcp_maxage; /* max pcp age in ticks */ + ulong_t p_athr_empty_ahb; /* athread walk stats */ + ulong_t p_athr_full_ahb; /* athread walk stats */ + pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */ + int p_shrink_shft; /* reap shift factor */ +#ifdef _LP64 + ulong_t pad[3]; +#endif /* _LP64 */ +} pctrl3; -#define p_hash(seg) \ - (P_HASHMASK & \ - ((uintptr_t)(seg) >> P_BASESHIFT)) +#define seg_pdisabled pctrl1.p_disabled +#define seg_pmaxwindow pctrl1.p_maxwin +#define seg_phashsize_win pctrl1.p_hashwin_sz +#define seg_phashtab_win pctrl1.p_htabwin +#define seg_phashsize_wired pctrl1.p_hashwired_sz +#define seg_phashtab_wired pctrl1.p_htabwired +#define seg_pkmcache pctrl1.p_kmcache +#define seg_pmem_mtx pctrl2.p_mem_mtx +#define seg_plocked_window pctrl2.p_locked_win +#define seg_plocked pctrl2.p_locked +#define seg_pahcur pctrl2.p_ahcur +#define seg_pathr_on pctrl2.p_athr_on +#define seg_pahhead pctrl2.p_ahhead +#define seg_pmax_pcpage pctrl3.p_pcp_maxage +#define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb +#define seg_pathr_full_ahb pctrl3.p_athr_full_ahb +#define seg_pshrink_shift pctrl3.p_shrink_shft +#define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages + +#define P_HASHWIN_MASK (seg_phashsize_win - 1) +#define P_HASHWIRED_MASK (seg_phashsize_wired - 1) +#define P_BASESHIFT (6) + +kthread_t *seg_pasync_thr; + +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; -#define p_match(pcp, seg, addr, len, rw) \ - (((pcp)->p_seg == (seg) && \ - (pcp)->p_addr == (addr) && \ - (pcp)->p_rw == (rw) && \ - (pcp)->p_len == (len)) ? 1 : 0) +#define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED) +#define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags) + +#define LBOLT_DELTA(t) ((ulong_t)(lbolt - (t))) + +#define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt) + +/* + * htag0 argument can be a seg or amp pointer. + */ +#define P_HASHBP(seg, htag0, addr, flags) \ + (IS_PFLAGS_WIRED((flags)) ? \ + ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \ + ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \ + (&seg_phashtab_win[P_HASHWIN_MASK & \ + (((uintptr_t)(htag0) >> 3) ^ \ + ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \ + (flags >> 16) : page_get_shift((seg)->s_szc))))])) -#define p_match_pp(pcp, seg, addr, len, pp, rw) \ - (((pcp)->p_seg == (seg) && \ - (pcp)->p_addr == (addr) && \ - (pcp)->p_pp == (pp) && \ - (pcp)->p_rw == (rw) && \ - (pcp)->p_len == (len)) ? 1 : 0) +/* + * htag0 argument can be a seg or amp pointer. + */ +#define P_MATCH(pcp, htag0, addr, len) \ + ((pcp)->p_htag0 == (htag0) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_len >= (len)) +#define P_MATCH_PP(pcp, htag0, addr, len, pp) \ + ((pcp)->p_pp == (pp) && \ + (pcp)->p_htag0 == (htag0) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_len >= (len)) + +#define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \ + offsetof(struct seg_pcache, p_plink))) + +#define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \ + offsetof(struct seg_phash, p_halink[l]))) /* - * lookup an address range in pagelock cache. Return shadow list - * and bump up active count. 
+ * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from + * active hash bucket lists. We maintain active bucket lists to reduce the + * overhead of finding active buckets during asynchronous purging since there + * can be 10s of millions of buckets on a large system but only a small subset + * of them in actual use. + * + * There're 2 active bucket lists. Current active list (as per seg_pahcur) is + * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete + * buckets. The other list is used by asynchronous purge thread. This allows + * the purge thread to walk its active list without holding seg_pmem_mtx for a + * long time. When asynchronous thread is done with its list it switches to + * current active list and makes the list it just finished processing as + * current active list. + * + * seg_padd_abuck() only adds the bucket to current list if the bucket is not + * yet on any list. seg_premove_abuck() may remove the bucket from either + * list. If the bucket is on current list it will be always removed. Otherwise + * the bucket is only removed if asynchronous purge thread is not currently + * running or seg_premove_abuck() is called by asynchronous purge thread + * itself. A given bucket can only be on one of active lists at a time. These + * routines should be called with per bucket lock held. The routines use + * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after + * the first entry is added to the bucket chain and seg_premove_abuck() must + * be called after the last pcp entry is deleted from its chain. Per bucket + * lock should be held by the callers. This avoids a potential race condition + * when seg_premove_abuck() removes a bucket after pcp entries are added to + * its list after the caller checked that the bucket has no entries. (this + * race would cause a loss of an active bucket from the active lists). + * + * Both lists are circular doubly linked lists anchored at seg_pahhead heads. + * New entries are added to the end of the list since LRU is used as the + * purging policy. + */ +static void +seg_padd_abuck(struct seg_phash *hp) +{ + int lix; + + ASSERT(MUTEX_HELD(&hp->p_hmutex)); + ASSERT((struct seg_phash *)hp->p_hnext != hp); + ASSERT((struct seg_phash *)hp->p_hprev != hp); + ASSERT(hp->p_hnext == hp->p_hprev); + ASSERT(!IS_PCP_WIRED(hp->p_hnext)); + ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp); + ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp); + ASSERT(hp >= seg_phashtab_win && + hp < &seg_phashtab_win[seg_phashsize_win]); + + /* + * This bucket can already be on one of active lists + * since seg_premove_abuck() may have failed to remove it + * before. + */ + mutex_enter(&seg_pmem_mtx); + lix = seg_pahcur; + ASSERT(lix >= 0 && lix <= 1); + if (hp->p_halink[lix].p_lnext != NULL) { + ASSERT(hp->p_halink[lix].p_lprev != NULL); + ASSERT(hp->p_halink[!lix].p_lnext == NULL); + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + mutex_exit(&seg_pmem_mtx); + return; + } + ASSERT(hp->p_halink[lix].p_lprev == NULL); + + /* + * If this bucket is still on list !lix async thread can't yet remove + * it since we hold here per bucket lock. In this case just return + * since async thread will eventually find and process this bucket. + */ + if (hp->p_halink[!lix].p_lnext != NULL) { + ASSERT(hp->p_halink[!lix].p_lprev != NULL); + mutex_exit(&seg_pmem_mtx); + return; + } + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + /* + * This bucket is not on any active bucket list yet. 
+ * Add the bucket to the tail of current active list. + */ + hp->p_halink[lix].p_lnext = &seg_pahhead[lix]; + hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev; + seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix]; + seg_pahhead[lix].p_lprev = &hp->p_halink[lix]; + mutex_exit(&seg_pmem_mtx); +} + +static void +seg_premove_abuck(struct seg_phash *hp, int athr) +{ + int lix; + + ASSERT(MUTEX_HELD(&hp->p_hmutex)); + ASSERT((struct seg_phash *)hp->p_hnext == hp); + ASSERT((struct seg_phash *)hp->p_hprev == hp); + ASSERT(hp >= seg_phashtab_win && + hp < &seg_phashtab_win[seg_phashsize_win]); + + if (athr) { + ASSERT(seg_pathr_on); + ASSERT(seg_pahcur <= 1); + /* + * We are called by asynchronous thread that found this bucket + * on not currently active (i.e. !seg_pahcur) list. Remove it + * from there. Per bucket lock we are holding makes sure + * seg_pinsert() can't sneak in and add pcp entries to this + * bucket right before we remove the bucket from its list. + */ + lix = !seg_pahcur; + ASSERT(hp->p_halink[lix].p_lnext != NULL); + ASSERT(hp->p_halink[lix].p_lprev != NULL); + ASSERT(hp->p_halink[!lix].p_lnext == NULL); + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; + hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; + hp->p_halink[lix].p_lnext = NULL; + hp->p_halink[lix].p_lprev = NULL; + return; + } + + mutex_enter(&seg_pmem_mtx); + lix = seg_pahcur; + ASSERT(lix >= 0 && lix <= 1); + + /* + * If the bucket is on currently active list just remove it from + * there. + */ + if (hp->p_halink[lix].p_lnext != NULL) { + ASSERT(hp->p_halink[lix].p_lprev != NULL); + ASSERT(hp->p_halink[!lix].p_lnext == NULL); + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; + hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; + hp->p_halink[lix].p_lnext = NULL; + hp->p_halink[lix].p_lprev = NULL; + mutex_exit(&seg_pmem_mtx); + return; + } + ASSERT(hp->p_halink[lix].p_lprev == NULL); + + /* + * If asynchronous thread is not running we can remove the bucket from + * not currently active list. The bucket must be on this list since we + * already checked that it's not on the other list and the bucket from + * which we just deleted the last pcp entry must be still on one of the + * active bucket lists. + */ + lix = !lix; + ASSERT(hp->p_halink[lix].p_lnext != NULL); + ASSERT(hp->p_halink[lix].p_lprev != NULL); + + if (!seg_pathr_on) { + hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; + hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; + hp->p_halink[lix].p_lnext = NULL; + hp->p_halink[lix].p_lprev = NULL; + } + mutex_exit(&seg_pmem_mtx); +} + +/* + * Check if bucket pointed by hp already has a pcp entry that matches request + * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise. + * Also delete matching entries that cover smaller address range but start + * at the same address as addr argument. Return the list of deleted entries if + * any. This is an internal helper function called from seg_pinsert() only + * for non wired shadow lists. The caller already holds a per seg/amp list + * lock. 
+ */ +static struct seg_pcache * +seg_plookup_checkdup(struct seg_phash *hp, void *htag0, + caddr_t addr, size_t len, int *found) +{ + struct seg_pcache *pcp; + struct seg_pcache *delcallb_list = NULL; + + ASSERT(MUTEX_HELD(&hp->p_hmutex)); + + *found = 0; + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + ASSERT(pcp->p_hashp == hp); + if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) { + ASSERT(!IS_PCP_WIRED(pcp)); + if (pcp->p_len < len) { + pcache_link_t *plinkp; + if (pcp->p_active) { + continue; + } + plinkp = &pcp->p_plink; + plinkp->p_lprev->p_lnext = plinkp->p_lnext; + plinkp->p_lnext->p_lprev = plinkp->p_lprev; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } else { + *found = 1; + break; + } + } + } + return (delcallb_list); +} + +/* + * lookup an address range in pagelock cache. Return shadow list and bump up + * active count. If amp is not NULL use amp as a lookup tag otherwise use seg + * as a lookup tag. */ struct page ** -seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, + enum seg_rw rw, uint_t flags) { struct seg_pcache *pcp; struct seg_phash *hp; + void *htag0; + + ASSERT(seg != NULL); + ASSERT(rw == S_READ || rw == S_WRITE); /* * Skip pagelock cache, while DR is in progress or * seg_pcache is off. */ - if (seg_pdisable || seg_plazy == 0) { + if (seg_pdisabled) { return (NULL); } + ASSERT(seg_phashsize_win != 0); - hp = &p_hashtab[p_hash(seg)]; + htag0 = (amp == NULL ? (void *)seg : (void *)amp); + hp = P_HASHBP(seg, htag0, addr, flags); mutex_enter(&hp->p_hmutex); for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { - if (p_match(pcp, seg, addr, len, rw)) { + ASSERT(pcp->p_hashp == hp); + if (P_MATCH(pcp, htag0, addr, len)) { + ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); + /* + * If this request wants to write pages + * but write permissions starting from + * addr don't cover the entire length len + * return lookup failure back to the caller. + * It will check protections and fail this + * pagelock operation with EACCESS error. + */ + if (rw == S_WRITE && pcp->p_wlen < len) { + break; + } + if (pcp->p_active == UINT_MAX) { + break; + } pcp->p_active++; + if (rw == S_WRITE && !pcp->p_write) { + pcp->p_write = 1; + } mutex_exit(&hp->p_hmutex); - - PPRINT5("seg_plookup hit: seg %p, addr %p, " - "len %lx, count %d, pplist %p \n", - (void *)seg, (void *)addr, len, pcp->p_active, - (void *)pcp->p_pp); - return (pcp->p_pp); } } mutex_exit(&hp->p_hmutex); - - PPRINT("seg_plookup miss:\n"); - return (NULL); } /* - * mark address range inactive. If the cache is off or the address - * range is not in the cache we call the segment driver to reclaim - * the pages. Otherwise just decrement active count and set ref bit. + * mark address range inactive. If the cache is off or the address range is + * not in the cache or another shadow list that covers bigger range is found + * we call the segment driver to reclaim the pages. Otherwise just decrement + * active count and set ref bit. If amp is not NULL use amp as a lookup tag + * otherwise use seg as a lookup tag. 
*/ void -seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp, - enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t, - struct page **, enum seg_rw)) +seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, + size_t len, struct page **pp, enum seg_rw rw, uint_t flags, + seg_preclaim_cbfunc_t callback) { struct seg_pcache *pcp; struct seg_phash *hp; + kmutex_t *pmtx = NULL; + pcache_link_t *pheadp; + void *htag0; + pgcnt_t npages = 0; + int keep = 0; - if (seg_plazy == 0) { - (void) (*callback)(seg, addr, len, pp, rw); - return; + ASSERT(seg != NULL); + ASSERT(rw == S_READ || rw == S_WRITE); + + htag0 = (amp == NULL ? (void *)seg : (void *)amp); + + /* + * Skip lookup if pcache is not configured. + */ + if (seg_phashsize_win == 0) { + goto out; } - hp = &p_hashtab[p_hash(seg)]; + + /* + * Grab per seg/amp lock before hash lock if we are going to remove + * inactive entry from pcache. + */ + if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) { + if (amp == NULL) { + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + } + mutex_enter(pmtx); + } + + hp = P_HASHBP(seg, htag0, addr, flags); mutex_enter(&hp->p_hmutex); +again: for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { - if (p_match_pp(pcp, seg, addr, len, pp, rw)) { + ASSERT(pcp->p_hashp == hp); + if (P_MATCH_PP(pcp, htag0, addr, len, pp)) { + ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_active); + if (keep) { + /* + * Don't remove this pcp entry + * if we didn't find duplicate + * shadow lists on second search. + * Somebody removed those duplicates + * since we dropped hash lock after first + * search. + */ + ASSERT(pmtx != NULL); + ASSERT(!IS_PFLAGS_WIRED(flags)); + mutex_exit(pmtx); + pmtx = NULL; + } pcp->p_active--; - ASSERT(pcp->p_active >= 0); - if (pcp->p_active == 0 && seg_pdisable) { - int npages; + if (pcp->p_active == 0 && (pmtx != NULL || + (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) { + + /* + * This entry is no longer active. Remove it + * now either because pcaching is temporarily + * disabled or there're other pcp entries that + * can match this pagelock request (i.e. this + * entry is a duplicate). + */ ASSERT(callback == pcp->p_callback); - /* free the entry */ - hp->p_qlen--; + if (pmtx != NULL) { + pcache_link_t *plinkp = &pcp->p_plink; + ASSERT(!IS_PCP_WIRED(pcp)); + ASSERT(pheadp->p_lnext != pheadp); + ASSERT(pheadp->p_lprev != pheadp); + plinkp->p_lprev->p_lnext = + plinkp->p_lnext; + plinkp->p_lnext->p_lprev = + plinkp->p_lprev; + } pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; + if (!IS_PCP_WIRED(pcp) && + hp->p_hnext == (struct seg_pcache *)hp) { + /* + * We removed the last entry from this + * bucket. Now remove the bucket from + * its active list. + */ + seg_premove_abuck(hp, 0); + } mutex_exit(&hp->p_hmutex); - npages = pcp->p_len >> PAGESHIFT; - mutex_enter(&seg_pmem); - seg_plocked -= npages; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - seg_plocked_window -= npages; + if (pmtx != NULL) { + mutex_exit(pmtx); + } + len = pcp->p_len; + npages = btop(len); + if (rw != S_WRITE && pcp->p_write) { + rw = S_WRITE; + } + kmem_cache_free(seg_pkmcache, pcp); + goto out; + } else { + /* + * We found a matching pcp entry but will not + * free it right away even if it's no longer + * active.
+ */ + if (!pcp->p_active && !IS_PCP_WIRED(pcp)) { + /* + * Set the reference bit and mark the + * time of last access to this pcp + * so that asynchronous thread doesn't + * free it immediately since + * it may be reactivated very soon. + */ + pcp->p_lbolt = lbolt; + pcp->p_ref = 1; + } + mutex_exit(&hp->p_hmutex); + if (pmtx != NULL) { + mutex_exit(pmtx); } - mutex_exit(&seg_pmem); - kmem_free(pcp, sizeof (struct seg_pcache)); - goto out; + return; + } + } else if (!IS_PFLAGS_WIRED(flags) && + P_MATCH(pcp, htag0, addr, len)) { + /* + * This is a duplicate pcp entry. This situation may + * happen if a bigger shadow list that covers our + * range was added while our entry was still active. + * Now we can free our pcp entry if it becomes + * inactive. + */ + if (!pcp->p_active) { + /* + * Mark this entry as referenced just in case + * we'll free our own pcp entry soon. + */ + pcp->p_lbolt = lbolt; + pcp->p_ref = 1; + } + if (pmtx != NULL) { + /* + * we are already holding pmtx and found a + * duplicate. Don't keep our own pcp entry. + */ + keep = 0; + continue; } - pcp->p_ref = 1; - mutex_exit(&hp->p_hmutex); - return; + /* + * We have to use mutex_tryenter to attempt to lock + * seg/amp list lock since we already hold hash lock + * and seg/amp list lock is above hash lock in lock + * order. If mutex_tryenter fails drop hash lock and + * retake both locks in correct order and research + * this hash chain. + */ + ASSERT(keep == 0); + if (amp == NULL) { + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + } + if (!mutex_tryenter(pmtx)) { + mutex_exit(&hp->p_hmutex); + mutex_enter(pmtx); + mutex_enter(&hp->p_hmutex); + /* + * If we don't find bigger shadow list on + * second search (it may happen since we + * dropped bucket lock) keep the entry that + * matches our own shadow list. + */ + keep = 1; + goto again; + } } } mutex_exit(&hp->p_hmutex); + if (pmtx != NULL) { + mutex_exit(pmtx); + } out: - (void) (*callback)(seg, addr, len, pp, rw); + (*callback)(htag0, addr, len, pp, rw, 0); + if (npages) { + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); + seg_plocked -= npages; + if (!IS_PFLAGS_WIRED(flags)) { + ASSERT(seg_plocked_window >= npages); + seg_plocked_window -= npages; + } + mutex_exit(&seg_pmem_mtx); + } + } +#ifdef DEBUG +static uint32_t p_insert_chk_mtbf = 0; +#endif + /* * The seg_pinsert_check() is used by segment drivers to predict whether * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. */ - +/*ARGSUSED*/ int -seg_pinsert_check(struct seg *seg, size_t len, uint_t flags) +seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr, + size_t len, uint_t flags) { - struct seg_phash *hp; + ASSERT(seg != NULL); - if (seg_plazy == 0) { - return (SEGP_FAIL); - } - if (seg_pdisable != 0) { +#ifdef DEBUG + if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) { return (SEGP_FAIL); } - ASSERT((len & PAGEOFFSET) == 0); - hp = &p_hashtab[p_hash(seg)]; - if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { +#endif + + if (seg_pdisabled) { return (SEGP_FAIL); } - /* - * If the SEGP_FORCE_WIRED flag is set, - * we skip the check for seg_pwindow.
- */ - if ((flags & SEGP_FORCE_WIRED) == 0) { - pgcnt_t npages; + ASSERT(seg_phashsize_win != 0); + + if (IS_PFLAGS_WIRED(flags)) { + return (SEGP_SUCCESS); + } - npages = len >> PAGESHIFT; - if ((seg_plocked_window + npages) > seg_pwindow) { - return (SEGP_FAIL); - } + if (seg_plocked_window + btop(len) > seg_pmaxwindow) { + return (SEGP_FAIL); } + + if (freemem < desfree) { + return (SEGP_FAIL); + } + return (SEGP_SUCCESS); } +#ifdef DEBUG +static uint32_t p_insert_mtbf = 0; +#endif /* - * insert address range with shadow list into pagelock cache. If - * the cache is off or caching is temporarily disabled or the allowed - * 'window' is exceeded - return SEGP_FAIL. Otherwise return - * SEGP_SUCCESS. + * Insert address range with shadow list into pagelock cache if there's no + * shadow list already cached for this address range. If the cache is off or + * caching is temporarily disabled or the allowed 'window' is exceeded return + * SEGP_FAIL. Otherwise return SEGP_SUCCESS. + * + * For non wired shadow lists (segvn case) include address in the hashing + * function to avoid linking all the entries from the same segment or amp on + * the same bucket. amp is used instead of seg if amp is not NULL. Non wired + * pcache entries are also linked on a per segment/amp list so that all + * entries can be found quickly during seg/amp purge without walking the + * entire pcache hash table. For wired shadow lists (segspt case) we + * don't use address hashing and per segment linking because the caller + * currently inserts only one entry per segment that covers the entire + * segment. If we used per segment linking even for segspt it would complicate + * seg_ppurge_wiredpp() locking. + * + * Both hash bucket and per seg/amp locks need to be held before adding a non + * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken + * first. + * + * This function will also remove from pcache old inactive shadow lists that + * overlap with this request but cover smaller range for the same start + * address. */ int -seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp, - enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t, - size_t, struct page **, enum seg_rw)) +seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, + size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags, + seg_preclaim_cbfunc_t callback) { struct seg_pcache *pcp; struct seg_phash *hp; pgcnt_t npages; + pcache_link_t *pheadp; + kmutex_t *pmtx; + struct seg_pcache *delcallb_list = NULL; - if (seg_plazy == 0) { + ASSERT(seg != NULL); + ASSERT(rw == S_READ || rw == S_WRITE); + ASSERT(rw == S_READ || wlen == len); + ASSERT(rw == S_WRITE || wlen <= len); + ASSERT(amp == NULL || wlen == len); + +#ifdef DEBUG + if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) { return (SEGP_FAIL); } - if (seg_pdisable != 0) { - return (SEGP_FAIL); - } - ASSERT((len & PAGEOFFSET) == 0); - hp = &p_hashtab[p_hash(seg)]; - if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { +#endif + + if (seg_pdisabled) { return (SEGP_FAIL); } - npages = len >> PAGESHIFT; - mutex_enter(&seg_pmem); - /* - * If the SEGP_FORCE_WIRED flag is set, - * we skip the check for seg_pwindow. 
- */ - if ((flags & SEGP_FORCE_WIRED) == 0) { - seg_plocked_window += npages; - if (seg_plocked_window > seg_pwindow) { - seg_plocked_window -= npages; - mutex_exit(&seg_pmem); + ASSERT(seg_phashsize_win != 0); + + ASSERT((len & PAGEOFFSET) == 0); + npages = btop(len); + mutex_enter(&seg_pmem_mtx); + if (!IS_PFLAGS_WIRED(flags)) { + if (seg_plocked_window + npages > seg_pmaxwindow) { + mutex_exit(&seg_pmem_mtx); return (SEGP_FAIL); } + seg_plocked_window += npages; } seg_plocked += npages; - mutex_exit(&seg_pmem); + mutex_exit(&seg_pmem_mtx); - pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP); - pcp->p_seg = seg; + pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP); + /* + * If amp is not NULL set htag0 to amp otherwise set it to seg. + */ + if (amp == NULL) { + pcp->p_htag0 = (void *)seg; + pcp->p_flags = flags & 0xffff; + } else { + pcp->p_htag0 = (void *)amp; + pcp->p_flags = (flags & 0xffff) | SEGP_AMP; + } pcp->p_addr = addr; pcp->p_len = len; + pcp->p_wlen = wlen; pcp->p_pp = pp; - pcp->p_rw = rw; + pcp->p_write = (rw == S_WRITE); pcp->p_callback = callback; pcp->p_active = 1; - pcp->p_flags = flags; - PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n", - (void *)seg, (void *)addr, len, (void *)pp); - - hp = &p_hashtab[p_hash(seg)]; - mutex_enter(&hp->p_hmutex); - hp->p_qlen++; + hp = P_HASHBP(seg, pcp->p_htag0, addr, flags); + if (!IS_PFLAGS_WIRED(flags)) { + int found; + void *htag0; + if (amp == NULL) { + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + htag0 = (void *)seg; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + htag0 = (void *)amp; + } + mutex_enter(pmtx); + mutex_enter(&hp->p_hmutex); + delcallb_list = seg_plookup_checkdup(hp, htag0, addr, + len, &found); + if (found) { + mutex_exit(&hp->p_hmutex); + mutex_exit(pmtx); + mutex_enter(&seg_pmem_mtx); + seg_plocked -= npages; + seg_plocked_window -= npages; + mutex_exit(&seg_pmem_mtx); + kmem_cache_free(seg_pkmcache, pcp); + goto out; + } + pcp->p_plink.p_lnext = pheadp->p_lnext; + pcp->p_plink.p_lprev = pheadp; + pheadp->p_lnext->p_lprev = &pcp->p_plink; + pheadp->p_lnext = &pcp->p_plink; + } else { + mutex_enter(&hp->p_hmutex); + } + pcp->p_hashp = hp; pcp->p_hnext = hp->p_hnext; pcp->p_hprev = (struct seg_pcache *)hp; hp->p_hnext->p_hprev = pcp; hp->p_hnext = pcp; + if (!IS_PFLAGS_WIRED(flags) && + hp->p_hprev == pcp) { + seg_padd_abuck(hp); + } mutex_exit(&hp->p_hmutex); + if (!IS_PFLAGS_WIRED(flags)) { + mutex_exit(pmtx); + } + +out: + npages = 0; + while (delcallb_list != NULL) { + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active); + (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0); + npages += btop(pcp->p_len); + kmem_cache_free(seg_pkmcache, pcp); + } + if (npages) { + ASSERT(!IS_PFLAGS_WIRED(flags)); + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); + ASSERT(seg_plocked_window >= npages); + seg_plocked -= npages; + seg_plocked_window -= npages; + mutex_exit(&seg_pmem_mtx); + } + return (SEGP_SUCCESS); } /* - * purge all entries from the pagelock cache if not active - * and not recently used. Drop all locks and call through - * the address space into the segment driver to reclaim - * the pages. This makes sure we get the address space - * and segment driver locking right. + * purge entries from the pagelock cache if not active + * and not recently used.
*/ static void -seg_ppurge_all(int force) +seg_ppurge_async(int force) { struct seg_pcache *delcallb_list = NULL; struct seg_pcache *pcp; struct seg_phash *hp; - int purge_count = 0; pgcnt_t npages = 0; pgcnt_t npages_window = 0; + pgcnt_t npgs_to_purge; + pgcnt_t npgs_purged = 0; + int hlinks = 0; + int hlix; + pcache_link_t *hlinkp; + pcache_link_t *hlnextp = NULL; + int lowmem; + int trim; + + ASSERT(seg_phashsize_win != 0); /* - * if the cache if off or empty, return + * if the cache is off or empty, return */ - if (seg_plazy == 0 || seg_plocked == 0) { + if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { return; } - for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { + + if (!force) { + lowmem = 0; + trim = 0; + if (freemem < lotsfree + needfree) { + spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); + if (fmem <= 5 * (desfree >> 2)) { + lowmem = 1; + } else if (fmem <= 7 * (lotsfree >> 3)) { + if (seg_plocked_window >= + (availrmem_initial >> 1)) { + lowmem = 1; + } + } else if (fmem < lotsfree) { + if (seg_plocked_window >= + 3 * (availrmem_initial >> 2)) { + lowmem = 1; + } + } + } + if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { + trim = 1; + } + if (!lowmem && !trim) { + return; + } + npgs_to_purge = seg_plocked_window >> + seg_pshrink_shift; + if (lowmem) { + npgs_to_purge = MIN(npgs_to_purge, + MAX(seg_pmaxapurge_npages, desfree)); + } else { + npgs_to_purge = MIN(npgs_to_purge, + seg_pmaxapurge_npages); + } + if (npgs_to_purge == 0) { + return; + } + } else { + struct seg_phash_wired *hpw; + + ASSERT(seg_phashsize_wired != 0); + + for (hpw = seg_phashtab_wired; + hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { + + if (hpw->p_hnext == (struct seg_pcache *)hpw) { + continue; + } + + mutex_enter(&hpw->p_hmutex); + + for (pcp = hpw->p_hnext; + pcp != (struct seg_pcache *)hpw; + pcp = pcp->p_hnext) { + + ASSERT(IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_hashp == + (struct seg_phash *)hpw); + + if (pcp->p_active) { + continue; + } + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } + mutex_exit(&hpw->p_hmutex); + } + } + + mutex_enter(&seg_pmem_mtx); + if (seg_pathr_on) { + mutex_exit(&seg_pmem_mtx); + goto runcb; + } + seg_pathr_on = 1; + mutex_exit(&seg_pmem_mtx); + ASSERT(seg_pahcur <= 1); + hlix = !seg_pahcur; + +again: + for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; + hlinkp = hlnextp) { + + hlnextp = hlinkp->p_lnext; + ASSERT(hlnextp != NULL); + + hp = hlink2phash(hlinkp, hlix); + if (hp->p_hnext == (struct seg_pcache *)hp) { + seg_pathr_empty_ahb++; + continue; + } + seg_pathr_full_ahb++; mutex_enter(&hp->p_hmutex); - pcp = hp->p_hnext; + + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + pcache_link_t *pheadp; + pcache_link_t *plinkp; + void *htag0; + kmutex_t *pmtx; + + ASSERT(!IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_hashp == hp); + + if (pcp->p_active) { + continue; + } + if (!force && pcp->p_ref && + PCP_AGE(pcp) < seg_pmax_pcpage) { + pcp->p_ref = 0; + continue; + } + plinkp = &pcp->p_plink; + htag0 = pcp->p_htag0; + if (pcp->p_flags & SEGP_AMP) { + pheadp = &((amp_t *)htag0)->a_phead; + pmtx = &((amp_t *)htag0)->a_pmtx; + } else { + pheadp = &((seg_t *)htag0)->s_phead; + pmtx = &((seg_t *)htag0)->s_pmtx; + } + if (!mutex_tryenter(pmtx)) { + continue; + } + ASSERT(pheadp->p_lnext != pheadp); + ASSERT(pheadp->p_lprev != pheadp); + plinkp->p_lprev->p_lnext = + plinkp->p_lnext; + 
plinkp->p_lnext->p_lprev = + plinkp->p_lprev; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + mutex_exit(pmtx); + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + npgs_purged += btop(pcp->p_len); + } + if (hp->p_hnext == (struct seg_pcache *)hp) { + seg_premove_abuck(hp, 1); + } + mutex_exit(&hp->p_hmutex); + if (npgs_purged >= seg_plocked_window) { + break; + } + if (!force) { + if (npgs_purged >= npgs_to_purge) { + break; + } + if (!trim && !(seg_pathr_full_ahb & 15)) { + ASSERT(lowmem); + if (freemem >= lotsfree + needfree) { + break; + } + } + } + } + + if (hlinkp == &seg_pahhead[hlix]) { + /* + * We processed the entire hlix active bucket list + * but didn't find enough pages to reclaim. + * Switch the lists and walk the other list + * if we haven't done it yet. + */ + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_pathr_on); + ASSERT(seg_pahcur == !hlix); + seg_pahcur = hlix; + mutex_exit(&seg_pmem_mtx); + if (++hlinks < 2) { + hlix = !hlix; + goto again; + } + } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && + seg_pahhead[hlix].p_lnext != hlinkp) { + ASSERT(hlinkp != NULL); + ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); + ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); + ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); /* - * While 'force' is set, seg_pasync_thread is not - * throttled. This is to speedup flushing of seg_pcache - * in preparation for DR. - * - * In normal case, when 'force' is not set, we throttle - * seg_pasync_thread so that we don't spend all the time - * time in purging the cache. + * Reinsert the header to point to hlinkp + * so that we start from hlinkp bucket next time around. */ - while ((pcp != (struct seg_pcache *)hp) && - (force || (purge_count <= seg_ppcount))) { + seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; + seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; + seg_pahhead[hlix].p_lnext = hlinkp; + seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; + hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; + hlinkp->p_lprev = &seg_pahhead[hlix]; + } + + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_pathr_on); + seg_pathr_on = 0; + mutex_exit(&seg_pmem_mtx); +runcb: + /* + * Run the delayed callback list. segments/amps can't go away until + * callback is executed since they must have non 0 softlockcnt. That's + * why we don't need to hold as/seg/amp locks to execute the callback. + */ + while (delcallb_list != NULL) { + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + ASSERT(!pcp->p_active); + (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1); + npages += btop(pcp->p_len); + if (!IS_PCP_WIRED(pcp)) { + npages_window += btop(pcp->p_len); + } + kmem_cache_free(seg_pkmcache, pcp); + } + if (npages) { + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); + ASSERT(seg_plocked_window >= npages_window); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem_mtx); + } +} + +/* + * Remove cached pages for segment(s) entries from hashtable. The segments + * are identified by pp array. This is useful for multiple seg's cached on + * behalf of dummy segment (ISM/DISM) with common pp array. 
+ */ +void +seg_ppurge_wiredpp(struct page **pp) +{ + struct seg_pcache *pcp; + struct seg_phash_wired *hp; + pgcnt_t npages = 0; + struct seg_pcache *delcallb_list = NULL; + + /* + * if the cache is empty, return + */ + if (seg_plocked == 0) { + return; + } + ASSERT(seg_phashsize_wired != 0); + + for (hp = seg_phashtab_wired; + hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { + if (hp->p_hnext == (struct seg_pcache *)hp) { + continue; + } + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + ASSERT(pcp->p_hashp == (struct seg_phash *)hp); + ASSERT(IS_PCP_WIRED(pcp)); /* - * purge entries which are not active and - * have not been used recently and - * have the SEGP_ASYNC_FLUSH flag. - * - * In the 'force' case, we ignore the - * SEGP_ASYNC_FLUSH flag. + * purge entries which are not active */ - if (!(pcp->p_flags & SEGP_ASYNC_FLUSH)) - pcp->p_ref = 1; - if (force) - pcp->p_ref = 0; - if (!pcp->p_ref && !pcp->p_active) { - struct as *as = pcp->p_seg->s_as; - - /* - * try to get the readers lock on the address - * space before taking out the cache element. - * This ensures as_pagereclaim() can actually - * call through the address space and free - * the pages. If we don't get the lock, just - * skip this entry. The pages will be reclaimed - * by the segment driver at unmap time. - */ - if (AS_LOCK_TRYENTER(as, &as->a_lock, - RW_READER)) { - hp->p_qlen--; - pcp->p_hprev->p_hnext = pcp->p_hnext; - pcp->p_hnext->p_hprev = pcp->p_hprev; - pcp->p_hprev = delcallb_list; - delcallb_list = pcp; - purge_count++; - } - } else { - pcp->p_ref = 0; + if (!pcp->p_active && pcp->p_pp == pp) { + ASSERT(pcp->p_htag0 != NULL); + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; } pcp = pcp->p_hnext; } mutex_exit(&hp->p_hmutex); - if (!force && purge_count > seg_ppcount) - break; + /* + * segments can't go away until callback is executed since + * they must have non 0 softlockcnt. That's why we don't + * need to hold as/seg locks to execute the callback. + */ + while (delcallb_list != NULL) { + int done; + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + ASSERT(!pcp->p_active); + done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, + pcp->p_len, pcp->p_pp, + pcp->p_write ? S_WRITE : S_READ, 1); + npages += btop(pcp->p_len); + ASSERT(IS_PCP_WIRED(pcp)); + kmem_cache_free(seg_pkmcache, pcp); + if (done) { + ASSERT(delcallb_list == NULL); + goto out; + } + } } - /* - * run the delayed callback list. We don't want to hold the - * cache lock during a call through the address space. - */ - while (delcallb_list != NULL) { - struct as *as; - - pcp = delcallb_list; - delcallb_list = pcp->p_hprev; - as = pcp->p_seg->s_as; - - PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, " - "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr, - pcp->p_len, (void *)pcp->p_pp); - - as_pagereclaim(as, pcp->p_pp, pcp->p_addr, - pcp->p_len, pcp->p_rw); - AS_LOCK_EXIT(as, &as->a_lock); - npages += pcp->p_len >> PAGESHIFT; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - npages_window += pcp->p_len >> PAGESHIFT; - } - kmem_free(pcp, sizeof (struct seg_pcache)); - } - mutex_enter(&seg_pmem); +out: + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); seg_plocked -= npages; - seg_plocked_window -= npages_window; - mutex_exit(&seg_pmem); -} - -/* - * Remove cached pages for segment(s) entries from hashtable. - * The segments are identified by a given clients callback - * function. 
- * This is useful for multiple seg's cached on behalf of - * dummy segment (ISM/DISM) with common callback function. - * The clients callback function may return status indicating - * that the last seg's entry has been purged. In such a case - * the seg_ppurge_seg() stops searching hashtable and exits. - * Otherwise all hashtable entries are scanned. - */ -void -seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t, - struct page **, enum seg_rw)) -{ - struct seg_pcache *pcp, *npcp; - struct seg_phash *hp; - pgcnt_t npages = 0; - pgcnt_t npages_window = 0; - int done = 0; - - /* - * if the cache if off or empty, return - */ - if (seg_plazy == 0 || seg_plocked == 0) { - return; - } - mutex_enter(&seg_pcache); - seg_pdisable++; - mutex_exit(&seg_pcache); - - for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { - - mutex_enter(&hp->p_hmutex); - pcp = hp->p_hnext; - while (pcp != (struct seg_pcache *)hp) { - - /* - * purge entries which are not active - */ - npcp = pcp->p_hnext; - if (!pcp->p_active && pcp->p_callback == callback) { - hp->p_qlen--; - pcp->p_hprev->p_hnext = pcp->p_hnext; - pcp->p_hnext->p_hprev = pcp->p_hprev; - - if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr, - pcp->p_len, pcp->p_pp, pcp->p_rw)) { - done = 1; - } - - npages += pcp->p_len >> PAGESHIFT; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - npages_window += - pcp->p_len >> PAGESHIFT; - } - kmem_free(pcp, sizeof (struct seg_pcache)); - } - pcp = npcp; - if (done) - break; - } - mutex_exit(&hp->p_hmutex); - if (done) - break; - } - - mutex_enter(&seg_pcache); - seg_pdisable--; - mutex_exit(&seg_pcache); - - mutex_enter(&seg_pmem); - seg_plocked -= npages; - seg_plocked_window -= npages_window; - mutex_exit(&seg_pmem); + mutex_exit(&seg_pmem_mtx); } /* @@ -546,55 +1267,99 @@ * reclaim the caller needs to hold the right locks. */ void -seg_ppurge(struct seg *seg) +seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags) { struct seg_pcache *delcallb_list = NULL; struct seg_pcache *pcp; struct seg_phash *hp; pgcnt_t npages = 0; - pgcnt_t npages_window = 0; + void *htag0; - if (seg_plazy == 0) { + if (seg_plocked == 0) { return; } - hp = &p_hashtab[p_hash(seg)]; - mutex_enter(&hp->p_hmutex); - pcp = hp->p_hnext; - while (pcp != (struct seg_pcache *)hp) { - if (pcp->p_seg == seg) { + ASSERT(seg_phashsize_win != 0); + + /* + * If amp is not NULL use amp as a lookup tag otherwise use seg + * as a lookup tag. + */ + htag0 = (amp == NULL ? 
(void *)seg : (void *)amp); + ASSERT(htag0 != NULL); + if (IS_PFLAGS_WIRED(flags)) { + hp = P_HASHBP(seg, htag0, 0, flags); + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + ASSERT(pcp->p_hashp == hp); + ASSERT(IS_PCP_WIRED(pcp)); + if (pcp->p_htag0 == htag0) { + if (pcp->p_active) { + break; + } + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } + pcp = pcp->p_hnext; + } + mutex_exit(&hp->p_hmutex); + } else { + pcache_link_t *plinkp; + pcache_link_t *pheadp; + kmutex_t *pmtx; + + if (amp == NULL) { + ASSERT(seg != NULL); + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + } + mutex_enter(pmtx); + while ((plinkp = pheadp->p_lnext) != pheadp) { + pcp = plink2pcache(plinkp); + ASSERT(!IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_htag0 == htag0); + hp = pcp->p_hashp; + mutex_enter(&hp->p_hmutex); if (pcp->p_active) { + mutex_exit(&hp->p_hmutex); break; } - hp->p_qlen--; + ASSERT(plinkp->p_lprev == pheadp); + pheadp->p_lnext = plinkp->p_lnext; + plinkp->p_lnext->p_lprev = pheadp; pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; pcp->p_hprev = delcallb_list; delcallb_list = pcp; + if (hp->p_hnext == (struct seg_pcache *)hp) { + seg_premove_abuck(hp, 0); + } + mutex_exit(&hp->p_hmutex); } - pcp = pcp->p_hnext; + mutex_exit(pmtx); } - mutex_exit(&hp->p_hmutex); while (delcallb_list != NULL) { pcp = delcallb_list; delcallb_list = pcp->p_hprev; - - PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, " - "pplist %p\n", (void *)seg, (void *)pcp->p_addr, - pcp->p_len, (void *)pcp->p_pp); - - ASSERT(seg == pcp->p_seg); - (void) (*pcp->p_callback)(seg, pcp->p_addr, - pcp->p_len, pcp->p_pp, pcp->p_rw); - npages += pcp->p_len >> PAGESHIFT; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - npages_window += pcp->p_len >> PAGESHIFT; - } - kmem_free(pcp, sizeof (struct seg_pcache)); + ASSERT(!pcp->p_active); + (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, + pcp->p_pp, pcp->p_write ?
S_WRITE : S_READ, 0); + npages += btop(pcp->p_len); + kmem_cache_free(seg_pkmcache, pcp); } - mutex_enter(&seg_pmem); + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); seg_plocked -= npages; - seg_plocked_window -= npages_window; - mutex_exit(&seg_pmem); + if (!IS_PFLAGS_WIRED(flags)) { + ASSERT(seg_plocked_window >= npages); + seg_plocked_window -= npages; + } + mutex_exit(&seg_pmem_mtx); } static void seg_pinit_mem_config(void); @@ -606,58 +1371,125 @@ seg_pinit(void) { struct seg_phash *hp; - int i; - uint_t physmegs; + ulong_t i; + pgcnt_t physmegs; + + seg_plocked = 0; + seg_plocked_window = 0; + + if (segpcache_enabled == 0) { + seg_phashsize_win = 0; + seg_phashsize_wired = 0; + seg_pdisabled = 1; + return; + } - sema_init(&seg_pasync_sem, 0, NULL, SEMA_DEFAULT, NULL); + seg_pdisabled = 0; + seg_pkmcache = kmem_cache_create("seg_pcache", + sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (segpcache_pcp_maxage_ticks <= 0) { + segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; + } + seg_pmax_pcpage = segpcache_pcp_maxage_ticks; + seg_pathr_empty_ahb = 0; + seg_pathr_full_ahb = 0; + seg_pshrink_shift = segpcache_shrink_shift; + seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); - mutex_enter(&seg_pcache); - if (p_hashtab == NULL) { - physmegs = physmem >> (20 - PAGESHIFT); + mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); + + physmegs = physmem >> (20 - PAGESHIFT); - /* If p_hashsize was not set in /etc/system ... */ - if (p_hashsize == 0) { - /* - * Choose p_hashsize based on physmem. - */ - if (physmegs < 64) { - p_hashsize = 64; - } else if (physmegs < 1024) { - p_hashsize = 1024; - } else if (physmegs < 10 * 1024) { - p_hashsize = 8192; - } else if (physmegs < 20 * 1024) { - p_hashsize = 2 * 8192; - seg_pmaxqlen = 16; - } else { - p_hashsize = 128 * 1024; - seg_pmaxqlen = 128; - } - } + /* + * If segpcache_hashsize_win was not set in /etc/system or it has + * absurd value set it to a default. + */ + if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { + /* + * Create one bucket per 32K (or at least per 8 pages) of + * available memory. 
+ */ + pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); + segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); + } + if (!ISP2(segpcache_hashsize_win)) { + ulong_t rndfac = ~(1UL << + (highbit(segpcache_hashsize_win) - 1)); + rndfac &= segpcache_hashsize_win; + segpcache_hashsize_win += rndfac; + segpcache_hashsize_win = 1 << + (highbit(segpcache_hashsize_win) - 1); + } + seg_phashsize_win = segpcache_hashsize_win; + seg_phashtab_win = kmem_zalloc( + seg_phashsize_win * sizeof (struct seg_phash), + KM_SLEEP); + for (i = 0; i < seg_phashsize_win; i++) { + hp = &seg_phashtab_win[i]; + hp->p_hnext = (struct seg_pcache *)hp; + hp->p_hprev = (struct seg_pcache *)hp; + mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); + } - p_hashtab = kmem_zalloc(p_hashsize * sizeof (struct seg_phash), - KM_SLEEP); - for (i = 0; i < p_hashsize; i++) { - hp = (struct seg_phash *)&p_hashtab[i]; - hp->p_hnext = (struct seg_pcache *)hp; - hp->p_hprev = (struct seg_pcache *)hp; - mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); - } - if (seg_pwindow == 0) { - if (physmegs < 24) { - /* don't use cache */ - seg_plazy = 0; - } else if (physmegs < 64) { - seg_pwindow = physmem >> 5; /* 3% of memory */ - } else if (physmegs < 10 * 1024) { - seg_pwindow = physmem >> 3; /* 12% of memory */ - } else { - seg_pwindow = physmem >> 1; - } + seg_pahcur = 0; + seg_pathr_on = 0; + seg_pahhead[0].p_lnext = &seg_pahhead[0]; + seg_pahhead[0].p_lprev = &seg_pahhead[0]; + seg_pahhead[1].p_lnext = &seg_pahhead[1]; + seg_pahhead[1].p_lprev = &seg_pahhead[1]; + + /* + * If segpcache_hashsize_wired was not set in /etc/system or it has + * absurd value set it to a default. + */ + if (segpcache_hashsize_wired == 0 || + segpcache_hashsize_wired > physmem / 4) { + /* + * Choose segpcache_hashsize_wired based on physmem. + * Create a bucket per 128K bytes upto 256K buckets. + */ + if (physmegs < 20 * 1024) { + segpcache_hashsize_wired = MAX(1024, physmegs << 3); + } else { + segpcache_hashsize_wired = 256 * 1024; } } - mutex_exit(&seg_pcache); + if (!ISP2(segpcache_hashsize_wired)) { + segpcache_hashsize_wired = 1 << + highbit(segpcache_hashsize_wired); + } + seg_phashsize_wired = segpcache_hashsize_wired; + seg_phashtab_wired = kmem_zalloc( + seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); + for (i = 0; i < seg_phashsize_wired; i++) { + hp = (struct seg_phash *)&seg_phashtab_wired[i]; + hp->p_hnext = (struct seg_pcache *)hp; + hp->p_hprev = (struct seg_pcache *)hp; + mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); + } + if (segpcache_maxwindow == 0) { + if (physmegs < 64) { + /* 3% of memory */ + segpcache_maxwindow = availrmem >> 5; + } else if (physmegs < 512) { + /* 12% of memory */ + segpcache_maxwindow = availrmem >> 3; + } else if (physmegs < 1024) { + /* 25% of memory */ + segpcache_maxwindow = availrmem >> 2; + } else if (physmegs < 2048) { + /* 50% of memory */ + segpcache_maxwindow = availrmem >> 1; + } else { + /* no limit */ + segpcache_maxwindow = (pgcnt_t)-1; + } + } + seg_pmaxwindow = segpcache_maxwindow; seg_pinit_mem_config(); } @@ -668,16 +1500,24 @@ seg_preap(void) { /* - * if the cache if off or empty, return + * if the cache is off or empty, return */ - if (seg_plocked == 0 || seg_plazy == 0) { + if (seg_plocked_window == 0) { return; } - sema_v(&seg_pasync_sem); + ASSERT(seg_phashsize_win != 0); + + /* + * If somebody is already purging pcache + * just return. 
+ */ + if (seg_pdisabled) { + return; + } + + cv_signal(&seg_pasync_cv); } -static void seg_pupdate(void *); - /* * run as a backgroud thread and reclaim pagelock * pages which have not been used recently @@ -686,42 +1526,30 @@ seg_pasync_thread(void) { callb_cpr_t cpr_info; - kmutex_t pasync_lock; /* just for CPR stuff */ - mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL); - - CALLB_CPR_INIT(&cpr_info, &pasync_lock, callb_generic_cpr, - "seg_pasync"); - - if (seg_preap_interval == 0) { - seg_preap_interval = seg_preap_time * hz; - } else { - seg_preap_interval *= hz; - } - if (seg_plazy && seg_pupdate_active) { - (void) timeout(seg_pupdate, NULL, seg_preap_interval); + if (seg_phashsize_win == 0) { + thread_exit(); + /*NOTREACHED*/ } - for (;;) { - mutex_enter(&pasync_lock); - CALLB_CPR_SAFE_BEGIN(&cpr_info); - mutex_exit(&pasync_lock); - sema_p(&seg_pasync_sem); - mutex_enter(&pasync_lock); - CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock); - mutex_exit(&pasync_lock); + seg_pasync_thr = curthread; + + CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, + callb_generic_cpr, "seg_pasync"); + + if (segpcache_reap_ticks <= 0) { + segpcache_reap_ticks = segpcache_reap_sec * hz; + } - seg_ppurge_all(0); - } -} - -static void -seg_pupdate(void *dummy) -{ - sema_v(&seg_pasync_sem); - - if (seg_plazy && seg_pupdate_active) { - (void) timeout(seg_pupdate, dummy, seg_preap_interval); + mutex_enter(&seg_pasync_mtx); + for (;;) { + CALLB_CPR_SAFE_BEGIN(&cpr_info); + (void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx, + lbolt + segpcache_reap_ticks); + CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); + if (seg_pdisabled == 0) { + seg_ppurge_async(0); + } } } @@ -735,8 +1563,8 @@ { kstat_t *ksp; - seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 0, - NULL, NULL, NULL, NULL, NULL, 0); + seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), + 0, NULL, NULL, NULL, NULL, NULL, 0); ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, segadvstat_ndata, KSTAT_FLAG_VIRTUAL); @@ -776,6 +1604,9 @@ new->s_data = NULL; new->s_szc = 0; new->s_flags = 0; + mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL); + new->s_phead.p_lnext = &new->s_phead; + new->s_phead.p_lprev = &new->s_phead; if (seg_attach(as, segbase, segsize, new) < 0) { kmem_cache_free(seg_cache, new); return ((struct seg *)NULL); @@ -857,6 +1688,9 @@ if (seg->s_data != NULL) SEGOP_FREE(seg); + mutex_destroy(&seg->s_pmtx); + ASSERT(seg->s_phead.p_lnext == &seg->s_phead); + ASSERT(seg->s_phead.p_lprev == &seg->s_phead); kmem_cache_free(seg_cache, seg); } @@ -872,10 +1706,10 @@ void seg_p_enable(void) { - mutex_enter(&seg_pcache); - ASSERT(seg_pdisable != 0); - seg_pdisable--; - mutex_exit(&seg_pcache); + mutex_enter(&seg_pcache_mtx); + ASSERT(seg_pdisabled != 0); + seg_pdisabled--; + mutex_exit(&seg_pcache_mtx); } /* @@ -890,18 +1724,19 @@ pgcnt_t old_plocked; int stall_count = 0; - mutex_enter(&seg_pcache); - seg_pdisable++; - ASSERT(seg_pdisable != 0); - mutex_exit(&seg_pcache); + mutex_enter(&seg_pcache_mtx); + seg_pdisabled++; + ASSERT(seg_pdisabled != 0); + mutex_exit(&seg_pcache_mtx); /* * Attempt to empty the cache. Terminate if seg_plocked does not * diminish with SEGP_STALL_THRESHOLD consecutive attempts. */ while (seg_plocked != 0) { + ASSERT(seg_phashsize_win != 0); old_plocked = seg_plocked; - seg_ppurge_all(1); + seg_ppurge_async(1); if (seg_plocked == old_plocked) { if (stall_count++ > SEGP_STALL_THRESHOLD) { return (SEGP_FAIL); @@ -918,7 +1753,7 @@ * Attempt to purge seg_pcache. 
May need to return before this has * completed to allow other pre_del callbacks to unlock pages. This is * ok because: - * 1) The seg_pdisable flag has been set so at least we won't + * 1) The seg_pdisabled flag has been set so at least we won't * cache anymore locks and the locks we couldn't purge * will not be held if they do get released by a subsequent * pre-delete callback. @@ -934,6 +1769,9 @@ void *arg, pgcnt_t delta_pages) { + if (seg_phashsize_win == 0) { + return (0); + } if (seg_p_disable() != SEGP_SUCCESS) cmn_err(CE_NOTE, "!Pre-delete couldn't purge"" pagelock cache - continuing"); @@ -947,6 +1785,9 @@ pgcnt_t delta_pages, int cancelled) { + if (seg_phashsize_win == 0) { + return; + } seg_p_enable(); } @@ -971,9 +1812,6 @@ ASSERT(ret == 0); } -extern struct seg_ops segvn_ops; -extern struct seg_ops segspt_shmops; - /* * Verify that segment is not a shared anonymous segment which reserves * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
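Taken together, the vm_seg.c changes replace per-segment hashing and seg_ppurge_seg() with htag0 (seg or amp) tagged entries, separate wired and non-wired hash tables, and per seg/amp entry lists. A rough sketch of how a segment driver's pagelock and pageunlock paths are expected to call the reworked interfaces; locking, error handling and shadow-list construction are omitted, and my_reclaim()/my_pagelock()/my_pageunlock() are placeholder names, not functions from this changeset:

#include <sys/types.h>
#include <sys/errno.h>
#include <vm/seg.h>
#include <vm/anon.h>
#include <vm/page.h>

/* reclaim callback run by pcache when a cached shadow list is torn down */
static int
my_reclaim(void *htag0, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int async)
{
	/* unlock and release the pages named by the shadow list here */
	return (0);
}

static int
my_pagelock(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    struct page ***ppp, enum seg_rw rw)
{
	struct page **pplist;

	/* fast path: reuse a cached shadow list that covers [addr, addr + len) */
	if ((pplist = seg_plookup(seg, amp, addr, len, rw, 0)) != NULL) {
		*ppp = pplist;
		return (0);
	}

	if (seg_pinsert_check(seg, amp, addr, len, 0) == SEGP_FAIL)
		return (ENOTSUP);	/* caller falls back to its slow path */

	/* ... softlock the pages and build pplist here (driver specific) ... */
	pplist = NULL;			/* placeholder for the real shadow list */

	(void) seg_pinsert(seg, amp, addr, len, len, pplist, rw, 0, my_reclaim);
	*ppp = pplist;
	return (0);
}

static void
my_pageunlock(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    struct page **pplist, enum seg_rw rw)
{
	/* drops the active count; runs my_reclaim() if the entry is stale */
	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, my_reclaim);
}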
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c Thu May 22 22:23:49 2008 -0700 @@ -7379,28 +7379,23 @@ return (PP_GENERIC_ATTR(pp)); } - if ((clearflag == (HAT_SYNC_STOPON_REF | HAT_SYNC_DONTZERO)) && - PP_ISREF(pp)) { - return (PP_GENERIC_ATTR(pp)); - } - - if ((clearflag == (HAT_SYNC_STOPON_MOD | HAT_SYNC_DONTZERO)) && - PP_ISMOD(pp)) { - return (PP_GENERIC_ATTR(pp)); - } - - if ((clearflag & HAT_SYNC_STOPON_SHARED) != 0 && - (pp->p_share > po_share) && - !(clearflag & HAT_SYNC_ZERORM)) { - hat_page_setattr(pp, P_REF); - return (PP_GENERIC_ATTR(pp)); - } - - if ((clearflag & HAT_SYNC_STOPON_SHARED) && - !(clearflag & HAT_SYNC_ZERORM)) { - stop_on_sh = 1; - shcnt = 0; - } + if ((clearflag & HAT_SYNC_ZERORM) == 0) { + if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { + return (PP_GENERIC_ATTR(pp)); + } + if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { + return (PP_GENERIC_ATTR(pp)); + } + if (clearflag & HAT_SYNC_STOPON_SHARED) { + if (pp->p_share > po_share) { + hat_page_setattr(pp, P_REF); + return (PP_GENERIC_ATTR(pp)); + } + stop_on_sh = 1; + shcnt = 0; + } + } + clearflag &= ~HAT_SYNC_STOPON_SHARED; pml = sfmmu_mlist_enter(pp); index = PP_MAPINDEX(pp);
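The hat_pagesync() hunk above folds the HAT_SYNC_STOPON_* shortcuts into one block that only runs when HAT_SYNC_ZERORM is clear, since a caller that wants the ref/mod bits cleared must always walk the mappings. A stand-alone restatement of that early-return decision, using simplified flag and page types rather than the real HAT definitions:

#include <stdbool.h>

#define SYNC_ZERORM		0x01	/* clear ref/mod while syncing */
#define SYNC_STOPON_REF		0x02
#define SYNC_STOPON_MOD		0x04
#define SYNC_STOPON_SHARED	0x08

struct pg {
	bool ref;		/* models PP_ISREF() */
	bool mod;		/* models PP_ISMOD() */
	unsigned long share;	/* models pp->p_share */
};

/* true if the sync may return the page attributes without walking mappings */
static bool
can_stop_early(unsigned int flags, const struct pg *pp, unsigned long po_share)
{
	if (flags & SYNC_ZERORM)
		return (false);	/* must visit every mapping to clear the bits */
	if ((flags & SYNC_STOPON_REF) && pp->ref)
		return (true);
	if ((flags & SYNC_STOPON_MOD) && pp->mod)
		return (true);
	if ((flags & SYNC_STOPON_SHARED) && pp->share > po_share)
		return (true);	/* the caller also marks the page referenced */
	return (false);
}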