changeset 3253:c929f34b62c5

PSARC 2006/360 Page retire and caged memory kstats
PSARC 2006/401 /dev/physmem
6385792 physical memory driver needed for memory testing
author mec
date Thu, 14 Dec 2006 17:27:13 -0800
parents 68e8f4a5fe5d
children c68742425967
files usr/src/cmd/devfsadm/misc_link.c usr/src/pkgdefs/SUNWckr/prototype_com usr/src/pkgdefs/SUNWckr/prototype_i386 usr/src/pkgdefs/SUNWckr/prototype_sparc usr/src/pkgdefs/SUNWhea/prototype_com usr/src/pkgdefs/common_files/i.minorperm_i386 usr/src/pkgdefs/common_files/i.minorperm_sparc usr/src/uts/common/Makefile.files usr/src/uts/common/io/physmem.c usr/src/uts/common/io/physmem.conf usr/src/uts/common/os/mem_cage.c usr/src/uts/common/os/mem_config.c usr/src/uts/common/sys/Makefile usr/src/uts/common/sys/physmem.h usr/src/uts/common/sys/thread.h usr/src/uts/common/vm/page.h usr/src/uts/common/vm/page_lock.c usr/src/uts/common/vm/page_retire.c usr/src/uts/common/vm/vm_anon.c usr/src/uts/common/vm/vm_page.c usr/src/uts/common/vm/vm_pagelist.c usr/src/uts/i86pc/os/machdep.c usr/src/uts/i86pc/vm/vm_dep.h usr/src/uts/i86pc/vm/vm_machdep.c usr/src/uts/intel/Makefile.intel.shared usr/src/uts/intel/os/minor_perm usr/src/uts/intel/os/name_to_major usr/src/uts/intel/physmem/Makefile usr/src/uts/sparc/Makefile.sparc.shared usr/src/uts/sparc/os/minor_perm usr/src/uts/sparc/os/name_to_major usr/src/uts/sparc/physmem/Makefile usr/src/uts/sun4/vm/vm_dep.h usr/src/uts/sun4u/os/mach_cpu_states.c usr/src/uts/sun4u/os/ppage.c usr/src/uts/sun4v/os/mach_cpu_states.c usr/src/uts/sun4v/os/ppage.c
diffstat 37 files changed, 3008 insertions(+), 446 deletions(-)
--- a/usr/src/cmd/devfsadm/misc_link.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/cmd/devfsadm/misc_link.c	Thu Dec 14 17:27:13 2006 -0800
@@ -99,7 +99,7 @@
 	    "(^eeprom$)|(^ptsl$)|(^mm$)|(^wc$)|(^dump$)|(^cn$)|(^lo$)|(^ptm$)|"
 	    "(^ptc$)|(^openeepr$)|(^poll$)|(^sysmsg$)|(^random$)|(^trapstat$)|"
 	    "(^cryptoadm$)|(^crypto$)|(^pool$)|(^poolctl$)|(^bl$)|(^kmdb$)|"
-	    "(^sysevent$)|(^kssl$)",
+	    "(^sysevent$)|(^kssl$)|(^physmem$)",
 	    TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name
 	},
 	{ "pseudo", "ddi_pseudo",
--- a/usr/src/pkgdefs/SUNWckr/prototype_com	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_com	Thu Dec 14 17:27:13 2006 -0800
@@ -93,6 +93,7 @@
 f none kernel/drv/mm.conf 644 root sys
 f none kernel/drv/openeepr.conf 644 root sys
 f none kernel/drv/options.conf 644 root sys
+f none kernel/drv/physmem.conf 644 root sys
 f none kernel/drv/poll.conf 644 root sys
 f none kernel/drv/pseudo.conf 644 root sys
 f none kernel/drv/ptc.conf 644 root sys
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386	Thu Dec 14 17:27:13 2006 -0800
@@ -97,6 +97,7 @@
 f none kernel/drv/options 755 root sys
 f none kernel/drv/pci_to_i2o 755 root sys
 f none kernel/drv/pci_to_i2o.conf 644 root sys
+f none kernel/drv/physmem 755 root sys
 f none kernel/drv/poll 755 root sys
 f none kernel/drv/pseudo 755 root sys
 f none kernel/drv/ptc 755 root sys
@@ -274,6 +275,7 @@
 f none kernel/drv/amd64/mouse8042 755 root sys
 f none kernel/drv/amd64/openeepr 755 root sys
 f none kernel/drv/amd64/options 755 root sys
+f none kernel/drv/amd64/physmem 755 root sys
 f none kernel/drv/amd64/poll 755 root sys
 f none kernel/drv/amd64/pseudo 755 root sys
 f none kernel/drv/amd64/ptc 755 root sys
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc	Thu Dec 14 17:27:13 2006 -0800
@@ -102,6 +102,7 @@
 f none kernel/drv/sparcv9/openeepr 755 root sys
 f none kernel/drv/sparcv9/options 755 root sys
 f none kernel/drv/sparcv9/pci_pci 755 root sys
+f none kernel/drv/sparcv9/physmem 755 root sys
 f none kernel/drv/sparcv9/poll 755 root sys
 f none kernel/drv/sparcv9/pseudo 755 root sys
 f none kernel/drv/sparcv9/ptc 755 root sys
--- a/usr/src/pkgdefs/SUNWhea/prototype_com	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com	Thu Dec 14 17:27:13 2006 -0800
@@ -933,6 +933,7 @@
 f none usr/include/sys/pcmcia.h 644 root bin
 f none usr/include/sys/pctypes.h 644 root bin
 f none usr/include/sys/pem.h 644 root bin
+f none usr/include/sys/physmem.h 644 root bin
 f none usr/include/sys/serializer.h 644 root bin
 f none usr/include/sys/pfmod.h 644 root bin
 f none usr/include/sys/pm.h 0644 root bin
--- a/usr/src/pkgdefs/common_files/i.minorperm_i386	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/common_files/i.minorperm_i386	Thu Dec 14 17:27:13 2006 -0800
@@ -268,6 +268,7 @@
 systrace:systrace
 lx_ptm:lx_ptmajor
 lx_systrace:*
+physmem:*
 EOF
 }
 
--- a/usr/src/pkgdefs/common_files/i.minorperm_sparc	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/common_files/i.minorperm_sparc	Thu Dec 14 17:27:13 2006 -0800
@@ -312,6 +312,7 @@
 profile:profile
 sdt:sdt
 systrace:systrace
+physmem:*
 EOF
 }
 
--- a/usr/src/uts/common/Makefile.files	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/Makefile.files	Thu Dec 14 17:27:13 2006 -0800
@@ -571,6 +571,8 @@
 
 MM_OBJS +=	mem.o
 
+PHYSMEM_OBJS +=	physmem.o
+
 OPTIONS_OBJS += options.o
 
 WINLOCK_OBJS +=	winlockio.o
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/physmem.c	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/devops.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/cred.h>
+#include <sys/policy.h>
+#include <sys/errno.h>
+#include <vm/seg_dev.h>
+#include <vm/seg_vn.h>
+#include <vm/page.h>
+#include <sys/fs/swapnode.h>
+#include <sys/sysmacros.h>
+#include <sys/fcntl.h>
+#include <sys/vmsystm.h>
+#include <sys/physmem.h>
+
+static dev_info_t		*physmem_dip = NULL;
+
+/*
+ * Linked list element hanging off physmem_proc_hash below, which holds all
+ * the information for a given segment which has been setup for this process.
+ * This is a simple linked list as we are assuming that for a given process
+ * the setup ioctl will only be called a handful of times.  If this assumption
+ * changes in the future, a quicker to traverse data structure should be used.
+ */
+struct physmem_hash {
+	struct physmem_hash *ph_next;
+	uint64_t ph_base_pa;
+	caddr_t ph_base_va;
+	size_t ph_seg_len;
+	struct vnode *ph_vnode;
+};
+
+/*
+ * Hash of all of the processes which have setup mappings with the driver with
+ * pointers to per process data.
+ */
+struct physmem_proc_hash {
+	struct proc *pph_proc;
+	struct physmem_hash *pph_hash;
+	struct physmem_proc_hash *pph_next;
+};
+
+
+/* Needs to be a power of two for simple hash algorithm */
+#define	PPH_SIZE	8
+struct physmem_proc_hash *pph[PPH_SIZE];
+
+/*
+ * Lock which protects the pph hash above.  To add an element (either a new
+ * process or a new segment) the WRITE lock must be held.  To traverse the
+ * list, only a READ lock is needed.
+ */
+krwlock_t pph_rwlock;
+
+#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))
+
+/*
+ * Need to keep a reference count of how many processes have the driver
+ * open to prevent it from disappearing.
+ */
+uint64_t physmem_vnodecnt;
+kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */
+
+static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
+    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+    enum seg_rw rw, struct cred *cr);
+
+static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
+    struct cred *cred);
+
+static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
+    struct cred *cred);
+
+static void physmem_inactive(vnode_t *vp, cred_t *crp);
+
+const fs_operation_def_t physmem_vnodeops_template[] = {
+	VOPNAME_GETPAGE, physmem_getpage,
+	VOPNAME_ADDMAP, (fs_generic_func_p) physmem_addmap,
+	VOPNAME_DELMAP, physmem_delmap,
+	VOPNAME_INACTIVE, (fs_generic_func_p) physmem_inactive,
+	NULL, NULL
+};
+
+vnodeops_t *physmem_vnodeops = NULL;
+
+/*
+ * Removes the current process from the hash if the process has no more
+ * physmem segments active.
+ */
+void
+physmem_remove_hash_proc()
+{
+	int index;
+	struct physmem_proc_hash **walker;
+	struct physmem_proc_hash *victim = NULL;
+
+	index = PHYSMEM_HASH(curproc);
+	rw_enter(&pph_rwlock, RW_WRITER);
+	walker = &pph[index];
+	while (*walker != NULL) {
+		if ((*walker)->pph_proc == curproc &&
+		    (*walker)->pph_hash == NULL) {
+			victim = *walker;
+			*walker = victim->pph_next;
+			break;
+		}
+		walker = &((*walker)->pph_next);
+	}
+	rw_exit(&pph_rwlock);
+	if (victim != NULL)
+		kmem_free(victim, sizeof (struct physmem_proc_hash));
+}
+
+/*
+ * Add a new entry to the hash for the given process to cache the
+ * address ranges that it is working on.  If this is the first hash
+ * item to be added for this process, we will create the head pointer
+ * for this process.
+ * Returns 0 on success, ERANGE when the physical address is already in the
+ * hash.  Note that we add it to the hash as we have already called as_map
+ * and thus the as_unmap call will try to free the vnode, which needs
+ * to be found in the hash.
+ */
+int
+physmem_add_hash(struct physmem_hash *php)
+{
+	int index;
+	struct physmem_proc_hash *iterator;
+	struct physmem_proc_hash *newp = NULL;
+	struct physmem_hash *temp;
+	int ret = 0;
+
+	index = PHYSMEM_HASH(curproc);
+
+insert:
+	rw_enter(&pph_rwlock, RW_WRITER);
+	iterator = pph[index];
+	while (iterator != NULL) {
+		if (iterator->pph_proc == curproc) {
+			/*
+			 * check to make sure a single process does not try to
+			 * map the same region twice.
+			 */
+			for (temp = iterator->pph_hash; temp != NULL;
+			    temp = temp->ph_next) {
+				if ((php->ph_base_pa >= temp->ph_base_pa &&
+				    php->ph_base_pa < temp->ph_base_pa +
+				    temp->ph_seg_len) ||
+				    (temp->ph_base_pa >= php->ph_base_pa &&
+				    temp->ph_base_pa < php->ph_base_pa +
+				    php->ph_seg_len)) {
+					ret = ERANGE;
+					break;
+				}
+			}
+			if (ret == 0) {
+				php->ph_next = iterator->pph_hash;
+				iterator->pph_hash = php;
+			}
+			rw_exit(&pph_rwlock);
+			/* Need to check for two threads in sync */
+			if (newp != NULL)
+				kmem_free(newp, sizeof (*newp));
+			return (ret);
+		}
+		iterator = iterator->pph_next;
+	}
+
+	if (newp != NULL) {
+		newp->pph_proc = curproc;
+		newp->pph_next = pph[index];
+		newp->pph_hash = php;
+		php->ph_next = NULL;
+		pph[index] = newp;
+		rw_exit(&pph_rwlock);
+		return (0);
+	}
+
+	rw_exit(&pph_rwlock);
+	/* Dropped the lock so we could use KM_SLEEP */
+	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
+	goto insert;
+}
+
+/*
+ * Will return the pointer to the physmem_hash struct if the setup routine
+ * has previously been called for this memory.
+ * Returns NULL on failure.
+ */
+struct physmem_hash *
+physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
+{
+	int index;
+	struct physmem_proc_hash *proc_hp;
+	struct physmem_hash *php;
+
+	ASSERT(rw_lock_held(&pph_rwlock));
+
+	index = PHYSMEM_HASH(procp);
+	proc_hp = pph[index];
+	while (proc_hp != NULL) {
+		if (proc_hp->pph_proc == procp) {
+			php = proc_hp->pph_hash;
+			while (php != NULL) {
+				if ((req_paddr >= php->ph_base_pa) &&
+				    (req_paddr + len <=
+				    php->ph_base_pa + php->ph_seg_len)) {
+					return (php);
+				}
+				php = php->ph_next;
+			}
+		}
+		proc_hp = proc_hp->pph_next;
+	}
+	return (NULL);
+}
+
+int
+physmem_validate_cookie(uint64_t p_cookie)
+{
+	int index;
+	struct physmem_proc_hash *proc_hp;
+	struct physmem_hash *php;
+
+	ASSERT(rw_lock_held(&pph_rwlock));
+
+	index = PHYSMEM_HASH(curproc);
+	proc_hp = pph[index];
+	while (proc_hp != NULL) {
+		if (proc_hp->pph_proc == curproc) {
+			php = proc_hp->pph_hash;
+			while (php != NULL) {
+				if ((uint64_t)(uintptr_t)php == p_cookie) {
+					return (1);
+				}
+				php = php->ph_next;
+			}
+		}
+		proc_hp = proc_hp->pph_next;
+	}
+	return (0);
+}
+
+/*
+ * Remove the given vnode from the pph hash.  If it exists in the hash the
+ * process still has to be around as the vnode is obviously still around and
+ * since it's a physmem vnode, it must be in the hash.
+ * If it is not in the hash that must mean that the setup ioctl failed.
+ * Return 0 in this instance, 1 if it is in the hash.
+ */
+int
+physmem_remove_vnode_hash(vnode_t *vp)
+{
+	int index;
+	struct physmem_proc_hash *proc_hp;
+	struct physmem_hash **phpp;
+	struct physmem_hash *victim;
+
+	index = PHYSMEM_HASH(curproc);
+	/* synchronize with the map routine */
+	rw_enter(&pph_rwlock, RW_WRITER);
+	proc_hp = pph[index];
+	while (proc_hp != NULL) {
+		if (proc_hp->pph_proc == curproc) {
+			phpp = &proc_hp->pph_hash;
+			while (*phpp != NULL) {
+				if ((*phpp)->ph_vnode == vp) {
+					victim = *phpp;
+					*phpp = victim->ph_next;
+
+					rw_exit(&pph_rwlock);
+					kmem_free(victim, sizeof (*victim));
+					return (1);
+				}
+				phpp = &(*phpp)->ph_next;
+			}
+		}
+		proc_hp = proc_hp->pph_next;
+	}
+	rw_exit(&pph_rwlock);
+
+	/* not found */
+	return (0);
+}
+
+int
+physmem_setup_vnops()
+{
+	int error;
+	char *name = "physmem";
+	if (physmem_vnodeops != NULL)
+		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
+	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
+	if (error != 0) {
+		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
+	}
+	return (error);
+}
+
+/*
+ * The guts of the PHYSMEM_SETUP ioctl.
+ * Create a segment in the address space with the specified parameters.
+ * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
+ * We do not do bounds checking on the requested physical addresses; if they
+ * do not exist in the system, they will not be mappable.
+ * Returns 0 on success with the following error codes on failure:
+ *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
+ *		non-NULL or the system was unable to find enough VA space for
+ *		the desired length if user_va was NULL.
+ *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
+ */
+int
+physmem_setup_addrs(struct physmem_setup_param *pspp)
+{
+	struct as *as = curproc->p_as;
+	struct segvn_crargs vn_a;
+	int ret = 0;
+	uint64_t base_pa;
+	size_t len;
+	caddr_t uvaddr;
+	struct vnode *vp;
+	struct physmem_hash *php;
+
+	ASSERT(pspp != NULL);
+	base_pa = pspp->req_paddr;
+	len = pspp->len;
+	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;
+
+	/* Sanity checking */
+	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
+		return (EINVAL);
+	if (!IS_P2ALIGNED(len, PAGESIZE))
+		return (EINVAL);
+	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
+		return (EINVAL);
+
+	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);
+
+	/* Need to bump vnode count so that the driver can not be unloaded */
+	mutex_enter(&physmem_mutex);
+	physmem_vnodecnt++;
+	mutex_exit(&physmem_mutex);
+
+	vp = vn_alloc(KM_SLEEP);
+	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
+	vn_setops(vp, physmem_vnodeops);
+
+	php->ph_vnode = vp;
+
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)base_pa;
+	vn_a.type = MAP_SHARED;
+	vn_a.prot = PROT_ALL;
+	vn_a.maxprot = PROT_ALL;
+	vn_a.flags = 0;
+	vn_a.cred = NULL;
+	vn_a.amp = NULL;
+	vn_a.szc = 0;
+	vn_a.lgrp_mem_policy_flags = 0;
+
+	as_rangelock(as);
+	if (uvaddr != NULL) {
+		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
+			ret = ENOMEM;
+fail:
+			as_rangeunlock(as);
+			vn_free(vp);
+			kmem_free(php, sizeof (*php));
+			mutex_enter(&physmem_mutex);
+			physmem_vnodecnt--;
+			mutex_exit(&physmem_mutex);
+			return (ret);
+		}
+	} else {
+		/* We pick the address for the user */
+		map_addr(&uvaddr, len, 0, 1, 0);
+		if (uvaddr == NULL) {
+			ret = ENOMEM;
+			goto fail;
+		}
+	}
+	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	if (ret == 0) {
+		php->ph_base_pa = base_pa;
+		php->ph_base_va = uvaddr;
+		php->ph_seg_len = len;
+		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
+		pspp->cookie = (uint64_t)(uintptr_t)php;
+		ret = physmem_add_hash(php);
+		if (ret == 0)
+			return (0);
+		(void) as_unmap(as, uvaddr, len);
+		return (ret);
+	}
+
+	goto fail;
+	/*NOTREACHED*/
+}
+
+/*
+ * The guts of the PHYSMEM_MAP ioctl.
+ * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
+ * been called for this PA range.
+ * Returns 0 on success with the following error codes on failure:
+ *	EPERM - The requested page is long term locked, and thus repeated
+ *		requests to allocate this page will likely fail.
+ *	EAGAIN - The requested page could not be allocated, but it is believed
+ *		that future attempts could succeed.
+ *	ENOMEM - There was not enough free memory in the system to safely
+ *		map the requested page.
+ *	EINVAL - The requested paddr was not PAGESIZE aligned or the
+ *		PHYSMEM_SETUP ioctl was not called for this page.
+ *	ENOENT - The requested page was inside the kernel cage, and the
+ *		PHYSMEM_CAGE flag was not set.
+ *	EBUSY - The requested page is retired and the PHYSMEM_RETIRE flag
+ *		was not set.
+ */
+static int
+physmem_map_addrs(struct physmem_map_param *pmpp)
+{
+	caddr_t uvaddr;
+	page_t *pp;
+	uint64_t req_paddr;
+	struct vnode *vp;
+	int ret = 0;
+	struct physmem_hash *php;
+	uint_t flags = 0;
+
+	ASSERT(pmpp != NULL);
+	req_paddr = pmpp->req_paddr;
+
+	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
+		return (EINVAL);
+	/* Find the vnode for this map request */
+	rw_enter(&pph_rwlock, RW_READER);
+	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
+	if (php == NULL) {
+		rw_exit(&pph_rwlock);
+		return (EINVAL);
+	}
+	vp = php->ph_vnode;
+	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
+	rw_exit(&pph_rwlock);
+
+	pp = page_numtopp_nolock(btop((size_t)req_paddr));
+	if (pp == NULL) {
+		pmpp->ret_va = NULL;
+		return (EPERM);
+	}
+
+	/*
+	 * Check to see if page already mapped correctly.  This can happen
+	 * when we failed to capture a page previously and it was captured
+	 * asynchronously for us.  Return success in this case.
+	 */
+	if (pp->p_vnode == vp) {
+		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
+		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
+		return (0);
+	}
+
+	/*
+	 * physmem should be responsible for checking for cage
+	 * and prom pages.
+	 */
+	if (pmpp->flags & PHYSMEM_CAGE)
+		flags = CAPTURE_GET_CAGE;
+	if (pmpp->flags & PHYSMEM_RETIRED)
+		flags |= CAPTURE_GET_RETIRED;
+
+	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);
+
+	if (ret != 0) {
+		pmpp->ret_va = NULL;
+		return (ret);
+	} else {
+		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
+		return (0);
+	}
+}
+
+/*
+ * Map the given page into the process's address space if possible.
+ * We actually only hash the page in on the correct vnode as the page
+ * will be mapped via segvn_pagefault.
+ * returns 0 on success
+ * returns 1 if there is no need to map this page anymore (process exited)
+ * returns -1 if we failed to map the page.
+ */
+int
+map_page_proc(page_t *pp, void *arg, uint_t flags)
+{
+	struct vnode *vp;
+	proc_t *procp = (proc_t *)arg;
+	int ret;
+	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
+	struct physmem_hash *php;
+
+	ASSERT(pp != NULL);
+
+	/*
+	 * Check against availrmem to make sure that we're not low on memory.
+	 * We check again here as ASYNC requests do not do this check elsewhere.
+	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
+	 * set or be on the page capture hash.
+	 */
+	if (swapfs_minfree > availrmem + 1) {
+		page_free(pp, 1);
+		return (1);
+	}
+
+	/*
+	 * If this is an asynchronous request for the current process,
+	 * we can not map the page as it's possible that we are also in the
+	 * process of unmapping the page which could result in a deadlock
+	 * with the as lock.
+	 */
+	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
+		page_free(pp, 1);
+		return (-1);
+	}
+
+	/* only return zeroed out pages */
+	pagezero(pp, 0, PAGESIZE);
+
+	rw_enter(&pph_rwlock, RW_READER);
+	php = physmem_get_hash(paddr, PAGESIZE, procp);
+	if (php == NULL) {
+		rw_exit(&pph_rwlock);
+		/*
+		 * Free the page as there is no longer a valid outstanding
+		 * request for this page.
+		 */
+		page_free(pp, 1);
+		return (1);
+	}
+
+	vp = php->ph_vnode;
+
+	/*
+	 * We need to protect against a possible deadlock here where we own
+	 * the vnode page hash mutex and want to acquire it again as there
+	 * are locations in the code, where we unlock a page while holding
+	 * the mutex which can lead to the page being captured and eventually
+	 * end up here.
+	 */
+	if (mutex_owned(page_vnode_mutex(vp))) {
+		rw_exit(&pph_rwlock);
+		page_free(pp, 1);
+		return (-1);
+	}
+
+	ret = page_hashin(pp, vp, paddr, NULL);
+	rw_exit(&pph_rwlock);
+	if (ret == 0) {
+		page_free(pp, 1);
+		return (-1);
+	}
+
+	page_downgrade(pp);
+
+	mutex_enter(&freemem_lock);
+	availrmem--;
+	mutex_exit(&freemem_lock);
+
+	return (0);
+}
+
+/*
+ * The guts of the PHYSMEM_DESTROY ioctl.
+ * The cookie passed in will provide all of the information needed to
+ * free up the address space and physical memory associated with the
+ * corresponding PHYSMEM_SETUP ioctl.
+ * Returns 0 on success with the following error codes on failure:
+ *	EINVAL - The cookie supplied is not valid.
+ */
+int
+physmem_destroy_addrs(uint64_t p_cookie)
+{
+	struct as *as = curproc->p_as;
+	size_t len;
+	caddr_t uvaddr;
+
+	rw_enter(&pph_rwlock, RW_READER);
+	if (physmem_validate_cookie(p_cookie) == 0) {
+		rw_exit(&pph_rwlock);
+		return (EINVAL);
+	}
+
+	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
+	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
+	rw_exit(&pph_rwlock);
+
+	(void) as_unmap(as, uvaddr, len);
+
+	return (0);
+}
+
+/*
+ * If the page has been hashed into the physmem vnode, then just look it up
+ * and return it via pl, otherwise return ENOMEM as the map ioctl has not
+ * succeeded on the given page.
+ */
+/*ARGSUSED*/
+static int
+physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
+    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+    struct cred *cr)
+{
+	page_t *pp;
+
+	ASSERT(len == PAGESIZE);
+	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
+
+	/*
+	 * If the page is in the hash, then we successfully claimed this
+	 * page earlier, so return it to the caller.
+	 */
+	pp = page_lookup(vp, off, SE_SHARED);
+	if (pp != NULL) {
+		pl[0] = pp;
+		pl[1] = NULL;
+		*protp = PROT_ALL;
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * We can not allow a process mapping /dev/physmem pages to fork as there can
+ * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
+ * return of EINVAL when we are not working on our own address space.
+ * Otherwise we return zero as this function is required for normal operation.
+ */
+/*ARGSUSED*/
+static int
+physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
+    struct cred *cred)
+{
+	if (curproc->p_as != as) {
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/* Will always get called for removing a whole segment. */
+/*ARGSUSED*/
+static int
+physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
+    struct cred *cred)
+{
+	/*
+	 * Release our hold on the vnode so that the final VN_RELE will
+	 * call physmem_inactive to clean things up.
+	 */
+	VN_RELE(vp);
+
+	return (0);
+}
+
+/*
+ * Clean up all the pages belonging to this vnode and then free it.
+ */
+/*ARGSUSED*/
+static void
+physmem_inactive(vnode_t *vp, cred_t *crp)
+{
+	page_t *pp;
+
+	/*
+	 * Remove the vnode from the hash now, to prevent asynchronous
+	 * attempts to map into this vnode.  This avoids a deadlock
+	 * where two threads try to get into this logic at the same
+	 * time and try to map the pages they are destroying into the
+	 * other's address space.
+	 * If it's not in the hash, just free it.
+	 */
+	if (physmem_remove_vnode_hash(vp) == 0) {
+		ASSERT(vp->v_pages == NULL);
+		vn_free(vp);
+		physmem_remove_hash_proc();
+		mutex_enter(&physmem_mutex);
+		physmem_vnodecnt--;
+		mutex_exit(&physmem_mutex);
+		return;
+	}
+
+	/*
+	 * At this point in time, no other logic can be adding or removing
+	 * pages from the vnode, otherwise the v_pages list could be inaccurate.
+	 */
+
+	while ((pp = vp->v_pages) != NULL) {
+		page_t *rpp;
+		if (page_tryupgrade(pp)) {
+			/*
+			 * set lckcnt for page_destroy to do availrmem
+			 * accounting
+			 */
+			pp->p_lckcnt = 1;
+			page_destroy(pp, 0);
+		} else {
+			/* failure to lock should be transient */
+			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
+			if (rpp != pp) {
+				page_unlock(rpp);
+				continue;
+			}
+			page_unlock(pp);
+		}
+	}
+	vn_free(vp);
+	physmem_remove_hash_proc();
+	mutex_enter(&physmem_mutex);
+	physmem_vnodecnt--;
+	mutex_exit(&physmem_mutex);
+}
+
+/*ARGSUSED*/
+static int
+physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	int ret;
+
+	switch (cmd) {
+	case PHYSMEM_SETUP:
+		{
+			struct physmem_setup_param psp;
+			if (ddi_copyin((void *)arg, &psp,
+			    sizeof (struct physmem_setup_param), 0))
+				return (EFAULT);
+			ret = physmem_setup_addrs(&psp);
+			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
+				return (EFAULT);
+		}
+		break;
+	case PHYSMEM_MAP:
+		{
+			struct physmem_map_param pmp;
+			if (ddi_copyin((void *)arg, &pmp,
+			    sizeof (struct physmem_map_param), 0))
+				return (EFAULT);
+			ret = physmem_map_addrs(&pmp);
+			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
+				return (EFAULT);
+		}
+		break;
+	case PHYSMEM_DESTROY:
+		{
+			uint64_t cookie;
+			if (ddi_copyin((void *)arg, &cookie,
+			    sizeof (uint64_t), 0))
+				return (EFAULT);
+			ret = physmem_destroy_addrs(cookie);
+		}
+		break;
+	default:
+		return (ENOTSUP);
+	}
+	return (ret);
+}
+
+/*ARGSUSED*/
+static int
+physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	int ret;
+	static int msg_printed = 0;
+
+	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
+		return (EINVAL);
+	}
+
+	/* need to make sure we have the right privileges */
+	if ((ret = secpolicy_resource(credp)) != 0)
+		return (ret);
+	if ((ret = secpolicy_lock_memory(credp)) != 0)
+		return (ret);
+
+	if (msg_printed == 0) {
+		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
+		    "take out long term locks on pages which may impact "
+		    "dynamic reconfiguration events");
+		msg_printed = 1;
+	}
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
+    void *arg, void **resultp)
+{
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*resultp = physmem_dip;
+		return (DDI_SUCCESS);
+
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
+		return (DDI_SUCCESS);
+
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+static int
+physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int i;
+
+	if (cmd == DDI_RESUME) {
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
+	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	physmem_dip = dip;
+
+	/* Initialize driver specific data */
+	if (physmem_setup_vnops()) {
+		ddi_remove_minor_node(dip, ddi_get_name(dip));
+		return (DDI_FAILURE);
+	}
+
+	for (i = 0; i < PPH_SIZE; i++)
+		pph[i] = NULL;
+
+	page_capture_register_callback(PC_PHYSMEM, 10000,
+	    map_page_proc);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int ret = DDI_SUCCESS;
+
+	if (cmd == DDI_SUSPEND) {
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ASSERT(physmem_dip == dip);
+
+	mutex_enter(&physmem_mutex);
+	if (physmem_vnodecnt == 0) {
+		if (physmem_vnodeops != NULL) {
+			vn_freevnodeops(physmem_vnodeops);
+			physmem_vnodeops = NULL;
+			page_capture_unregister_callback(PC_PHYSMEM);
+		}
+	} else {
+		ret = EBUSY;
+	}
+	mutex_exit(&physmem_mutex);
+	if (ret == DDI_SUCCESS)
+		ddi_remove_minor_node(dip, ddi_get_name(dip));
+	return (ret);
+}
+
+static struct cb_ops physmem_cb_ops = {
+	physmem_open,	/* open */
+	physmem_close,	/* close */
+	nodev,		/* strategy */
+	nodev,		/* print */
+	nodev,		/* dump */
+	nodev,		/* read */
+	nodev,		/* write */
+	physmem_ioctl,	/* ioctl */
+	nodev,		/* devmap */
+	nodev,		/* mmap */
+	nodev,		/* segmap */
+	nochpoll,	/* chpoll */
+	ddi_prop_op,	/* prop_op */
+	NULL,		/* cb_str */
+	D_NEW | D_MP | D_DEVMAP,
+	CB_REV,
+	NULL,
+	NULL
+};
+
+static struct dev_ops physmem_ops = {
+	DEVO_REV,
+	0,
+	physmem_getinfo,
+	nulldev,
+	nulldev,
+	physmem_attach,
+	physmem_detach,
+	nodev,
+	&physmem_cb_ops,
+	NULL,
+	NULL
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"physmem driver %I%",
+	&physmem_ops
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,
+	NULL
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/physmem.conf	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+name="physmem" parent="pseudo" instance=0;
--- a/usr/src/uts/common/os/mem_cage.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/os/mem_cage.c	Thu Dec 14 17:27:13 2006 -0800
@@ -262,6 +262,11 @@
 #define	KCAGEPAGETS_INC()
 #endif
 
+/* kstats to export what pages are currently caged */
+kmutex_t kcage_kstat_lock;
+static int kcage_kstat_update(kstat_t *ksp, int rw);
+static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
+
 /*
  * Startup and Dynamic Reconfiguration interfaces.
  * kcage_range_lock()
@@ -873,6 +878,8 @@
 	pgcnt_t wanted;
 	pfn_t pfn;
 	page_t *pp;
+	kstat_t *ksp;
+
 	extern struct vnode kvp;
 	extern void page_list_noreloc_startup(page_t *);
 
@@ -981,6 +988,83 @@
 			page_freelist_coalesce_all(mnode);
 		}
 	}
+
+	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
+	if (ksp != NULL) {
+		ksp->ks_update = kcage_kstat_update;
+		ksp->ks_snapshot = kcage_kstat_snapshot;
+		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
+		kstat_install(ksp);
+	}
+
+}
+
+static int
+kcage_kstat_update(kstat_t *ksp, int rw)
+{
+	struct kcage_glist *lp;
+	uint_t count;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	count = 0;
+	kcage_range_lock();
+	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
+		if (lp->decr) {
+			if (lp->curr != lp->lim) {
+				count++;
+			}
+		} else {
+			if (lp->curr != lp->base) {
+				count++;
+			}
+		}
+	}
+	kcage_range_unlock();
+
+	ksp->ks_ndata = count;
+	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
+
+	return (0);
+}
+
+static int
+kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
+{
+	struct kcage_glist *lp;
+	struct memunit {
+		uint64_t address;
+		uint64_t size;
+	} *kspmem;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	ksp->ks_snaptime = gethrtime();
+
+	kspmem = (struct memunit *)buf;
+	kcage_range_lock();
+	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
+		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
+			break;
+
+		if (lp->decr) {
+			if (lp->curr != lp->lim) {
+				kspmem->address = ptob(lp->curr);
+				kspmem->size = ptob(lp->lim - lp->curr);
+			}
+		} else {
+			if (lp->curr != lp->base) {
+				kspmem->address = ptob(lp->base);
+				kspmem->size = ptob(lp->curr - lp->base);
+			}
+		}
+	}
+	kcage_range_unlock();
+
+	return (0);
 }
 
 void
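
The new kcage_page_list kstat is a raw, variable-size kstat whose snapshot is an array of {address, size} pairs, one per caged span, in bytes. The following is a minimal userland sketch (not part of this changeset) of reading it with libkstat; it assumes the record layout used by kcage_kstat_snapshot() above and would be linked with -lkstat.

#include <sys/types.h>
#include <kstat.h>
#include <stdio.h>

/* Record layout used by kcage_kstat_snapshot() above. */
struct memunit {
	uint64_t address;
	uint64_t size;
};

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	struct memunit *mu;
	uint_t i;

	if ((kc = kstat_open()) == NULL)
		return (1);
	if ((ksp = kstat_lookup(kc, "kcage", 0, "kcage_page_list")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}
	mu = ksp->ks_data;
	for (i = 0; i < ksp->ks_ndata; i++)
		(void) printf("0x%llx 0x%llx\n",
		    (u_longlong_t)mu[i].address, (u_longlong_t)mu[i].size);
	(void) kstat_close(kc);
	return (0);
}

The same pattern applies to the "unix:0:page_retire_list" kstat added in page_retire.c below, which exports retired pages with the same {address, size} record layout.
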
--- a/usr/src/uts/common/os/mem_config.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/os/mem_config.c	Thu Dec 14 17:27:13 2006 -0800
@@ -2250,7 +2250,8 @@
 					 */
 					mhp->mh_hold_todo++;
 				} else {
-					(void) page_unretire_pp(pp, 0);
+					(void) page_unretire_pp(pp,
+					    PR_UNR_CLEAN);
 				}
 			}
 		}
--- a/usr/src/uts/common/sys/Makefile	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/sys/Makefile	Thu Dec 14 17:27:13 2006 -0800
@@ -370,6 +370,7 @@
 	pctypes.h		\
 	pem.h			\
 	pfmod.h			\
+	physmem.h		\
 	pm.h			\
 	policy.h		\
 	poll.h			\
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/physmem.h	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#ifndef	_PHYSMEM_H
+#define	_PHYSMEM_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* ioctl values */
+#define	PHYSMEM_SETUP 1
+#define	PHYSMEM_MAP 2
+#define	PHYSMEM_DESTROY 3
+
+/* flags values */
+#define	PHYSMEM_CAGE	(1 << 0)
+#define	PHYSMEM_RETIRED	(1 << 1)
+
+struct physmem_setup_param {
+	uint64_t req_paddr;	/* requested physical address */
+	uint64_t len;		/* length of memory to be allocated */
+	uint64_t user_va;	/* VA to associate with req_paddr */
+	uint64_t cookie;	/* cookie returned for destroy function */
+};
+
+struct physmem_map_param {
+	uint64_t req_paddr;	/* requested physical address */
+	uint64_t ret_va;	/* VA which mapped req_paddr */
+	uint32_t flags;		/* flags for cage or retired pages */
+};
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _PHYSMEM_H */
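
For reference, the intended userland flow against the three ioctls above is setup, map, touch, destroy. The sketch below is illustrative only and not part of this changeset: the physical address is a placeholder, error handling is minimal, and the calling process needs the resource and lock-memory privileges that physmem_open() checks.

#include <sys/types.h>
#include <sys/physmem.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>
#include <string.h>

int
main(void)
{
	struct physmem_setup_param psp;
	struct physmem_map_param pmp;
	uint64_t paddr = 0x10000000ULL;		/* placeholder PA to test */
	long pgsz = sysconf(_SC_PAGESIZE);
	int fd;

	if ((fd = open("/dev/physmem", O_RDWR)) == -1)
		return (1);

	/* Reserve a VA range backed by a physmem vnode for this PA range. */
	(void) memset(&psp, 0, sizeof (psp));
	psp.req_paddr = paddr;
	psp.len = (uint64_t)pgsz;
	psp.user_va = 0;			/* let the driver pick the VA */
	if (ioctl(fd, PHYSMEM_SETUP, &psp) != 0)
		return (1);

	/* Capture the page and associate it with the VA chosen above. */
	(void) memset(&pmp, 0, sizeof (pmp));
	pmp.req_paddr = paddr;
	pmp.flags = 0;			/* or PHYSMEM_CAGE / PHYSMEM_RETIRED */
	if (ioctl(fd, PHYSMEM_MAP, &pmp) == 0)
		*(volatile char *)(uintptr_t)pmp.ret_va = 0x5a;

	/* Tear down the segment; the captured page is released on unmap. */
	(void) ioctl(fd, PHYSMEM_DESTROY, &psp.cookie);
	(void) close(fd);
	return (0);
}
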
--- a/usr/src/uts/common/sys/thread.h	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/sys/thread.h	Thu Dec 14 17:27:13 2006 -0800
@@ -359,6 +359,7 @@
 #define	T_WATCHPT	0x0400	/* thread undergoing a watchpoint emulation */
 #define	T_PANIC		0x0800	/* thread initiated a system panic */
 #define	T_DFLTSTK	0x1000	/* stack is default size */
+#define	T_CAPTURING	0x2000	/* thread is in page capture logic */
 
 /*
  * Flags in t_proc_flag.
--- a/usr/src/uts/common/vm/page.h	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/page.h	Thu Dec 14 17:27:13 2006 -0800
@@ -667,6 +667,7 @@
 void	page_free_pages(page_t *);
 void	free_vp_pages(struct vnode *, u_offset_t, size_t);
 int	page_reclaim(page_t *, kmutex_t *);
+int	page_reclaim_pages(page_t *, kmutex_t *, uint_t);
 void	page_destroy(page_t *, int);
 void	page_destroy_pages(page_t *);
 void	page_destroy_free(page_t *);
@@ -702,8 +703,9 @@
 int	page_tryupgrade(page_t *);
 void	page_downgrade(page_t *);
 void	page_unlock(page_t *);
-void	page_unlock_noretire(page_t *);
+void	page_unlock_nocapture(page_t *);
 void	page_lock_delete(page_t *);
+int	page_deleted(page_t *);
 int	page_pp_lock(page_t *, int, int);
 void	page_pp_unlock(page_t *, int, int);
 int	page_resv(pgcnt_t, uint_t);
@@ -725,7 +727,7 @@
 page_t	*page_next_scan_init(void **);
 page_t	*page_next_scan_large(page_t *, ulong_t *, void **);
 void    prefetch_page_r(void *);
-void	ppcopy(page_t *, page_t *);
+int	ppcopy(page_t *, page_t *);
 void	page_relocate_hash(page_t *, page_t *);
 void	pagezero(page_t *, uint_t, uint_t);
 void	pagescrub(page_t *, uint_t, uint_t);
@@ -750,8 +752,7 @@
 int	page_unretire(uint64_t);
 int	page_unretire_pp(page_t *, int);
 void	page_tryretire(page_t *);
-void	page_retire_hunt(void (*)(page_t *));
-void	page_retire_mdboot_cb(page_t *);
+void	page_retire_mdboot();
 void	page_clrtoxic(page_t *, uchar_t);
 void	page_settoxic(page_t *, uchar_t);
 
@@ -910,6 +911,15 @@
  *
  * Note that, while p_toxic bits can be set without holding any locks, they
  * should only be cleared while holding the page exclusively locked.
+ * There is one exception to this, the PR_CAPTURE bit is protected by a mutex
+ * within the page capture logic and thus to set or clear the bit, that mutex
+ * needs to be held.  The page does not need to be locked but the page_clrtoxic
+ * function must be used as we need an atomic operation.
+ * Also note that there is what amounts to a hack to prevent recursion with
+ * large pages such that if we are unlocking a page and the PR_CAPTURE bit is
+ * set, we will only try to capture the page if the current thread's T_CAPTURING
+ * flag is not set.  If the flag is set, the unlock will not try to capture
+ * the page even though the PR_CAPTURE bit is set.
  *
  * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
  * with PR_MCE are retired if the system has not retired too many of them.
@@ -931,15 +941,15 @@
 #define	PR_UE		0x02	/* page has an unhandled UE */
 #define	PR_UE_SCRUBBED	0x04	/* page has seen a UE but was cleaned */
 #define	PR_FMA		0x08	/* A DE wants this page retired */
-#define	PR_RESV		0x10	/* Reserved for future use */
-#define	PR_BUSY		0x20	/* Page retire is in progress */
+#define	PR_CAPTURE	0x10	/* Generic page capture flag */
+#define	PR_RESV		0x20	/* Reserved for future use */
 #define	PR_MSG		0x40	/* message(s) already printed for this page */
 #define	PR_RETIRED	0x80	/* This page has been retired */
 
 #define	PR_REASONS	(PR_UE | PR_MCE | PR_FMA)
 #define	PR_TOXIC	(PR_UE)
 #define	PR_ERRMASK	(PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA)
-#define	PR_ALLFLAGS	(0xFF)
+#define	PR_TOXICFLAGS	(0xCF)
 
 #define	PP_RETIRED(pp)	((pp)->p_toxic & PR_RETIRED)
 #define	PP_TOXIC(pp)	((pp)->p_toxic & PR_TOXIC)
@@ -949,6 +959,13 @@
 	!PP_ISKVP(pp))
 
 /*
+ * Flags for page_unretire_pp
+ */
+#define	PR_UNR_FREE	0x1
+#define	PR_UNR_CLEAN	0x2
+#define	PR_UNR_TEMP	0x4
+
+/*
  * kpm large page description.
  * The virtual address range of segkpm is divided into chunks of
  * kpm_pgsz. Each chunk is controlled by a kpm_page_t. The ushort
@@ -1064,6 +1081,57 @@
 void build_pfn_hash();
 extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum);
 
+/*
+ * page capture related info:
+ * The page capture routines allow us to asynchronously capture given pages
+ * for the explicit use of the requestor.  New requestors can be added by
+ * explicitly adding themselves to the PC_* flags below and incrementing
+ * PC_NUM_CALLBACKS as necessary.
+ *
+ * Subsystems using page capture must register a callback before attempting
+ * to capture a page.  A duration of -1 will indicate that we will never give
+ * up while trying to capture a page and will only stop trying to capture the
+ * given page once we have successfully captured it.  Thus the user needs to be
+ * aware of the behavior of all callers who have a duration of -1.
+ *
+ * For now, only /dev/physmem and page retire use the page capture interface
+ * and only a single request can be outstanding for a given page.  Thus, if
+ * /dev/physmem wants a page and page retire also wants the same page, only
+ * the page retire request will be honored until the point in time that the
+ * page is actually retired, at which point in time, subsequent requests by
+ * /dev/physmem will succeed if the CAPTURE_GET_RETIRED flag was set.
+ */
+
+#define	PC_RETIRE		(0)
+#define	PC_PHYSMEM		(1)
+#define	PC_NUM_CALLBACKS	(2)
+#define	PC_MASK			((1 << PC_NUM_CALLBACKS) - 1)
+
+#define	CAPTURE_RETIRE		(1 << PC_RETIRE)
+#define	CAPTURE_PHYSMEM		(1 << PC_PHYSMEM)
+
+#define	CAPTURE_ASYNC		(0x0200)
+
+#define	CAPTURE_GET_RETIRED	(0x1000)
+#define	CAPTURE_GET_CAGE	(0x2000)
+
+struct page_capture_callback {
+	int cb_active;		/* 1 means active, 0 means inactive */
+	clock_t duration;	/* the length in time that we'll attempt to */
+				/* capture this page asynchronously. (in HZ) */
+	krwlock_t cb_rwlock;
+	int (*cb_func)(page_t *, void *, uint_t); /* callback function */
+};
+
+extern kcondvar_t pc_cv;
+
+void page_capture_register_callback(uint_t index, clock_t duration,
+    int (*cb_func)(page_t *, void *, uint_t));
+void page_capture_unregister_callback(uint_t index);
+int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
+void page_unlock_capture(page_t *pp);
+int page_capture_unretire_pp(page_t *);
+
 #ifdef	__cplusplus
 }
 #endif
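
To make the registration contract described above concrete, here is a hedged kernel-side sketch of a capture client, modeled on the two existing users (page retire and /dev/physmem). PC_EXAMPLE and CAPTURE_EXAMPLE are placeholders only; a real client defines its own index ahead of PC_NUM_CALLBACKS in page.h.

#include <vm/page.h>

/* Placeholder index for illustration; a real client adds its own PC_* slot. */
#define	PC_EXAMPLE	PC_PHYSMEM
#define	CAPTURE_EXAMPLE	(1 << PC_EXAMPLE)

/*ARGSUSED*/
static int
example_capture_cb(page_t *pp, void *datap, uint_t flags)
{
	/*
	 * pp arrives cleaned and exclusively locked.  A real callback takes
	 * ownership here, e.g. by hashing it onto a private vnode the way
	 * the physmem driver does, or by retiring it as page retire does,
	 * and returns 0 once the page has been claimed.
	 */
	return (0);
}

void
example_capture_init(void)
{
	/* Retry asynchronously for up to 10000 ticks, as the physmem driver does. */
	page_capture_register_callback(PC_EXAMPLE, 10000, example_capture_cb);
}

int
example_capture_page(page_t *pp)
{
	/* szc 0 means a single page; the last argument reaches the callback. */
	return (page_trycapture(pp, 0, CAPTURE_EXAMPLE, NULL));
}
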
--- a/usr/src/uts/common/vm/page_lock.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/page_lock.c	Thu Dec 14 17:27:13 2006 -0800
@@ -585,7 +585,7 @@
  * freelist manager; please don't call it.
  */
 void
-page_unlock_noretire(page_t *pp)
+page_unlock_nocapture(page_t *pp)
 {
 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
 	selock_t old;
@@ -598,7 +598,7 @@
 		if (CV_HAS_WAITERS(&pp->p_cv))
 			cv_broadcast(&pp->p_cv);
 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
-		panic("page_unlock_noretire: page %p is deleted", pp);
+		panic("page_unlock_nocapture: page %p is deleted", pp);
 	} else if (old < 0) {
 		THREAD_KPRI_RELEASE();
 		pp->p_selock &= SE_EWANTED;
@@ -607,7 +607,7 @@
 	} else if ((old & ~SE_EWANTED) > SE_READER) {
 		pp->p_selock = old - SE_READER;
 	} else {
-		panic("page_unlock_noretire: page %p is not locked", pp);
+		panic("page_unlock_nocapture: page %p is not locked", pp);
 	}
 
 	mutex_exit(pse);
@@ -643,23 +643,21 @@
 		panic("page_unlock: page %p is not locked", pp);
 	}
 
-	if (pp->p_selock == 0 && PP_PR_REQ(pp)) {
+	if (pp->p_selock == 0) {
 		/*
-		 * Try to retire the page. If it retires, great.
-		 * If not, oh well, we'll get it in the next unlock
-		 * request, and repeat the cycle.  Regardless,
-		 * page_tryretire() will drop the page lock.
+		 * If the T_CAPTURING bit is set, that means that we should
+		 * not try and capture the page again as we could recurse
+		 * which could lead to a stack overflow panic or spending a
+		 * relatively long time in the kernel making no progress.
 		 */
-		if ((pp->p_toxic & PR_BUSY) == 0) {
+		if ((pp->p_toxic & PR_CAPTURE) &&
+		    !(curthread->t_flag & T_CAPTURING) &&
+		    !PP_RETIRED(pp)) {
 			THREAD_KPRI_REQUEST();
 			pp->p_selock = SE_WRITER;
-			page_settoxic(pp, PR_BUSY);
 			mutex_exit(pse);
-			page_tryretire(pp);
+			page_unlock_capture(pp);
 		} else {
-			pp->p_selock = SE_WRITER;
-			page_clrtoxic(pp, PR_BUSY);
-			pp->p_selock = 0;
 			mutex_exit(pse);
 		}
 	} else {
@@ -736,6 +734,12 @@
 	mutex_exit(pse);
 }
 
+int
+page_deleted(page_t *pp)
+{
+	return (pp->p_selock == SE_DELETED);
+}
+
 /*
  * Implement the io lock for pages
  */
--- a/usr/src/uts/common/vm/page_retire.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/page_retire.c	Thu Dec 14 17:27:13 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -85,28 +84,24 @@
 /*
  * Things to fix:
  *
- * 	1. Cleanup SE_EWANTED.  Since we're aggressive about trying to retire
- *	pages, we can use page_retire_pp() to replace SE_EWANTED and all
- *	the special delete_memory_thread() code just goes away.
- *
- * 	2. Trying to retire non-relocatable kvp pages may result in a
+ * 	1. Trying to retire non-relocatable kvp pages may result in a
  *      quagmire. This is because seg_kmem() no longer keeps its pages locked,
  *      and calls page_lookup() in the free path; since kvp pages are modified
  *      and don't have a usable backing store, page_retire() can't do anything
  *      with them, and we'll keep denying the lock to seg_kmem_free() in a
  *      vicious cycle. To prevent that, we don't deny locks to kvp pages, and
- *      hence only call page_retire_pp() from page_unlock() in the free path.
+ *      hence only try to retire a page from page_unlock() in the free path.
  *      Since most kernel pages are indefinitely held anyway, and don't
  *      participate in I/O, this is of little consequence.
  *
- *      3. Low memory situations will be interesting. If we don't have
+ *      2. Low memory situations will be interesting. If we don't have
  *      enough memory for page_relocate() to succeed, we won't be able to
  *      retire dirty pages; nobody will be able to push them out to disk
  *      either, since we aggressively deny the page lock. We could change
  *      fsflush so it can recognize this situation, grab the lock, and push
  *      the page out, where we'll catch it in the free path and retire it.
  *
- *	4. Beware of places that have code like this in them:
+ *	3. Beware of places that have code like this in them:
  *
  *		if (! page_tryupgrade(pp)) {
  *			page_unlock(pp);
@@ -125,7 +120,7 @@
  *	page, and then unlock the page. Page_free() will then go castors
  *	up. So if anybody is doing this, it's already a bug.
  *
- *      5. mdboot()'s call into page_retire_hunt() should probably be
+ *      4. mdboot()'s call into page_retire_mdboot() should probably be
  *      moved lower. Where the call is made now, we can get into trouble
  *      by scrubbing a kernel page that is then accessed later.
  */
@@ -154,18 +149,7 @@
  */
 vnode_t *retired_pages;
 
-/*
- * Background thread that wakes up periodically to try to retire pending
- * pages. This prevents threads from becoming blocked indefinitely in
- * page_lookup() or some other routine should the page(s) they are waiting
- * on become eligible for social security.
- */
-static void page_retire_thread(void);
-static kthread_t *pr_thread_id;
-static kcondvar_t pr_cv;
-static kmutex_t pr_thread_mutex;
-static clock_t pr_thread_shortwait;
-static clock_t pr_thread_longwait;
+static int page_retire_pp_finish(page_t *, void *, uint_t);
 
 /*
  * Make a list of all of the pages that have been marked for retirement
@@ -243,6 +227,13 @@
 #define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)
 
 /*
+ * page retire kstats to list all retired pages
+ */
+static int pr_list_kstat_update(kstat_t *ksp, int rw);
+static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
+kmutex_t pr_list_kstat_mutex;
+
+/*
  * Limit the number of multiple CE page retires.
  * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
  * basis points, where 100 basis points equals one percent.
@@ -473,11 +464,13 @@
  * Note that multiple bits may cleared in a single clrtoxic operation.
  * Must be called with the page exclusively locked to prevent races which
  * may attempt to retire a page without any toxic bits set.
+ * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
+ * being held as there is a separate mutex which protects that bit.
  */
 void
 page_clrtoxic(page_t *pp, uchar_t bits)
 {
-	ASSERT(PAGE_EXCL(pp));
+	ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
 	atomic_and_8(&pp->p_toxic, ~bits);
 }
 
@@ -523,82 +516,6 @@
 }
 
 /*
- * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages
- * that we were not able to retire. On large machines, walking the complete
- * page_t array and looking at every page_t takes too long. So, as a page is
- * marked toxic, we track it using a list that can be processed at reboot
- * time.  page_retire_enqueue() will do its best to try to avoid duplicate
- * entries, but if we get too many errors at once the queue can overflow,
- * in which case we will end up walking every page_t as a last resort.
- * The background thread also makes use of this queue to find which pages
- * are pending retirement.
- */
-static void
-page_retire_enqueue(page_t *pp)
-{
-	int	nslot = -1;
-	int	i;
-
-	mutex_enter(&pr_q_mutex);
-
-	/*
-	 * Check to make sure retire hasn't already dequeued it.
-	 * In the meantime if the page was cleaned up, no need
-	 * to enqueue it.
-	 */
-	if (PP_RETIRED(pp) || pp->p_toxic == 0) {
-		mutex_exit(&pr_q_mutex);
-		PR_DEBUG(prd_noaction);
-		return;
-	}
-
-	for (i = 0; i < PR_PENDING_QMAX; i++) {
-		if (pr_pending_q[i] == pp) {
-			mutex_exit(&pr_q_mutex);
-			PR_DEBUG(prd_qdup);
-			return;
-		} else if (nslot == -1 && pr_pending_q[i] == NULL) {
-			nslot = i;
-		}
-	}
-
-	PR_INCR_KSTAT(pr_pending);
-
-	if (nslot != -1) {
-		pr_pending_q[nslot] = pp;
-		PR_DEBUG(prd_queued);
-	} else {
-		PR_INCR_KSTAT(pr_enqueue_fail);
-		PR_DEBUG(prd_notqueued);
-	}
-	mutex_exit(&pr_q_mutex);
-}
-
-static void
-page_retire_dequeue(page_t *pp)
-{
-	int i;
-
-	mutex_enter(&pr_q_mutex);
-
-	for (i = 0; i < PR_PENDING_QMAX; i++) {
-		if (pr_pending_q[i] == pp) {
-			pr_pending_q[i] = NULL;
-			break;
-		}
-	}
-
-	if (i == PR_PENDING_QMAX) {
-		PR_INCR_KSTAT(pr_dequeue_fail);
-	}
-
-	PR_DECR_KSTAT(pr_pending);
-	PR_DEBUG(prd_dequeue);
-
-	mutex_exit(&pr_q_mutex);
-}
-
-/*
  * Act like page_destroy(), but instead of freeing the page, hash it onto
  * the retired_pages vnode, and mark it retired.
  *
@@ -626,8 +543,6 @@
 	}
 
 	page_settoxic(pp, PR_RETIRED);
-	page_clrtoxic(pp, PR_BUSY);
-	page_retire_dequeue(pp);
 	PR_INCR_KSTAT(pr_retired);
 
 	if (pp->p_toxic & PR_FMA) {
@@ -784,8 +699,7 @@
 		} else {
 			PR_INCR_KSTAT(pr_ue_cleared_free);
 
-			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
-			page_retire_dequeue(pp);
+			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);
 
 			/* LINTED: CONSTCOND */
 			VN_DISPOSE(pp, B_FREE, 1, kcred);
@@ -825,6 +739,83 @@
 	/*NOTREACHED*/
 }
 
+static int
+pr_list_kstat_update(kstat_t *ksp, int rw)
+{
+	uint_t count;
+	page_t *pp;
+	kmutex_t *vphm;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	vphm = page_vnode_mutex(retired_pages);
+	mutex_enter(vphm);
+	/* Needs to be under a lock so that for loop will work right */
+	if (retired_pages->v_pages == NULL) {
+		mutex_exit(vphm);
+		ksp->ks_ndata = 0;
+		ksp->ks_data_size = 0;
+		return (0);
+	}
+
+	count = 1;
+	for (pp = retired_pages->v_pages->p_vpnext;
+	    pp != retired_pages->v_pages; pp = pp->p_vpnext) {
+		count++;
+	}
+	mutex_exit(vphm);
+
+	ksp->ks_ndata = count;
+	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
+
+	return (0);
+}
+
+/*
+ * all spans will be pagesize and no coalescing will be done with the
+ * list produced.
+ */
+static int
+pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
+{
+	kmutex_t *vphm;
+	page_t *pp;
+	struct memunit {
+		uint64_t address;
+		uint64_t size;
+	} *kspmem;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	ksp->ks_snaptime = gethrtime();
+
+	kspmem = (struct memunit *)buf;
+
+	vphm = page_vnode_mutex(retired_pages);
+	mutex_enter(vphm);
+	pp = retired_pages->v_pages;
+	if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
+	    (pp == NULL)) {
+		mutex_exit(vphm);
+		return (0);
+	}
+	kspmem->address = ptob(pp->p_pagenum);
+	kspmem->size = PAGESIZE;
+	kspmem++;
+	for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
+	    pp = pp->p_vpnext, kspmem++) {
+		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
+			break;
+		kspmem->address = ptob(pp->p_pagenum);
+		kspmem->size = PAGESIZE;
+	}
+	mutex_exit(vphm);
+
+	return (0);
+}
+
 /*
  * Initialize the page retire mechanism:
  *
@@ -833,13 +824,14 @@
  *   - Build the retired_pages vnode.
  *   - Set up the kstats.
  *   - Fire off the background thread.
- *   - Tell page_tryretire() it's OK to start retiring pages.
+ *   - Tell page_retire() it's OK to start retiring pages.
  */
 void
 page_retire_init(void)
 {
 	const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL};
 	struct vnodeops *vops;
+	kstat_t *ksp;
 
 	const uint_t page_retire_ndata =
 	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);
@@ -869,13 +861,17 @@
 		kstat_install(page_retire_ksp);
 	}
 
-	pr_thread_shortwait = 23 * hz;
-	pr_thread_longwait = 1201 * hz;
-	mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&pr_cv, NULL, CV_DEFAULT, NULL);
-	pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0,
-	    TS_RUN, minclsyspri);
+	mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
+	ksp = kstat_create("unix", 0, "page_retire_list", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
+	if (ksp != NULL) {
+		ksp->ks_update = pr_list_kstat_update;
+		ksp->ks_snapshot = pr_list_kstat_snapshot;
+		ksp->ks_lock = &pr_list_kstat_mutex;
+		kstat_install(ksp);
+	}
 
+	page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
 	pr_enable = 1;
 }
 
@@ -914,122 +910,17 @@
 	pp->p_toxic = 0;
 }
 
-/*
- * Hunt down any pages in the system that have not yet been retired, invoking
- * the provided callback function on each of them.
- */
-void
-page_retire_hunt(void (*callback)(page_t *))
-{
-	page_t *pp;
-	page_t *first;
-	uint64_t tbr, found;
-	int i;
-
-	PR_DEBUG(prd_hunt);
-
-	if (PR_KSTAT_PENDING == 0) {
-		return;
-	}
-
-	PR_DEBUG(prd_dohunt);
-
-	found = 0;
-	mutex_enter(&pr_q_mutex);
-
-	tbr = PR_KSTAT_PENDING;
-
-	for (i = 0; i < PR_PENDING_QMAX; i++) {
-		if ((pp = pr_pending_q[i]) != NULL) {
-			mutex_exit(&pr_q_mutex);
-			callback(pp);
-			mutex_enter(&pr_q_mutex);
-			found++;
-		}
-	}
-
-	if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == tbr) {
-		mutex_exit(&pr_q_mutex);
-		PR_DEBUG(prd_earlyhunt);
-		return;
-	}
-	mutex_exit(&pr_q_mutex);
-
-	PR_DEBUG(prd_latehunt);
-
-	/*
-	 * We've lost track of a page somewhere. Hunt it down.
-	 */
-	memsegs_lock(0);
-	pp = first = page_first();
-	do {
-		if (PP_PR_REQ(pp)) {
-			callback(pp);
-			if (++found == tbr) {
-				break;	/* got 'em all */
-			}
-		}
-	} while ((pp = page_next(pp)) != first);
-	memsegs_unlock(0);
-}
 
 /*
- * The page_retire_thread loops forever, looking to see if there are
- * pages still waiting to be retired.
+ * Callback used by page_trycapture() to finish off retiring a page.
+ * The page has already been cleaned and we've been given sole access to
+ * it.
+ * Always returns 0 to indicate that the callback succeeded, as the callback never
+ * fails to finish retiring the given page.
  */
-static void
-page_retire_thread(void)
-{
-	callb_cpr_t c;
-
-	CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire");
-
-	mutex_enter(&pr_thread_mutex);
-	for (;;) {
-		if (pr_enable && PR_KSTAT_PENDING) {
-			/*
-			 * Sigh. It's SO broken how we have to try to shake
-			 * loose the holder of the page. Since we have no
-			 * idea who or what has it locked, we go bang on
-			 * every door in the city to try to locate it.
-			 */
-			kmem_reap();
-			seg_preap();
-			page_retire_hunt(page_retire_thread_cb);
-			CALLB_CPR_SAFE_BEGIN(&c);
-			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
-			    lbolt + pr_thread_shortwait);
-			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
-		} else {
-			CALLB_CPR_SAFE_BEGIN(&c);
-			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
-			    lbolt + pr_thread_longwait);
-			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
-		}
-	}
-	/*NOTREACHED*/
-}
-
-/*
- * page_retire_pp() decides what to do with a failing page.
- *
- * When we get a free page (e.g. the scrubber or in the free path) life is
- * nice because the page is clean and marked free -- those always retire
- * nicely. From there we go by order of difficulty. If the page has data,
- * we attempt to relocate its contents to a suitable replacement page. If
- * that does not succeed, we look to see if it is clean. If after all of
- * this we have a clean, unmapped page (which we usually do!), we retire it.
- * If the page is not clean, we still process it regardless on a UE; for
- * CEs or FMA requests, we fail leaving the page in service. The page will
- * eventually be tried again later. We always return with the page unlocked
- * since we are called from page_unlock().
- *
- * We don't call panic or do anything fancy down in here. Our boss the DE
- * gets paid handsomely to do his job of figuring out what to do when errors
- * occur. We just do what he tells us to do.
- */
+/*ARGSUSED*/
 static int
-page_retire_pp(page_t *pp)
+page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
 {
 	int		toxic;
 
@@ -1037,102 +928,7 @@
 	ASSERT(pp->p_iolock_state == 0);
 	ASSERT(pp->p_szc == 0);
 
-	PR_DEBUG(prd_top);
-	PR_TYPES(pp);
-
 	toxic = pp->p_toxic;
-	ASSERT(toxic & PR_REASONS);
-
-	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
-	    page_retire_limit()) {
-		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
-		page_retire_dequeue(pp);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_LIMIT));
-	}
-
-	if (PP_ISFREE(pp)) {
-		int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0;
-
-		PR_DEBUG(prd_free);
-
-		if (dbgnoreclaim || !page_reclaim(pp, NULL)) {
-			PR_DEBUG(prd_noreclaim);
-			PR_INCR_KSTAT(pr_failed);
-			/*
-			 * page_reclaim() returns with `pp' unlocked when
-			 * it fails.
-			 */
-			if (dbgnoreclaim)
-				page_unlock(pp);
-			return (page_retire_done(pp, PRD_FAILED));
-		}
-	}
-	ASSERT(!PP_ISFREE(pp));
-
-	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
-	    MTBF(reloc_calls, reloc_mtbf)) {
-		page_t *newpp;
-		spgcnt_t count;
-
-		/*
-		 * If we can relocate the page, great! newpp will go
-		 * on without us, and everything is fine.  Regardless
-		 * of whether the relocation succeeds, we are still
-		 * going to take `pp' around back and shoot it.
-		 */
-		newpp = NULL;
-		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
-			PR_DEBUG(prd_reloc);
-			page_unlock(newpp);
-			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
-		} else {
-			PR_DEBUG(prd_relocfail);
-		}
-	}
-
-	if (hat_ismod(pp)) {
-		PR_DEBUG(prd_mod);
-		PR_INCR_KSTAT(pr_failed);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	if (PP_ISKVP(pp)) {
-		PR_DEBUG(prd_kern);
-		PR_INCR_KSTAT(pr_failed_kernel);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	if (pp->p_lckcnt || pp->p_cowcnt) {
-		PR_DEBUG(prd_locked);
-		PR_INCR_KSTAT(pr_failed);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
-	ASSERT(!hat_page_is_mapped(pp));
-
-	/*
-	 * If the page is modified, and was not relocated; we can't
-	 * retire it without dropping data on the floor. We have to
-	 * recheck after unloading since the dirty bit could have been
-	 * set since we last checked.
-	 */
-	if (hat_ismod(pp)) {
-		PR_DEBUG(prd_mod_late);
-		PR_INCR_KSTAT(pr_failed);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	if (pp->p_vnode) {
-		PR_DEBUG(prd_hashout);
-		page_hashout(pp, NULL);
-	}
-	ASSERT(!pp->p_vnode);
 
 	/*
 	 * The problem page is locked, demoted, unmapped, not free,
@@ -1141,62 +937,45 @@
 	 * Now we select our ammunition, take it around back, and shoot it.
 	 */
 	if (toxic & PR_UE) {
+ue_error:
 		if (page_retire_transient_ue(pp)) {
 			PR_DEBUG(prd_uescrubbed);
-			return (page_retire_done(pp, PRD_UE_SCRUBBED));
+			(void) page_retire_done(pp, PRD_UE_SCRUBBED);
 		} else {
 			PR_DEBUG(prd_uenotscrubbed);
 			page_retire_destroy(pp);
-			return (page_retire_done(pp, PRD_SUCCESS));
+			(void) page_retire_done(pp, PRD_SUCCESS);
 		}
+		return (0);
 	} else if (toxic & PR_FMA) {
 		PR_DEBUG(prd_fma);
 		page_retire_destroy(pp);
-		return (page_retire_done(pp, PRD_SUCCESS));
+		(void) page_retire_done(pp, PRD_SUCCESS);
+		return (0);
 	} else if (toxic & PR_MCE) {
 		PR_DEBUG(prd_mce);
 		page_retire_destroy(pp);
-		return (page_retire_done(pp, PRD_SUCCESS));
-	}
-	panic("page_retire_pp: bad toxic flags %d", toxic);
-	/*NOTREACHED*/
-}
-
-/*
- * Try to retire a page when we stumble onto it in the page lock routines.
- */
-void
-page_tryretire(page_t *pp)
-{
-	ASSERT(PAGE_EXCL(pp));
-
-	if (!pr_enable) {
-		page_unlock(pp);
-		return;
+		(void) page_retire_done(pp, PRD_SUCCESS);
+		return (0);
 	}
 
 	/*
-	 * If the page is a big page, try to break it up.
-	 *
-	 * If there are other bad pages besides `pp', they will be
-	 * recursively retired for us thanks to a bit of magic.
-	 * If the page is a small page with errors, try to retire it.
+	 * When page_retire_first_ue is set to zero and a transient UE occurs,
+	 * it's possible that we clear flags set by a second UE that hits the
+	 * page while the first is still being handled, leaving none of the
+	 * above flags set.  In that case PR_UE_SCRUBBED should be set, so we
+	 * execute the UE code above.
 	 */
-	if (pp->p_szc > 0) {
-		if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
-			page_unlock(pp);
-			PR_DEBUG(prd_nofreedemote);
-			return;
-		} else if (!page_try_demote_pages(pp)) {
-			page_unlock(pp);
-			PR_DEBUG(prd_nodemote);
-			return;
-		}
-		PR_DEBUG(prd_demoted);
-		page_unlock(pp);
-	} else {
-		(void) page_retire_pp(pp);
+	if (toxic & PR_UE_SCRUBBED) {
+		goto ue_error;
 	}
+
+	/*
+	 * It's impossible to get here.
+	 */
+	panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
+	return (0);
 }
 
 /*
@@ -1204,12 +983,10 @@
  *
  * Ideally, page_retire() would instantly retire the requested page.
  * Unfortunately, some pages are locked or otherwise tied up and cannot be
- * retired right away. To deal with that, bits are set in p_toxic of the
- * page_t. An attempt is made to lock the page; if the attempt is successful,
- * we instantly unlock the page counting on page_unlock() to notice p_toxic
- * is nonzero and to call back into page_retire_pp(). Success is determined
- * by looking to see whether the page has been retired once it has been
- * unlocked.
+ * retired right away.  We use the page capture logic to deal with this
+ * situation as it will continuously try to retire the page in the background
+ * if the first attempt fails.  Success is determined by looking to see whether
+ * the page has been retired after the page_trycapture() attempt.
  *
  * Returns:
  *
@@ -1247,22 +1024,20 @@
 		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
 		    " page 0x%08x.%08x", pa);
 	}
-	page_settoxic(pp, reason);
-	page_retire_enqueue(pp);
+
+	/* Avoid setting toxic bits in the first place */
+	if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
+	    page_retire_limit()) {
+		return (page_retire_done(pp, PRD_LIMIT));
+	}
 
-	/*
-	 * And now for some magic.
-	 *
-	 * We marked this page toxic up above.  All there is left to do is
-	 * to try to lock the page and then unlock it.  The page lock routines
-	 * will intercept the page and retire it if they can.  If the page
-	 * cannot be locked, 's okay -- page_unlock() will eventually get it,
-	 * or the background thread, until then the lock routines will deny
-	 * further locks on it.
-	 */
-	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
-		PR_DEBUG(prd_prlocked);
-		page_unlock(pp);
+	if (MTBF(pr_calls, pr_mtbf)) {
+		page_settoxic(pp, reason);
+		if (page_trycapture(pp, 0, CAPTURE_RETIRE, NULL) == 0) {
+			PR_DEBUG(prd_prlocked);
+		} else {
+			PR_DEBUG(prd_prnotlocked);
+		}
 	} else {
 		PR_DEBUG(prd_prnotlocked);
 	}
@@ -1271,7 +1046,7 @@
 		PR_DEBUG(prd_prretired);
 		return (0);
 	} else {
-		cv_signal(&pr_cv);
+		cv_signal(&pc_cv);
 		PR_INCR_KSTAT(pr_failed);
 
 		if (pp->p_toxic & PR_MSG) {
@@ -1291,15 +1066,24 @@
  * Any unretire messages are printed from this routine.
  *
  * Returns 0 if page pp was unretired; else an error code.
+ *
+ * If flags is:
+ *	PR_UNR_FREE - lock the page, clear the toxic flags and free it
+ *	    to the freelist.
+ *	PR_UNR_TEMP - lock the page, unretire it, leave the toxic
+ *	    bits set as is and return it to the caller.
+ *	PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
+ *	    toxic flags and return it to caller as is.
  */
 int
-page_unretire_pp(page_t *pp, int free)
+page_unretire_pp(page_t *pp, int flags)
 {
 	/*
 	 * To be retired, a page has to be hashed onto the retired_pages vnode
 	 * and have PR_RETIRED set in p_toxic.
 	 */
-	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
+	if (flags == PR_UNR_CLEAN ||
+	    page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
 		ASSERT(PAGE_EXCL(pp));
 		PR_DEBUG(prd_ulocked);
 		if (!PP_RETIRED(pp)) {
@@ -1317,9 +1101,13 @@
 		} else {
 			PR_DECR_KSTAT(pr_mce);
 		}
-		page_clrtoxic(pp, PR_ALLFLAGS);
 
-		if (free) {
+		if (flags == PR_UNR_TEMP)
+			page_clrtoxic(pp, PR_RETIRED);
+		else
+			page_clrtoxic(pp, PR_TOXICFLAGS);
+
+		if (flags == PR_UNR_FREE) {
 			PR_DEBUG(prd_udestroy);
 			page_destroy(pp, 0);
 		} else {
@@ -1363,7 +1151,7 @@
 		return (page_retire_done(pp, PRD_INVALID_PA));
 	}
 
-	return (page_unretire_pp(pp, 1));
+	return (page_unretire_pp(pp, PR_UNR_FREE));
 }
 
 /*
@@ -1462,12 +1250,14 @@
 				page_unlock(lpp);
 				continue;
 			}
-			page_settoxic(cpp, PR_FMA | PR_BUSY);
-			page_settoxic(cpp2, PR_FMA);
-			page_tryretire(cpp);	/* will fail */
+
+			/* fails */
+			(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
+
 			page_unlock(lpp);
-			(void) page_retire(cpp->p_pagenum, PR_FMA);
-			(void) page_retire(cpp2->p_pagenum, PR_FMA);
+			page_unlock(cpp);
+			(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
+			(void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
 		}
 	} while ((pp = page_next(pp)) != first);
 	memsegs_unlock(0);
--- a/usr/src/uts/common/vm/vm_anon.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/vm_anon.c	Thu Dec 14 17:27:13 2006 -0800
@@ -2314,7 +2314,16 @@
 	 * which is locked and loaded in the MMU by
 	 * the caller to prevent yet another page fault.
 	 */
-	ppcopy(opp, pp);		/* XXX - should set mod bit in here */
+	/* XXX - should set mod bit in here */
+	if (ppcopy(opp, pp) == 0) {
+		/*
+		 * Before ppcopy could handle UE or other faults, we
+		 * would have panicked here, and still have no option
+		 * but to do so now.
+		 */
+		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
+		    opp, pp);
+	}
 
 	hat_setrefmod(pp);		/* mark as modified */
 
@@ -2557,7 +2566,14 @@
 		/*
 		 * Now copy the contents from the original page.
 		 */
-		ppcopy(ppa[pg_idx], pp);
+		if (ppcopy(ppa[pg_idx], pp) == 0) {
+			/*
+			 * Before ppcopy could handle UE or other faults, we
+			 * would have panicked here, and still have no option
+			 * but to do so now.
+			 */
+			panic("anon_map_privatepages, ppcopy failed");
+		}
 
 		hat_setrefmod(pp);		/* mark as modified */
 
--- a/usr/src/uts/common/vm/vm_page.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/vm_page.c	Thu Dec 14 17:27:13 2006 -0800
@@ -329,6 +329,8 @@
 static void page_init_mem_config(void);
 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
 static void page_do_hashout(page_t *);
+static void page_capture_init();
+int page_capture_take_action(page_t *, uint_t, void *);
 
 static void page_demote_vp_pages(page_t *);
 
@@ -344,6 +346,7 @@
 	page_init_mem_config();
 	page_retire_init();
 	vm_usage_init();
+	page_capture_init();
 }
 
 /*
@@ -4439,7 +4442,7 @@
 
 top:
 	/*
-	 * Flush dirty pages and destory the clean ones.
+	 * Flush dirty pages and destroy the clean ones.
 	 */
 	nbusypages = 0;
 
@@ -4778,6 +4781,7 @@
  * EBUSY	: failure to get locks on the page/pages
  * ENOMEM	: failure to obtain replacement pages
  * EAGAIN	: OBP has not yet completed its boot-time handoff to the kernel
+ * EIO		: An error occurred while trying to copy the page data
  *
  * Return with all constituent members of target and replacement
  * SE_EXCL locked. It is the callers responsibility to drop the
@@ -4791,9 +4795,7 @@
 	spgcnt_t *nrelocp,
 	lgrp_t *lgrp)
 {
-#ifdef DEBUG
 	page_t *first_repl;
-#endif /* DEBUG */
 	page_t *repl;
 	page_t *targ;
 	page_t *pl = NULL;
@@ -4921,9 +4923,7 @@
 #endif
 #endif
 
-#ifdef DEBUG
 	first_repl = repl;
-#endif /* DEBUG */
 
 	for (i = 0; i < npgs; i++) {
 		ASSERT(PAGE_EXCL(targ));
@@ -4942,7 +4942,33 @@
 		 * Copy the page contents and attributes then
 		 * relocate the page in the page hash.
 		 */
-		ppcopy(targ, repl);
+		if (ppcopy(targ, repl) == 0) {
+			targ = *target;
+			repl = first_repl;
+			VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
+			if (grouplock != 0) {
+				group_page_unlock(targ);
+			}
+			if (dofree) {
+				*replacement = NULL;
+				page_free_replacement_page(repl);
+				page_create_putback(dofree);
+			}
+			return (EIO);
+		}
+
+		targ++;
+		if (repl_contig != 0) {
+			repl++;
+		} else {
+			repl = repl->p_next;
+		}
+	}
+
+	repl = first_repl;
+	targ = *target;
+
+	for (i = 0; i < npgs; i++) {
 		ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
 		page_clr_all_props(repl);
 		page_set_props(repl, ppattr);
@@ -6182,3 +6208,1277 @@
 {
 	return (hat_page_getattr(pp, P_MOD));
 }
+
+/*
+ * Reclaim the given constituent page from the freelist, regardless of its
+ * size.  The page will be demoted as required.
+ * Returns 1 on success or 0 on failure.
+ *
+ * The page is unlocked if it can't be reclaimed (when freemem == 0).
+ * If `lock' is non-null, it will be dropped and re-acquired if
+ * the routine must wait while freemem is 0.
+ */
+int
+page_reclaim_page(page_t *pp, kmutex_t *lock)
+{
+	struct pcf	*p;
+	uint_t		pcf_index;
+	struct cpu	*cpup;
+	uint_t		i;
+	pgcnt_t		collected = 0;
+
+	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
+	ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
+
+	/*
+	 * If `freemem' is 0, we cannot reclaim this page from the
+	 * freelist, so release every lock we might hold: the page,
+	 * and the `lock' before blocking.
+	 *
+	 * The only way `freemem' can become 0 while there are pages
+	 * marked free (have their p->p_free bit set) is when the
+	 * system is low on memory and doing a page_create().  Page_create()
+	 * assumes that once it starts acquiring pages it will be able to get
+	 * all that it needs, since `freemem' was already decreased by the
+	 * requested amount.  So, we need to release this page, and let
+	 * page_create() have it.
+	 *
+	 * Since `freemem' being zero is not supposed to happen, just
+	 * use the usual hash stuff as a starting point.  If that bucket
+	 * is empty, then assume the worst, and start at the beginning
+	 * of the pcf array.  If we always start at the beginning
+	 * when acquiring more than one pcf lock, there won't be any
+	 * deadlock problems.
+	 */
+
+	/* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
+
+	if (freemem <= throttlefree && !page_create_throttle(1, 0)) {
+		pcf_acquire_all();
+		goto page_reclaim_nomem;
+	}
+
+	pcf_index = PCF_INDEX();
+	p = &pcf[pcf_index];
+	mutex_enter(&p->pcf_lock);
+	if (p->pcf_count > 0) {
+		collected = 1;
+		p->pcf_count -= 1;
+	}
+	mutex_exit(&p->pcf_lock);
+
+	if (!collected) {
+		VM_STAT_ADD(page_reclaim_zero);
+		/*
+		 * Check again. It's possible that some other thread
+		 * could have been right behind us, and added one
+		 * to a list somewhere.  Acquire each of the pcf locks
+		 * until we find a page.
+		 */
+		p = pcf;
+		for (i = 0; i < PCF_FANOUT; i++) {
+			mutex_enter(&p->pcf_lock);
+			if (p->pcf_count) {
+				if (p->pcf_count > 0) {
+					p->pcf_count -= 1;
+					collected = 1;
+					break;
+				}
+			}
+			p++;
+		}
+
+		if (!collected) {
+page_reclaim_nomem:
+			/*
+			 * We really can't have page `pp'.
+			 * Time for the no-memory dance with
+			 * page_free().  This is just like
+			 * page_create_wait().  Plus the added
+			 * attraction of releasing whatever mutex
+			 * we were passed in `lock' when we were called.
+			 * Page_unlock() will wake up any thread
+			 * waiting around for this page.
+			 */
+			if (lock) {
+				VM_STAT_ADD(page_reclaim_zero_locked);
+				mutex_exit(lock);
+			}
+			page_unlock(pp);
+
+			/*
+			 * get this before we drop all the pcf locks.
+			 */
+			mutex_enter(&new_freemem_lock);
+
+			p = pcf;
+			for (i = 0; i < PCF_FANOUT; i++) {
+				p->pcf_wait++;
+				mutex_exit(&p->pcf_lock);
+				p++;
+			}
+
+			freemem_wait++;
+			cv_wait(&freemem_cv, &new_freemem_lock);
+			freemem_wait--;
+
+			mutex_exit(&new_freemem_lock);
+
+			if (lock) {
+				mutex_enter(lock);
+			}
+			return (0);
+		}
+
+		/*
+		 * We beat the PCF bins over the head until
+		 * we got the memory that we wanted.
+		 * The pcf accounting has been done and none of the
+		 * pcf_wait flags have been set, so drop the locks
+		 * and continue on.
+		 */
+		ASSERT(collected == 1);
+		while (p >= pcf) {
+			mutex_exit(&p->pcf_lock);
+			p--;
+		}
+	}
+
+	/*
+	 * freemem is not protected by any lock. Thus, we cannot
+	 * have any assertion containing freemem here.
+	 */
+	freemem -= 1;
+
+	VM_STAT_ADD(pagecnt.pc_reclaim);
+	if (PP_ISAGED(pp)) {
+		page_list_sub(pp, PG_FREE_LIST);
+		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
+		    "page_reclaim_page_free:pp %p", pp);
+	} else {
+		page_list_sub(pp, PG_CACHE_LIST);
+		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
+		    "page_reclaim_page_cache:pp %p", pp);
+	}
+
+	/*
+	 * The page we took off the freelist must be szc 0 as
+	 * we used page_list_sub which will demote the page if needed.
+	 */
+	ASSERT(pp->p_szc == 0);
+
+	/*
+	 * clear the p_free & p_age bits since this page is no longer
+	 * on the free list.  Notice that there was a brief time where
+	 * a page is marked as free, but is not on the list.
+	 *
+	 * Set the reference bit to protect against immediate pageout.
+	 */
+	PP_CLRFREE(pp);
+	PP_CLRAGED(pp);
+	page_set_props(pp, P_REF);
+
+	CPU_STATS_ENTER_K();
+	cpup = CPU;	/* get cpup now that CPU cannot change */
+	CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
+	CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
+	CPU_STATS_EXIT_K();
+
+	return (1);
+}
+
+/*
+ * The following code all currently relates to the page capture logic:
+ *
+ * This logic is used for cases where there is a desire to claim a certain
+ * physical page in the system for the caller.  As it may not be possible
+ * to capture the page immediately, the p_toxic bits are used in the page
+ * structure to indicate that someone wants to capture this page.  When the
+ * page gets unlocked, the toxic flag will be noted and an attempt to capture
+ * the page will be made.  If it is successful, the original callers callback
+ * will be called with the page to do with it what they please.
+ *
+ * There is also an async thread which wakes up to attempt to capture
+ * pages occasionally which have the capture bit set.  All of the pages which
+ * need to be captured asynchronously have been inserted into the
+ * page_capture_hash and thus this thread walks that hash list.  Items in the
+ * hash have an expiration time so this thread handles that as well by removing
+ * the item from the hash if it has expired.
+ *
+ * Some important things to note are:
+ * - if the PR_CAPTURE bit is set on a page, then the page is in the
+ *   page_capture_hash.  The page_capture_hash_head.pchh_mutex is needed
+ *   to set and clear this bit, and while the lock is held is the only time
+ *   you can add or remove an entry from the hash.
+ * - the PR_CAPTURE bit can only be set and cleared while holding the
+ *   page_capture_hash_head.pchh_mutex
+ * - the t_flag field of the thread struct is used with the T_CAPTURING
+ *   flag to prevent recursion while dealing with large pages.
+ * - pages which need to be retired never expire on the page_capture_hash.
+ */
+
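A minimal sketch of the consumer side of this interface (the callback index PC_MYTEST, the function mytest_capture_done and its body are hypothetical; only page_capture_register_callback() and the 0 / 1 / -1 return convention documented at page_capture_take_action() below come from this change):

	/*
	 * Hypothetical consumer callback: called with the captured page
	 * SE_EXCL locked and already cleaned.  Return 0 on success, 1 if
	 * the page is no longer needed, or -1 on a transient failure; on
	 * any non-zero return the callback must have released the page's
	 * SE_EXCL lock (usually via page_free()).
	 */
	static int
	mytest_capture_done(page_t *pp, void *datap, uint_t flags)
	{
		/* ... hand the page over to its new owner ... */
		return (0);
	}

	/*
	 * Registered once at init time; a duration of -1 means requests
	 * for this consumer never expire off the page_capture_hash.
	 */
	page_capture_register_callback(PC_MYTEST, -1, mytest_capture_done);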
+static void page_capture_thread(void);
+static kthread_t *pc_thread_id;
+kcondvar_t pc_cv;
+static kmutex_t pc_thread_mutex;
+static clock_t pc_thread_shortwait;
+static clock_t pc_thread_longwait;
+
+struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
+
+/* Note that this is a circular linked list */
+typedef struct page_capture_hash_bucket {
+	page_t *pp;
+	uint_t szc;
+	uint_t flags;
+	clock_t expires;	/* lbolt at which this request expires. */
+	void *datap;		/* Cached data passed in for callback */
+	struct page_capture_hash_bucket *next;
+	struct page_capture_hash_bucket *prev;
+} page_capture_hash_bucket_t;
+
+/*
+ * Each hash bucket will have its own mutex and two lists which are:
+ * active (0):	represents requests which have not been processed by
+ *		the page_capture async thread yet.
+ * walked (1):	represents requests which have been processed by the
+ *		page_capture async thread within its given walk of this bucket.
+ *
+ * These are all needed so that we can synchronize all async page_capture
+ * events.  When the async thread moves to a new bucket, it will append the
+ * walked list to the active list and walk each item one at a time, moving it
+ * from the active list to the walked list.  Thus if there is an async request
+ * outstanding for a given page, it will always be in one of the two lists.
+ * New requests will always be added to the active list.
+ * If we are not able to capture a page before the request expires, we free
+ * up the request structure, which indicates to page_capture that there is
+ * no longer a need for the given page, and clear the PR_CAPTURE flag if
+ * possible.
+ */
+typedef struct page_capture_hash_head {
+	kmutex_t pchh_mutex;
+	uint_t num_pages;
+	page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
+} page_capture_hash_head_t;
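For orientation, this is the head-insertion idiom used repeatedly below on these sentinel-based circular doubly linked lists (`bucket' and `bp' are illustrative names, and the bucket's pchh_mutex is assumed to be held):

	/* Insert bp at the head of the bucket's active list (lists[0]). */
	bp->next = bucket->lists[0].next;
	bp->prev = &bucket->lists[0];
	bp->next->prev = bp;
	bucket->lists[0].next = bp;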
+
+#ifdef DEBUG
+#define	NUM_PAGE_CAPTURE_BUCKETS 4
+#else
+#define	NUM_PAGE_CAPTURE_BUCKETS 64
+#endif
+
+page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
+
+/* for now use a very simple hash based upon the size of a page struct */
+#define	PAGE_CAPTURE_HASH(pp)	\
+	((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
+
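For example, with the non-DEBUG value of 64 buckets, a page_t at kernel address 0x30001234a80 (an arbitrary address, shown only to illustrate the arithmetic) would hash to ((0x30001234a80 >> 7) & 63) == 21, so all capture requests for that page land in bucket 21.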
+extern pgcnt_t swapfs_minfree;
+
+int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
+
+/*
+ * a callback function is required for page capture requests.
+ */
+void
+page_capture_register_callback(uint_t index, clock_t duration,
+    int (*cb_func)(page_t *, void *, uint_t))
+{
+	ASSERT(pc_cb[index].cb_active == 0);
+	ASSERT(cb_func != NULL);
+	rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
+	pc_cb[index].duration = duration;
+	pc_cb[index].cb_func = cb_func;
+	pc_cb[index].cb_active = 1;
+	rw_exit(&pc_cb[index].cb_rwlock);
+}
+
+void
+page_capture_unregister_callback(uint_t index)
+{
+	int i, j;
+	struct page_capture_hash_bucket *bp1;
+	struct page_capture_hash_bucket *bp2;
+	struct page_capture_hash_bucket *head = NULL;
+	uint_t flags = (1 << index);
+
+	rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
+	ASSERT(pc_cb[index].cb_active == 1);
+	pc_cb[index].duration = 0;	/* Paranoia */
+	pc_cb[index].cb_func = NULL;	/* Paranoia */
+	pc_cb[index].cb_active = 0;
+	rw_exit(&pc_cb[index].cb_rwlock);
+
+	/*
+	 * Just move all the entries to a private list which we can walk
+	 * through without the need to hold any locks.
+	 * No more requests can get added to the hash lists for this consumer
+	 * as the cb_active field for the callback has been cleared.
+	 */
+	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
+		mutex_enter(&page_capture_hash[i].pchh_mutex);
+		for (j = 0; j < 2; j++) {
+			bp1 = page_capture_hash[i].lists[j].next;
+			/* walk through all but first (sentinel) element */
+			while (bp1 != &page_capture_hash[i].lists[j]) {
+				bp2 = bp1;
+				if (bp2->flags & flags) {
+					bp1 = bp2->next;
+					bp1->prev = bp2->prev;
+					bp2->prev->next = bp1;
+					bp2->next = head;
+					head = bp2;
+					/*
+					 * Clear the PR_CAPTURE bit as we
+					 * hold appropriate locks here.
+					 */
+					page_clrtoxic(head->pp, PR_CAPTURE);
+					page_capture_hash[i].num_pages--;
+					continue;
+				}
+				bp1 = bp1->next;
+			}
+		}
+		mutex_exit(&page_capture_hash[i].pchh_mutex);
+	}
+
+	while (head != NULL) {
+		bp1 = head;
+		head = head->next;
+		kmem_free(bp1, sizeof (*bp1));
+	}
+}
+
+
+/*
+ * Find pp in the active list and move it to the walked list if it
+ * exists.
+ * Note that pp will most often be at the front of the active list, as that
+ * is how the list is currently used; no further optimization is done here
+ * since this is a simple linked list.
+ * Returns 1 on successful move or 0 if page could not be found.
+ */
+static int
+page_capture_move_to_walked(page_t *pp)
+{
+	page_capture_hash_bucket_t *bp;
+	int index;
+
+	index = PAGE_CAPTURE_HASH(pp);
+
+	mutex_enter(&page_capture_hash[index].pchh_mutex);
+	bp = page_capture_hash[index].lists[0].next;
+	while (bp != &page_capture_hash[index].lists[0]) {
+		if (bp->pp == pp) {
+			/* Remove from old list */
+			bp->next->prev = bp->prev;
+			bp->prev->next = bp->next;
+
+			/* Add to new list */
+			bp->next = page_capture_hash[index].lists[1].next;
+			bp->prev = &page_capture_hash[index].lists[1];
+			page_capture_hash[index].lists[1].next = bp;
+			bp->next->prev = bp;
+			mutex_exit(&page_capture_hash[index].pchh_mutex);
+
+			return (1);
+		}
+		bp = bp->next;
+	}
+	mutex_exit(&page_capture_hash[index].pchh_mutex);
+	return (0);
+}
+
+/*
+ * Add a new entry to the page capture hash.  The only case where a new
+ * entry is not added is when the page capture consumer is no longer
+ * registered.
+ * In this case, we'll silently not add the page to the hash.  We know that
+ * page retire will always be registered for the case where we are currently
+ * unretiring a page and thus there are no conflicts.
+ */
+static void
+page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
+{
+	page_capture_hash_bucket_t *bp1;
+	page_capture_hash_bucket_t *bp2;
+	int index;
+	int cb_index;
+	int i;
+#ifdef DEBUG
+	page_capture_hash_bucket_t *tp1;
+	int l;
+#endif
+
+	ASSERT(!(flags & CAPTURE_ASYNC));
+
+	bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
+
+	bp1->pp = pp;
+	bp1->szc = szc;
+	bp1->flags = flags;
+	bp1->datap = datap;
+
+	for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
+		if ((flags >> cb_index) & 1) {
+			break;
+		}
+	}
+
+	ASSERT(cb_index != PC_NUM_CALLBACKS);
+
+	rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
+	if (pc_cb[cb_index].cb_active) {
+		if (pc_cb[cb_index].duration == -1) {
+			bp1->expires = (clock_t)-1;
+		} else {
+			bp1->expires = lbolt + pc_cb[cb_index].duration;
+		}
+	} else {
+		/* There's no callback registered so don't add to the hash */
+		rw_exit(&pc_cb[cb_index].cb_rwlock);
+		kmem_free(bp1, sizeof (*bp1));
+		return;
+	}
+
+	index = PAGE_CAPTURE_HASH(pp);
+
+	/*
+	 * Only allow capture flag to be modified under this mutex.
+	 * Prevents multiple entries for the same page from being added.
+	 */
+	mutex_enter(&page_capture_hash[index].pchh_mutex);
+
+	/*
+	 * if not already on the hash, set capture bit and add to the hash
+	 */
+	if (!(pp->p_toxic & PR_CAPTURE)) {
+#ifdef DEBUG
+		/* Check for duplicate entries */
+		for (l = 0; l < 2; l++) {
+			tp1 = page_capture_hash[index].lists[l].next;
+			while (tp1 != &page_capture_hash[index].lists[l]) {
+				if (tp1->pp == pp) {
+					panic("page pp 0x%p already on hash "
+					    "at 0x%p\n", pp, tp1);
+				}
+				tp1 = tp1->next;
+			}
+		}
+
+#endif
+		page_settoxic(pp, PR_CAPTURE);
+		bp1->next = page_capture_hash[index].lists[0].next;
+		bp1->prev = &page_capture_hash[index].lists[0];
+		bp1->next->prev = bp1;
+		page_capture_hash[index].lists[0].next = bp1;
+		page_capture_hash[index].num_pages++;
+		mutex_exit(&page_capture_hash[index].pchh_mutex);
+		rw_exit(&pc_cb[cb_index].cb_rwlock);
+		cv_signal(&pc_cv);
+		return;
+	}
+
+	/*
+	 * A page retire request will replace any other request.
+	 * A second physmem request which is for a different process than
+	 * the currently registered one will be dropped as there is
+	 * no way to hold the private data for both calls.
+	 * In the future, once there are more callers, this will have to
+	 * be worked out better as there needs to be private storage for
+	 * at least each type of caller (maybe have datap be an array of
+	 * *void's so that we can index based upon callers index).
+	 */
+
+	/* walk hash list to update expire time */
+	for (i = 0; i < 2; i++) {
+		bp2 = page_capture_hash[index].lists[i].next;
+		while (bp2 != &page_capture_hash[index].lists[i]) {
+			if (bp2->pp == pp) {
+				if (flags & CAPTURE_RETIRE) {
+					if (!(bp2->flags & CAPTURE_RETIRE)) {
+						bp2->flags = flags;
+						bp2->expires = bp1->expires;
+						bp2->datap = datap;
+					}
+				} else {
+					ASSERT(flags & CAPTURE_PHYSMEM);
+					if (!(bp2->flags & CAPTURE_RETIRE) &&
+					    (datap == bp2->datap)) {
+						bp2->expires = bp1->expires;
+					}
+				}
+				mutex_exit(&page_capture_hash[index].
+				    pchh_mutex);
+				rw_exit(&pc_cb[cb_index].cb_rwlock);
+				kmem_free(bp1, sizeof (*bp1));
+				return;
+			}
+			bp2 = bp2->next;
+		}
+	}
+
+	/*
+	 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
+	 * and thus it either has to be set or not set and can't change
+	 * while holding the mutex above.
+	 */
+	panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n", pp);
+}
+
+/*
+ * We have a page in our hands, so let's try and make it ours by turning
+ * it into a clean page like it had just come off the freelists.
+ *
+ * Returns 0 on success, with the page still EXCL locked.
+ * On failure, the page will be unlocked and EAGAIN will be returned.
+ */
+static int
+page_capture_clean_page(page_t *pp)
+{
+	page_t *newpp;
+	int skip_unlock = 0;
+	spgcnt_t count;
+	page_t *tpp;
+	int ret = 0;
+	int extra;
+
+	ASSERT(PAGE_EXCL(pp));
+	ASSERT(!PP_RETIRED(pp));
+	ASSERT(curthread->t_flag & T_CAPTURING);
+
+	if (PP_ISFREE(pp)) {
+		if (!page_reclaim_page(pp, NULL)) {
+			skip_unlock = 1;
+			ret = EAGAIN;
+			goto cleanup;
+		}
+		if (pp->p_vnode != NULL) {
+			/*
+			 * Since this page came from the
+			 * cachelist, we must destroy the
+			 * old vnode association.
+			 */
+			page_hashout(pp, NULL);
+		}
+		goto cleanup;
+	}
+
+	/*
+	 * If we know page_relocate will fail, skip it.
+	 * It could still fail due to a UE on another page but we
+	 * can't do anything about that.
+	 */
+	if (pp->p_toxic & PR_UE) {
+		goto skip_relocate;
+	}
+
+	/*
+	 * It's possible for a page to have no vnode, as fsflush comes
+	 * through and cleans up these pages.  It's ugly but that's how it is.
+	 */
+	if (pp->p_vnode == NULL) {
+		goto skip_relocate;
+	}
+
+	/*
+	 * Page was not free, so let's try to relocate it.
+	 * page_relocate only works with root pages, so if this is not a root
+	 * page, we need to demote it to try and relocate it.
+	 * Unfortunately this is the best we can do right now.
+	 */
+	newpp = NULL;
+	if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
+		if (page_try_demote_pages(pp) == 0) {
+			ret = EAGAIN;
+			goto cleanup;
+		}
+	}
+	ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
+	if (ret == 0) {
+		page_t *npp;
+		/* unlock the new page(s) */
+		while (count-- > 0) {
+			ASSERT(newpp != NULL);
+			npp = newpp;
+			page_sub(&newpp, npp);
+			page_unlock(npp);
+		}
+		ASSERT(newpp == NULL);
+		/*
+		 * Check to see if the page we have is too large.
+		 * If so, demote it freeing up the extra pages.
+		 */
+		if (pp->p_szc > 0) {
+			/* For now demote extra pages to szc == 0 */
+			extra = page_get_pagecnt(pp->p_szc) - 1;
+			while (extra > 0) {
+				tpp = pp->p_next;
+				page_sub(&pp, tpp);
+				tpp->p_szc = 0;
+				page_free(tpp, 1);
+				extra--;
+			}
+			/* Make sure to set our page to szc 0 as well */
+			ASSERT(pp->p_next == pp && pp->p_prev == pp);
+			pp->p_szc = 0;
+		}
+		goto cleanup;
+	} else if (ret == EIO) {
+		ret = EAGAIN;
+		goto cleanup;
+	} else {
+		/*
+		 * Need to reset return type as we failed to relocate the page
+		 * but that does not mean that some of the next steps will not
+		 * work.
+		 */
+		ret = 0;
+	}
+
+skip_relocate:
+
+	if (pp->p_szc > 0) {
+		if (page_try_demote_pages(pp) == 0) {
+			ret = EAGAIN;
+			goto cleanup;
+		}
+	}
+
+	ASSERT(pp->p_szc == 0);
+
+	if (hat_ismod(pp)) {
+		ret = EAGAIN;
+		goto cleanup;
+	}
+	if (PP_ISKVP(pp)) {
+		ret = EAGAIN;
+		goto cleanup;
+	}
+	if (pp->p_lckcnt || pp->p_cowcnt) {
+		ret = EAGAIN;
+		goto cleanup;
+	}
+
+	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+	ASSERT(!hat_page_is_mapped(pp));
+
+	if (hat_ismod(pp)) {
+		/*
+		 * This is a semi-odd case as the page is now modified but not
+		 * mapped as we just unloaded the mappings above.
+		 */
+		ret = EAGAIN;
+		goto cleanup;
+	}
+	if (pp->p_vnode != NULL) {
+		page_hashout(pp, NULL);
+	}
+
+	/*
+	 * At this point, the page should be in a clean state and
+	 * we can do whatever we want with it.
+	 */
+
+cleanup:
+	if (ret != 0) {
+		if (!skip_unlock) {
+			page_unlock(pp);
+		}
+	} else {
+		ASSERT(pp->p_szc == 0);
+		ASSERT(PAGE_EXCL(pp));
+
+		pp->p_next = pp;
+		pp->p_prev = pp;
+	}
+	return (ret);
+}
+
+/*
+ * Various callers of page_trycapture() can have different restrictions upon
+ * what memory they have access to.
+ * Returns 0 on success, with the following error codes on failure:
+ *      EPERM - The requested page is long term locked, and thus repeated
+ *              requests to capture this page will likely fail.
+ *      ENOMEM - There was not enough free memory in the system to safely
+ *              map the requested page.
+ *      ENOENT - The requested page was inside the kernel cage, and the
+ *              CAPTURE_GET_CAGE flag was not set.
+ */
+int
+page_capture_pre_checks(page_t *pp, uint_t flags)
+{
+#if defined(__sparc)
+	extern struct vnode prom_ppages;
+#endif /* __sparc */
+
+	ASSERT(pp != NULL);
+
+	/* only physmem currently has restrictions */
+	if (!(flags & CAPTURE_PHYSMEM)) {
+		return (0);
+	}
+
+#if defined(__sparc)
+	if (pp->p_vnode == &prom_ppages) {
+		return (EPERM);
+	}
+
+	if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE)) {
+		return (ENOENT);
+	}
+
+	if (PP_ISNORELOCKERNEL(pp)) {
+		return (EPERM);
+	}
+#else
+	if (PP_ISKVP(pp)) {
+		return (EPERM);
+	}
+#endif /* __sparc */
+
+	if (availrmem < swapfs_minfree) {
+		/*
+		 * We won't try to capture this page as we are
+		 * running low on memory.
+		 */
+		return (ENOMEM);
+	}
+	return (0);
+}
+
+/*
+ * Once we have a page in our mitts, go ahead and complete the capture
+ * operation.
+ * Returns 1 on failure where page is no longer needed
+ * Returns 0 on success
+ * Returns -1 if there was a transient failure.
+ * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
+ */
+int
+page_capture_take_action(page_t *pp, uint_t flags, void *datap)
+{
+	int cb_index;
+	int ret = 0;
+	page_capture_hash_bucket_t *bp1;
+	page_capture_hash_bucket_t *bp2;
+	int index;
+	int found = 0;
+	int i;
+
+	ASSERT(PAGE_EXCL(pp));
+	ASSERT(curthread->t_flag & T_CAPTURING);
+
+	for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
+		if ((flags >> cb_index) & 1) {
+			break;
+		}
+	}
+	ASSERT(cb_index < PC_NUM_CALLBACKS);
+
+	/*
+	 * Remove the entry from the page_capture hash, but don't free it yet
+	 * as we may need to put it back.
+	 * Since we own the page at this point in time, we should find it
+	 * in the hash if this is an ASYNC call.  If we don't, it's likely
+	 * that the page_capture_async() thread decided that this request
+	 * had expired, in which case we just continue on.
+	 */
+	if (flags & CAPTURE_ASYNC) {
+
+		index = PAGE_CAPTURE_HASH(pp);
+
+		mutex_enter(&page_capture_hash[index].pchh_mutex);
+		for (i = 0; i < 2 && !found; i++) {
+			bp1 = page_capture_hash[index].lists[i].next;
+			while (bp1 != &page_capture_hash[index].lists[i]) {
+				if (bp1->pp == pp) {
+					bp1->next->prev = bp1->prev;
+					bp1->prev->next = bp1->next;
+					page_capture_hash[index].num_pages--;
+					page_clrtoxic(pp, PR_CAPTURE);
+					found = 1;
+					break;
+				}
+				bp1 = bp1->next;
+			}
+		}
+		mutex_exit(&page_capture_hash[index].pchh_mutex);
+	}
+
+	/* Synchronize with the unregister func. */
+	rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
+	if (!pc_cb[cb_index].cb_active) {
+		page_free(pp, 1);
+		rw_exit(&pc_cb[cb_index].cb_rwlock);
+		if (found) {
+			kmem_free(bp1, sizeof (*bp1));
+		}
+		return (1);
+	}
+
+	/*
+	 * We need to remove the entry from the page capture hash and turn off
+	 * the PR_CAPTURE bit before calling the callback.  We'll need to cache
+	 * the entry here, and then based upon the return value, clean up
+	 * appropriately or re-add it to the hash, making sure that someone else
+	 * hasn't already done so.
+	 * It should be rare for the callback to fail and thus it's ok for
+	 * the failure path to be a bit complicated as the success path is
+	 * cleaner and the locking rules are easier to follow.
+	 */
+
+	ret = pc_cb[cb_index].cb_func(pp, datap, flags);
+
+	rw_exit(&pc_cb[cb_index].cb_rwlock);
+
+	/*
+	 * If this was an ASYNC request, we need to cleanup the hash if the
+	 * callback was successful or if the request was no longer valid.
+	 * For non-ASYNC requests, we return failure to map and the caller
+	 * will take care of adding the request to the hash.
+	 * Note also that the callback itself is responsible for the page
+	 * at this point in time in terms of locking ...  The most common
+	 * case for the failure path should just be a page_free.
+	 */
+	if (ret >= 0) {
+		if (found) {
+			kmem_free(bp1, sizeof (*bp1));
+		}
+		return (ret);
+	}
+	if (!found) {
+		return (ret);
+	}
+
+	ASSERT(flags & CAPTURE_ASYNC);
+
+	/*
+	 * Check for expiration time first as we can just free it up if it's
+	 * expired.
+	 */
+	if (lbolt > bp1->expires && bp1->expires != -1) {
+		kmem_free(bp1, sizeof (*bp1));
+		return (ret);
+	}
+
+	/*
+	 * The callback failed and there used to be an entry in the hash for
+	 * this page, so we need to add it back to the hash.
+	 */
+	mutex_enter(&page_capture_hash[index].pchh_mutex);
+	if (!(pp->p_toxic & PR_CAPTURE)) {
+		/* just add bp1 back to head of walked list */
+		page_settoxic(pp, PR_CAPTURE);
+		bp1->next = page_capture_hash[index].lists[1].next;
+		bp1->prev = &page_capture_hash[index].lists[1];
+		bp1->next->prev = bp1;
+		page_capture_hash[index].lists[1].next = bp1;
+		page_capture_hash[index].num_pages++;
+		mutex_exit(&page_capture_hash[index].pchh_mutex);
+		return (ret);
+	}
+
+	/*
+	 * Otherwise there was a new capture request added to list
+	 * Need to make sure that our original data is represented if
+	 * appropriate.
+	 */
+	for (i = 0; i < 2; i++) {
+		bp2 = page_capture_hash[index].lists[i].next;
+		while (bp2 != &page_capture_hash[index].lists[i]) {
+			if (bp2->pp == pp) {
+				if (bp1->flags & CAPTURE_RETIRE) {
+					if (!(bp2->flags & CAPTURE_RETIRE)) {
+						bp2->szc = bp1->szc;
+						bp2->flags = bp1->flags;
+						bp2->expires = bp1->expires;
+						bp2->datap = bp1->datap;
+					}
+				} else {
+					ASSERT(bp1->flags & CAPTURE_PHYSMEM);
+					if (!(bp2->flags & CAPTURE_RETIRE)) {
+						bp2->szc = bp1->szc;
+						bp2->flags = bp1->flags;
+						bp2->expires = bp1->expires;
+						bp2->datap = bp1->datap;
+					}
+				}
+				mutex_exit(&page_capture_hash[index].
+				    pchh_mutex);
+				kmem_free(bp1, sizeof (*bp1));
+				return (ret);
+			}
+			bp2 = bp2->next;
+		}
+	}
+	panic("PR_CAPTURE set but not on hash for pp 0x%p\n", pp);
+	/*NOTREACHED*/
+}
+
+/*
+ * Try to capture the given page for the caller specified in the flags
+ * parameter.  The page will either be captured and handed over to the
+ * appropriate callback, or will be queued up in the page capture hash
+ * to be captured asynchronously.
+ * If the current request is due to an async capture, the page must be
+ * exclusively locked before calling this function.
+ * Currently szc must be 0 but in the future this should be expandable to
+ * other page sizes.
+ * Returns 0 on success, with the following error codes on failure:
+ *      EPERM - The requested page is long term locked, and thus repeated
+ *              requests to capture this page will likely fail.
+ *      ENOMEM - There was not enough free memory in the system to safely
+ *              map the requested page.
+ *      ENOENT - The requested page was inside the kernel cage, and the
+ *              CAPTURE_GET_CAGE flag was not set.
+ *	EAGAIN - The requested page could not be captured at this point in
+ *		time but future requests will likely work.
+ *	EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
+ *		was not set.
+ */
+int
+page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
+{
+	int ret;
+	int cb_index;
+
+	if (flags & CAPTURE_ASYNC) {
+		ASSERT(PAGE_EXCL(pp));
+		goto async;
+	}
+
+	/* Make sure there's enough availrmem ... */
+	ret = page_capture_pre_checks(pp, flags);
+	if (ret != 0) {
+		return (ret);
+	}
+
+	if (!page_trylock(pp, SE_EXCL)) {
+		for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
+			if ((flags >> cb_index) & 1) {
+				break;
+			}
+		}
+		ASSERT(cb_index < PC_NUM_CALLBACKS);
+		ret = EAGAIN;
+		/* Special case for retired pages */
+		if (PP_RETIRED(pp)) {
+			if (flags & CAPTURE_GET_RETIRED) {
+				if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
+					/*
+					 * Need to set capture bit and add to
+					 * hash so that the page will be
+					 * retired when freed.
+					 */
+					page_capture_add_hash(pp, szc,
+					    CAPTURE_RETIRE, NULL);
+					ret = 0;
+					goto own_page;
+				}
+			} else {
+				return (EBUSY);
+			}
+		}
+		page_capture_add_hash(pp, szc, flags, datap);
+		return (ret);
+	}
+
+async:
+	ASSERT(PAGE_EXCL(pp));
+
+	/* Need to check for physmem async requests that availrmem is sane */
+	if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
+	    (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
+	    (availrmem < swapfs_minfree)) {
+		page_unlock(pp);
+		return (ENOMEM);
+	}
+
+	ret = page_capture_clean_page(pp);
+
+	if (ret != 0) {
+		/* We failed to get the page, so let's add it to the hash */
+		if (!(flags & CAPTURE_ASYNC)) {
+			page_capture_add_hash(pp, szc, flags, datap);
+		}
+		return (ret);
+	}
+
+own_page:
+	ASSERT(PAGE_EXCL(pp));
+	ASSERT(pp->p_szc == 0);
+
+	/* Call the callback */
+	ret = page_capture_take_action(pp, flags, datap);
+
+	if (ret == 0) {
+		return (0);
+	}
+
+	/*
+	 * Note that in the failure cases from page_capture_take_action, the
+	 * EXCL lock will have already been dropped.
+	 */
+	if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
+		page_capture_add_hash(pp, szc, flags, datap);
+	}
+	return (EAGAIN);
+}
+
+int
+page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
+{
+	int ret;
+
+	curthread->t_flag |= T_CAPTURING;
+	ret = page_itrycapture(pp, szc, flags, datap);
+	curthread->t_flag &= ~T_CAPTURING; /* xor would also work as it's set */
+	return (ret);
+}
+
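A rough caller-side sketch of how a synchronous consumer (for example the physmem driver added in this changeset) might invoke the capture path and handle the error codes documented for page_itrycapture() above; the function name physmem_grab_page and the arguments pfn and my_datap are placeholders, not lifted from the driver itself:

	static int
	physmem_grab_page(pfn_t pfn, void *my_datap)
	{
		page_t *pp = page_numtopp_nolock(pfn);
		int err;

		if (pp == NULL)
			return (EINVAL);

		err = page_trycapture(pp, 0, CAPTURE_PHYSMEM, my_datap);
		switch (err) {
		case 0:
			/* The callback ran; the page now belongs to us. */
			break;
		case EAGAIN:
			/*
			 * The request was queued on the page_capture_hash;
			 * the async thread keeps retrying until it expires.
			 */
			break;
		case EBUSY:
			/* Retired page and CAPTURE_GET_RETIRED not passed. */
			break;
		default:
			/* EPERM, ENOMEM or ENOENT: give up on this page. */
			break;
		}
		return (err);
	}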
+/*
+ * When unlocking a page which has the PR_CAPTURE bit set, this routine
+ * gets called to try and capture the page.
+ */
+void
+page_unlock_capture(page_t *pp)
+{
+	page_capture_hash_bucket_t *bp;
+	int index;
+	int i;
+	uint_t szc;
+	uint_t flags = 0;
+	void *datap;
+	kmutex_t *mp;
+	extern vnode_t retired_pages;
+
+	/*
+	 * We need to protect against a possible deadlock here where we own
+	 * the vnode page hash mutex and want to acquire it again as there
+	 * are locations in the code where we unlock a page while holding
+	 * the mutex, which can lead to the page being captured and eventually
+	 * ending up here.  As we may be hashing out the old page and hashing
+	 * into the retire vnode, we need to make sure we don't own them.
+	 * Other callbacks that do hash operations also need to make sure that
+	 * before they hash into a vnode they do not currently own that
+	 * vnode's vphm mutex; otherwise there will be a panic.
+	 */
+	if (mutex_owned(page_vnode_mutex(&retired_pages))) {
+		page_unlock(pp);
+		return;
+	}
+	if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
+		page_unlock(pp);
+		return;
+	}
+
+	index = PAGE_CAPTURE_HASH(pp);
+
+	mp = &page_capture_hash[index].pchh_mutex;
+	mutex_enter(mp);
+	for (i = 0; i < 2; i++) {
+		bp = page_capture_hash[index].lists[i].next;
+		while (bp != &page_capture_hash[index].lists[i]) {
+			if (bp->pp == pp) {
+				szc = bp->szc;
+				flags = bp->flags | CAPTURE_ASYNC;
+				datap = bp->datap;
+				mutex_exit(mp);
+				(void) page_trycapture(pp, szc, flags, datap);
+				return;
+			}
+			bp = bp->next;
+		}
+	}
+
+	/* Failed to find page in hash so clear flags and unlock it. */
+	page_clrtoxic(pp, PR_CAPTURE);
+	page_unlock(pp);
+
+	mutex_exit(mp);
+}
+
+void
+page_capture_init()
+{
+	int i;
+	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
+		page_capture_hash[i].lists[0].next =
+		    &page_capture_hash[i].lists[0];
+		page_capture_hash[i].lists[0].prev =
+		    &page_capture_hash[i].lists[0];
+		page_capture_hash[i].lists[1].next =
+		    &page_capture_hash[i].lists[1];
+		page_capture_hash[i].lists[1].prev =
+		    &page_capture_hash[i].lists[1];
+	}
+
+	pc_thread_shortwait = 23 * hz;
+	pc_thread_longwait = 1201 * hz;
+	mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
+	pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
+	    TS_RUN, minclsyspri);
+}
+
+/*
+ * It is necessary to scrub any failing pages prior to reboot in order to
+ * prevent a latent error trap from occurring on the next boot.
+ */
+void
+page_retire_mdboot()
+{
+	page_t *pp;
+	int i, j;
+	page_capture_hash_bucket_t *bp;
+
+	/* walk lists looking for pages to scrub */
+	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
+		if (page_capture_hash[i].num_pages == 0)
+			continue;
+
+		mutex_enter(&page_capture_hash[i].pchh_mutex);
+
+		for (j = 0; j < 2; j++) {
+			bp = page_capture_hash[i].lists[j].next;
+			while (bp != &page_capture_hash[i].lists[j]) {
+				pp = bp->pp;
+				if (!PP_ISKVP(pp) && PP_TOXIC(pp)) {
+					pp->p_selock = -1;  /* pacify ASSERTs */
+					PP_CLRFREE(pp);
+					pagescrub(pp, 0, PAGESIZE);
+					pp->p_selock = 0;
+				}
+				bp = bp->next;
+			}
+		}
+		mutex_exit(&page_capture_hash[i].pchh_mutex);
+	}
+}
+
+/*
+ * Walk the page_capture_hash trying to capture pages and also cleanup old
+ * entries which have expired.
+ */
+void
+page_capture_async()
+{
+	page_t *pp;
+	int i;
+	int ret;
+	page_capture_hash_bucket_t *bp1, *bp2;
+	uint_t szc;
+	uint_t flags;
+	void *datap;
+
+	/* If there are outstanding pages to be captured, get to work */
+	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
+		if (page_capture_hash[i].num_pages == 0)
+			continue;
+		/* Append list 1 to list 0 and then walk through list 0 */
+		mutex_enter(&page_capture_hash[i].pchh_mutex);
+		bp1 = &page_capture_hash[i].lists[1];
+		bp2 = bp1->next;
+		if (bp1 != bp2) {
+			bp1->prev->next = page_capture_hash[i].lists[0].next;
+			bp2->prev = &page_capture_hash[i].lists[0];
+			page_capture_hash[i].lists[0].next->prev = bp1->prev;
+			page_capture_hash[i].lists[0].next = bp2;
+			bp1->next = bp1;
+			bp1->prev = bp1;
+		}
+
+		/* list[1] will be empty now */
+
+		bp1 = page_capture_hash[i].lists[0].next;
+		while (bp1 != &page_capture_hash[i].lists[0]) {
+			/* Check expiration time */
+			if ((lbolt > bp1->expires && bp1->expires != -1) ||
+			    page_deleted(bp1->pp)) {
+				page_capture_hash[i].lists[0].next = bp1->next;
+				bp1->next->prev =
+				    &page_capture_hash[i].lists[0];
+				page_capture_hash[i].num_pages--;
+
+				/*
+				 * We can safely remove the PR_CAPTURE bit
+				 * without holding the EXCL lock on the page
+				 * as the PR_CAPTURE bit requires that the
+				 * page_capture_hash[].pchh_mutex be held
+				 * to modify it.
+				 */
+				page_clrtoxic(bp1->pp, PR_CAPTURE);
+				mutex_exit(&page_capture_hash[i].pchh_mutex);
+				kmem_free(bp1, sizeof (*bp1));
+				mutex_enter(&page_capture_hash[i].pchh_mutex);
+				bp1 = page_capture_hash[i].lists[0].next;
+				continue;
+			}
+			pp = bp1->pp;
+			szc = bp1->szc;
+			flags = bp1->flags;
+			datap = bp1->datap;
+			mutex_exit(&page_capture_hash[i].pchh_mutex);
+			if (page_trylock(pp, SE_EXCL)) {
+				ret = page_trycapture(pp, szc,
+				    flags | CAPTURE_ASYNC, datap);
+			} else {
+				ret = 1;	/* move to walked hash */
+			}
+
+			if (ret != 0) {
+				/* Move to walked hash */
+				(void) page_capture_move_to_walked(pp);
+			}
+			mutex_enter(&page_capture_hash[i].pchh_mutex);
+			bp1 = page_capture_hash[i].lists[0].next;
+		}
+
+		mutex_exit(&page_capture_hash[i].pchh_mutex);
+	}
+}
+
+/*
+ * The page_capture_thread loops forever, looking to see if there are
+ * pages still waiting to be captured.
+ */
+static void
+page_capture_thread(void)
+{
+	callb_cpr_t c;
+	int outstanding;
+	int i;
+
+	CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
+
+	mutex_enter(&pc_thread_mutex);
+	for (;;) {
+		outstanding = 0;
+		for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++)
+			outstanding += page_capture_hash[i].num_pages;
+		if (outstanding) {
+			/*
+			 * Do we really want to be this aggressive for things
+			 * other than page_retire?
+			 * Maybe have a counter for each callback type to
+			 * guide how aggressive we should be here.
+			 * Thus if there's at least one page for page_retire
+			 * we go ahead and reap like this.
+			 */
+			kmem_reap();
+			seg_preap();
+			page_capture_async();
+			CALLB_CPR_SAFE_BEGIN(&c);
+			(void) cv_timedwait(&pc_cv, &pc_thread_mutex,
+			    lbolt + pc_thread_shortwait);
+			CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
+		} else {
+			CALLB_CPR_SAFE_BEGIN(&c);
+			(void) cv_timedwait(&pc_cv, &pc_thread_mutex,
+			    lbolt + pc_thread_longwait);
+			CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
+		}
+	}
+	/*NOTREACHED*/
+}
--- a/usr/src/uts/common/vm/vm_pagelist.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/vm_pagelist.c	Thu Dec 14 17:27:13 2006 -0800
@@ -1487,7 +1487,7 @@
 			kcage_freemem_add(pgcnt);
 #endif
 		for (i = 0; i < pgcnt; i++, pp++)
-			page_unlock_noretire(pp);
+			page_unlock_nocapture(pp);
 	}
 }
 
@@ -1935,7 +1935,7 @@
 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
 			phm = PAGE_HASH_MUTEX(index);
 			if (!mutex_tryenter(phm)) {
-				page_unlock_noretire(pp);
+				page_unlock_nocapture(pp);
 				goto fail_promote;
 			}
 
@@ -1943,7 +1943,7 @@
 			page_hashout(pp, phm);
 			mutex_exit(phm);
 			PP_SETAGED(pp);
-			page_unlock_noretire(pp);
+			page_unlock_nocapture(pp);
 			which_list = PG_CACHE_LIST;
 		}
 		page_ctr_sub(mnode, mtype, pp, which_list);
@@ -2496,7 +2496,6 @@
 	return (ret_pp);
 }
 
-
 /*
  * Helper routine used only by the freelist code to lock
  * a page. If the page is a large page then it succeeds in
@@ -2529,11 +2528,13 @@
 	while (tpp != pp) {
 		if (!page_trylock(tpp, se)) {
 			/*
-			 * On failure unlock what we
-			 * have locked so far.
+			 * On failure unlock what we have locked so far.
+			 * We want to avoid attempting to capture these
+			 * pages as the pcm mutex may be held which could
+			 * lead to a recursive mutex panic.
 			 */
 			while (first_pp != tpp) {
-				page_unlock_noretire(first_pp);
+				page_unlock_nocapture(first_pp);
 				first_pp = first_pp->p_next;
 			}
 			return (0);
@@ -2976,7 +2977,7 @@
 			while (--i != (pgcnt_t)-1) {
 				pp = &spp[i];
 				ASSERT(PAGE_EXCL(pp));
-				page_unlock_noretire(pp);
+				page_unlock_nocapture(pp);
 			}
 			return (0);
 		}
@@ -2985,7 +2986,7 @@
 		    !PP_ISFREE(pp)) {
 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
 			ASSERT(i == 0);
-			page_unlock_noretire(pp);
+			page_unlock_nocapture(pp);
 			return (0);
 		}
 		if (PP_ISNORELOC(pp)) {
@@ -2993,7 +2994,7 @@
 			while (i != (pgcnt_t)-1) {
 				pp = &spp[i];
 				ASSERT(PAGE_EXCL(pp));
-				page_unlock_noretire(pp);
+				page_unlock_nocapture(pp);
 				i--;
 			}
 			return (0);
@@ -3088,7 +3089,7 @@
 			 */
 			while (pgcnt--) {
 				ASSERT(PAGE_EXCL(pp));
-				page_unlock_noretire(pp);
+				page_unlock_nocapture(pp);
 				pp++;
 			}
 			/*
@@ -3103,7 +3104,7 @@
 				ASSERT(PP_ISAGED(pp));
 				pp->p_szc = 0;
 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
-				page_unlock_noretire(pp);
+				page_unlock_nocapture(pp);
 			}
 
 			if (replpp != NULL)
@@ -3135,7 +3136,7 @@
 			page_sub(&replpp, rpp);
 			ASSERT(PAGE_EXCL(rpp));
 			ASSERT(!PP_ISFREE(rpp));
-			page_unlock_noretire(rpp);
+			page_unlock_nocapture(rpp);
 		}
 		ASSERT(targpp == hpp);
 		ASSERT(replpp == NULL);
@@ -3149,7 +3150,6 @@
  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
  * of 0 means nothing left after trim.
  */
-
 int
 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
 {
--- a/usr/src/uts/i86pc/os/machdep.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/i86pc/os/machdep.c	Thu Dec 14 17:27:13 2006 -0800
@@ -200,7 +200,10 @@
 	if (invoke_cb)
 		(void) callb_execute_class(CB_CL_MDBOOT, NULL);
 
-	page_retire_hunt(page_retire_mdboot_cb);
+	/*
+	 * Clear any unresolved UEs from memory.
+	 */
+	page_retire_mdboot();
 
 	/*
 	 * stop other cpus and raise our priority.  since there is only
--- a/usr/src/uts/i86pc/vm/vm_dep.h	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/i86pc/vm/vm_dep.h	Thu Dec 14 17:27:13 2006 -0800
@@ -686,6 +686,7 @@
 	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
 	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
 	ulong_t ppr_relocok[MMU_PAGE_SIZES];
+	ulong_t ppr_copyfail;
 	/* page coalesce counter */
 	ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 	/* candidates useful */
--- a/usr/src/uts/i86pc/vm/vm_machdep.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c	Thu Dec 14 17:27:13 2006 -0800
@@ -1992,7 +1992,7 @@
  * Note that the ref/mod bits in the page_t's are not affected by
  * this operation, hence it is up to the caller to update them appropriately.
  */
-void
+int
 ppcopy(page_t *frompp, page_t *topp)
 {
 	caddr_t		pp_addr1;
@@ -2000,6 +2000,8 @@
 	void		*pte1;
 	void		*pte2;
 	kmutex_t	*ppaddr_mutex;
+	label_t		ljb;
+	int		ret = 1;
 
 	ASSERT_STACK_ALIGNED();
 	ASSERT(PAGE_LOCKED(frompp));
@@ -2030,14 +2032,21 @@
 		    HAT_LOAD_NOCONSIST);
 	}
 
+	if (on_fault(&ljb)) {
+		ret = 0;
+		goto faulted;
+	}
 	if (use_sse_pagecopy)
 		hwblkpagecopy(pp_addr1, pp_addr2);
 	else
 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
 
+	no_fault();
+faulted:
 	if (!kpm_enable)
 		mutex_exit(ppaddr_mutex);
 	kpreempt_enable();
+	return (ret);
 }
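The change above relies on the kernel's on_fault()/no_fault() protection to turn a trap taken during the copy (for example a UE consumed while reading the source page) into an error return instead of a panic. A minimal sketch of that pattern in isolation (the function and variable names here are illustrative only):

	static int
	copy_with_fault_protection(caddr_t src, caddr_t dst, size_t len)
	{
		label_t ljb;

		if (on_fault(&ljb)) {
			/* A fault during the copy lands us here. */
			return (0);
		}
		bcopy(src, dst, len);
		no_fault();
		return (1);
	}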
 
 /*
--- a/usr/src/uts/intel/Makefile.intel.shared	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/intel/Makefile.intel.shared	Thu Dec 14 17:27:13 2006 -0800
@@ -258,6 +258,7 @@
 DRV_KMODS	+= mouse8042
 DRV_KMODS	+= nca
 DRV_KMODS	+= openeepr
+DRV_KMODS	+= physmem
 DRV_KMODS	+= pm
 DRV_KMODS	+= poll
 DRV_KMODS	+= pool
--- a/usr/src/uts/intel/os/minor_perm	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/intel/os/minor_perm	Thu Dec 14 17:27:13 2006 -0800
@@ -139,3 +139,4 @@
 pcn:* 0666 root sys
 rtls:* 0666 root sys
 ath:* 0666 root sys
+physmem:* 0600 root sys
--- a/usr/src/uts/intel/os/name_to_major	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/intel/os/name_to_major	Thu Dec 14 17:27:13 2006 -0800
@@ -123,3 +123,4 @@
 lx_ptm 240
 lx_systrace 241
 lx_audio 242
+physmem 243
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/intel/physmem/Makefile	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,84 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/intel/physmem/Makefile
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the physmem driver
+#
+#	intel implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= physmem
+OBJECTS		= $(PHYSMEM_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(PHYSMEM_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
+CONF_SRCDIR	= $(UTSBASE)/common/io
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
--- a/usr/src/uts/sparc/Makefile.sparc.shared	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sparc/Makefile.sparc.shared	Thu Dec 14 17:27:13 2006 -0800
@@ -227,7 +227,7 @@
 DRV_KMODS	+= fssnap icmp icmp6 ip ip6 ipsecah
 DRV_KMODS	+= ipsecesp iwscn keysock kmdb kstat ksyms llc1
 DRV_KMODS	+= lofi
-DRV_KMODS	+= log logindmux kssl mm nca pm poll pool
+DRV_KMODS	+= log logindmux kssl mm nca physmem pm poll pool
 DRV_KMODS	+= pseudo ptc ptm pts ptsl ramdisk random rsm rts sad
 DRV_KMODS	+= sppp sppptun sy sysevent sysmsg 
 DRV_KMODS	+= spdsock
--- a/usr/src/uts/sparc/os/minor_perm	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sparc/os/minor_perm	Thu Dec 14 17:27:13 2006 -0800
@@ -166,3 +166,4 @@
 chxge:* 0666 root sys
 vsw:* 0666 root sys
 vnet:* 0666 root sys
+physmem:* 0600 root sys
--- a/usr/src/uts/sparc/os/name_to_major	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sparc/os/name_to_major	Thu Dec 14 17:27:13 2006 -0800
@@ -215,3 +215,4 @@
 pxb_bcm 267
 pxb_plx 268
 n2rng 269
+physmem 270
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sparc/physmem/Makefile	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,88 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/sparc/physmem/Makefile
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the physmem driver
+#
+#	sparc implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= physmem
+OBJECTS		= $(PHYSMEM_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(PHYSMEM_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
+CONF_SRCDIR	= $(UTSBASE)/common/io
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
--- a/usr/src/uts/sun4/vm/vm_dep.h	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sun4/vm/vm_dep.h	Thu Dec 14 17:27:13 2006 -0800
@@ -630,6 +630,7 @@
 	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
 	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
 	ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
+	ulong_t ppr_copyfail;
 	/* page coalesce counter */
 	ulong_t	page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 	/* candidates useful */
--- a/usr/src/uts/sun4u/os/mach_cpu_states.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sun4u/os/mach_cpu_states.c	Thu Dec 14 17:27:13 2006 -0800
@@ -104,8 +104,7 @@
 	/*
 	 * Clear any unresolved UEs from memory.
 	 */
-	if (memsegs != NULL)
-		page_retire_hunt(page_retire_mdboot_cb);
+	page_retire_mdboot();
 
 	/*
 	 * stop other cpus which also raise our priority. since there is only
--- a/usr/src/uts/sun4u/os/ppage.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sun4u/os/ppage.c	Thu Dec 14 17:27:13 2006 -0800
@@ -366,6 +366,8 @@
 	caddr_t fm_va, to_va;
 	caddr_t	*fm_slot, *to_slot;
 	processorid_t cpu;
+	label_t ljb;
+	int ret = 1;
 
 	ASSERT(PAGE_LOCKED(fm_pp));
 	ASSERT(PAGE_LOCKED(to_pp));
@@ -391,12 +393,18 @@
 		kpreempt_enable();
 		return (0);
 	}
+	if (on_fault(&ljb)) {
+		ret = 0;
+		goto faulted;
+	}
 	hwblkpagecopy(fm_va, to_va);
+	no_fault();
+faulted:
 	ASSERT(CPU->cpu_id == cpu);
 	pp_unload_tlb(fm_slot, fm_va);
 	pp_unload_tlb(to_slot, to_va);
 	kpreempt_enable();
-	return (1);
+	return (ret);
 }
 
 /*
@@ -425,22 +433,33 @@
  *
  * Try to use per cpu mapping first, if that fails then call pp_mapin
  * to load it.
+ *
+ * Returns one on success, or zero if a fault occurred while doing the copy.
  */
-void
+int
 ppcopy(page_t *fm_pp, page_t *to_pp)
 {
 	caddr_t fm_va, to_va;
+	label_t ljb;
+	int ret = 1;
 
 	/* Try the fast path first */
 	if (ppcopy_common(fm_pp, to_pp))
-		return;
+		return (1);
 
 	/* Fast path failed, so we need to do the slow path. */
 	fm_va = ppmapin(fm_pp, PROT_READ, (caddr_t)-1);
 	to_va = ppmapin(to_pp, PROT_READ | PROT_WRITE, fm_va);
+	if (on_fault(&ljb)) {
+		ret = 0;
+		goto faulted;
+	}
 	bcopy(fm_va, to_va, PAGESIZE);
+	no_fault();
+faulted:
 	ppmapout(fm_va);
 	ppmapout(to_va);
+	return (ret);
 }
 
 /*
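Both the per-CPU fast path (ppcopy_common) and the ppmapin slow path now bracket the actual data movement with on_fault()/no_fault(). The essentials of the idiom, reduced to a sketch that reuses the names from the hunk above: on_fault() returns 0 when it merely registers the jump buffer and returns nonzero when a later trap unwinds back to it, so a UE hit during the copy turns into a zero return value while the unmap and cleanup code still runs.

	label_t	ljb;
	int	ret = 1;

	if (on_fault(&ljb)) {
		ret = 0;		/* a trap during the copy landed us here */
		goto faulted;
	}
	bcopy(fm_va, to_va, PAGESIZE);	/* the guarded copy */
	no_fault();			/* deregister the handler */
faulted:
	/* unmap both pages, then return ret (1 = copied, 0 = faulted) */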
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sun4v/os/mach_cpu_states.c	Thu Dec 14 17:27:13 2006 -0800
@@ -134,8 +134,7 @@
 	/*
 	 * Clear any unresolved UEs from memory.
 	 */
-	if (memsegs != NULL)
-		page_retire_hunt(page_retire_mdboot_cb);
+	page_retire_mdboot();
 
 	/*
 	 * stop other cpus which also raise our priority. since there is only
--- a/usr/src/uts/sun4v/os/ppage.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/sun4v/os/ppage.c	Thu Dec 14 17:27:13 2006 -0800
@@ -253,13 +253,16 @@
  *
  * Try to use per cpu mapping first, if that fails then call pp_mapin
  * to load it.
+ * Returns one on success, or zero if a fault occurred while doing the copy.
  */
-void
+int
 ppcopy(page_t *fm_pp, page_t *to_pp)
 {
 	caddr_t fm_va;
 	caddr_t to_va;
 	boolean_t fast;
+	label_t ljb;
+	int ret = 1;
 
 	ASSERT(PAGE_LOCKED(fm_pp));
 	ASSERT(PAGE_LOCKED(to_pp));
@@ -278,7 +281,13 @@
 	} else
 		fast = B_TRUE;
 
+	if (on_fault(&ljb)) {
+		ret = 0;
+		goto faulted;
+	}
 	bcopy(fm_va, to_va, PAGESIZE);
+	no_fault();
+faulted:
 
 	/* Unmap */
 	if (fast) {
@@ -288,6 +297,7 @@
 		ppmapout(fm_va);
 		ppmapout(to_va);
 	}
+	return (ret);
 }
 
 /*