changeset 3253:c929f34b62c5
PSARC 2006/360 Page retire and caged memory kstats
PSARC 2006/401 /dev/physmem
6385792 physical memory driver needed for memory testing
--- a/usr/src/cmd/devfsadm/misc_link.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/cmd/devfsadm/misc_link.c	Thu Dec 14 17:27:13 2006 -0800
@@ -99,7 +99,7 @@
 	    "(^eeprom$)|(^ptsl$)|(^mm$)|(^wc$)|(^dump$)|(^cn$)|(^lo$)|(^ptm$)|"
 	    "(^ptc$)|(^openeepr$)|(^poll$)|(^sysmsg$)|(^random$)|(^trapstat$)|"
 	    "(^cryptoadm$)|(^crypto$)|(^pool$)|(^poolctl$)|(^bl$)|(^kmdb$)|"
-	    "(^sysevent$)|(^kssl$)",
+	    "(^sysevent$)|(^kssl$)|(^physmem$)",
 	    TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name
 	},
 	{ "pseudo", "ddi_pseudo",
--- a/usr/src/pkgdefs/SUNWckr/prototype_com	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_com	Thu Dec 14 17:27:13 2006 -0800
@@ -93,6 +93,7 @@
 f none kernel/drv/mm.conf 644 root sys
 f none kernel/drv/openeepr.conf 644 root sys
 f none kernel/drv/options.conf 644 root sys
+f none kernel/drv/physmem.conf 644 root sys
 f none kernel/drv/poll.conf 644 root sys
 f none kernel/drv/pseudo.conf 644 root sys
 f none kernel/drv/ptc.conf 644 root sys
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386	Thu Dec 14 17:27:13 2006 -0800
@@ -97,6 +97,7 @@
 f none kernel/drv/options 755 root sys
 f none kernel/drv/pci_to_i2o 755 root sys
 f none kernel/drv/pci_to_i2o.conf 644 root sys
+f none kernel/drv/physmem 755 root sys
 f none kernel/drv/poll 755 root sys
 f none kernel/drv/pseudo 755 root sys
 f none kernel/drv/ptc 755 root sys
@@ -274,6 +275,7 @@
 f none kernel/drv/amd64/mouse8042 755 root sys
 f none kernel/drv/amd64/openeepr 755 root sys
 f none kernel/drv/amd64/options 755 root sys
+f none kernel/drv/amd64/physmem 755 root sys
 f none kernel/drv/amd64/poll 755 root sys
 f none kernel/drv/amd64/pseudo 755 root sys
 f none kernel/drv/amd64/ptc 755 root sys
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc	Thu Dec 14 17:27:13 2006 -0800
@@ -102,6 +102,7 @@
 f none kernel/drv/sparcv9/openeepr 755 root sys
 f none kernel/drv/sparcv9/options 755 root sys
 f none kernel/drv/sparcv9/pci_pci 755 root sys
+f none kernel/drv/sparcv9/physmem 755 root sys
 f none kernel/drv/sparcv9/poll 755 root sys
 f none kernel/drv/sparcv9/pseudo 755 root sys
 f none kernel/drv/sparcv9/ptc 755 root sys
--- a/usr/src/pkgdefs/SUNWhea/prototype_com	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com	Thu Dec 14 17:27:13 2006 -0800
@@ -933,6 +933,7 @@
 f none usr/include/sys/pcmcia.h 644 root bin
 f none usr/include/sys/pctypes.h 644 root bin
 f none usr/include/sys/pem.h 644 root bin
+f none usr/include/sys/physmem.h 644 root bin
 f none usr/include/sys/serializer.h 644 root bin
 f none usr/include/sys/pfmod.h 644 root bin
 f none usr/include/sys/pm.h 0644 root bin
--- a/usr/src/pkgdefs/common_files/i.minorperm_i386	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/common_files/i.minorperm_i386	Thu Dec 14 17:27:13 2006 -0800
@@ -268,6 +268,7 @@
 systrace:systrace
 lx_ptm:lx_ptmajor
 lx_systrace:*
+physmem:*
 EOF
 }
--- a/usr/src/pkgdefs/common_files/i.minorperm_sparc	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/pkgdefs/common_files/i.minorperm_sparc	Thu Dec 14 17:27:13 2006 -0800
@@ -312,6 +312,7 @@
 profile:profile
 sdt:sdt
 systrace:systrace
+physmem:*
 EOF
 }
--- a/usr/src/uts/common/Makefile.files	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/Makefile.files	Thu Dec 14 17:27:13 2006 -0800
@@ -571,6 +571,8 @@
 MM_OBJS += mem.o
 
+PHYSMEM_OBJS += physmem.o
+
 OPTIONS_OBJS += options.o
 
 WINLOCK_OBJS += winlockio.o
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/physmem.c	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/devops.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/cred.h>
+#include <sys/policy.h>
+#include <sys/errno.h>
+#include <vm/seg_dev.h>
+#include <vm/seg_vn.h>
+#include <vm/page.h>
+#include <sys/fs/swapnode.h>
+#include <sys/sysmacros.h>
+#include <sys/fcntl.h>
+#include <sys/vmsystm.h>
+#include <sys/physmem.h>
+
+static dev_info_t *physmem_dip = NULL;
+
+/*
+ * Linked list element hanging off physmem_proc_hash below, which holds all
+ * the information for a given segment which has been setup for this process.
+ * This is a simple linked list as we are assuming that for a given process
+ * the setup ioctl will only be called a handful of times.  If this assumption
+ * changes in the future, a quicker to traverse data structure should be used.
+ */
+struct physmem_hash {
+	struct physmem_hash *ph_next;
+	uint64_t ph_base_pa;
+	caddr_t ph_base_va;
+	size_t ph_seg_len;
+	struct vnode *ph_vnode;
+};
+
+/*
+ * Hash of all of the processes which have setup mappings with the driver with
+ * pointers to per process data.
+ */
+struct physmem_proc_hash {
+	struct proc *pph_proc;
+	struct physmem_hash *pph_hash;
+	struct physmem_proc_hash *pph_next;
+};
+
+
+/* Needs to be a power of two for simple hash algorithm */
+#define	PPH_SIZE	8
+struct physmem_proc_hash *pph[PPH_SIZE];
+
+/*
+ * Lock which protects the pph hash above.  To add an element (either a new
+ * process or a new segment) the WRITE lock must be held.  To traverse the
+ * list, only a READ lock is needed.
+ */
+krwlock_t pph_rwlock;
+
+#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))
+
+/*
+ * Need to keep a reference count of how many processes have the driver
+ * open to prevent it from disappearing.
+ */
+uint64_t physmem_vnodecnt;
+kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */
+
+static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
+    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+    enum seg_rw rw, struct cred *cr);
+
+static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
+    struct cred *cred);
+
+static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
+    struct cred *cred);
+
+static void physmem_inactive(vnode_t *vp, cred_t *crp);
+
+const fs_operation_def_t physmem_vnodeops_template[] = {
+	VOPNAME_GETPAGE, physmem_getpage,
+	VOPNAME_ADDMAP, (fs_generic_func_p) physmem_addmap,
+	VOPNAME_DELMAP, physmem_delmap,
+	VOPNAME_INACTIVE, (fs_generic_func_p) physmem_inactive,
+	NULL, NULL
+};
+
+vnodeops_t *physmem_vnodeops = NULL;
+
+/*
+ * Removes the current process from the hash if the process has no more
+ * physmem segments active.
+ */
+void
+physmem_remove_hash_proc()
+{
+	int index;
+	struct physmem_proc_hash **walker;
+	struct physmem_proc_hash *victim = NULL;
+
+	index = PHYSMEM_HASH(curproc);
+	rw_enter(&pph_rwlock, RW_WRITER);
+	walker = &pph[index];
+	while (*walker != NULL) {
+		if ((*walker)->pph_proc == curproc &&
+		    (*walker)->pph_hash == NULL) {
+			victim = *walker;
+			*walker = victim->pph_next;
+			break;
+		}
+		walker = &((*walker)->pph_next);
+	}
+	rw_exit(&pph_rwlock);
+	if (victim != NULL)
+		kmem_free(victim, sizeof (struct physmem_proc_hash));
+}
+
+/*
+ * Add a new entry to the hash for the given process to cache the
+ * address ranges that it is working on.  If this is the first hash
+ * item to be added for this process, we will create the head pointer
+ * for this process.
+ * Returns 0 on success, ERANGE when the physical address is already in the
+ * hash.  Note that we add it to the hash as we have already called as_map
+ * and thus the as_unmap call will try to free the vnode, which needs
+ * to be found in the hash.
+ */
+int
+physmem_add_hash(struct physmem_hash *php)
+{
+	int index;
+	struct physmem_proc_hash *iterator;
+	struct physmem_proc_hash *newp = NULL;
+	struct physmem_hash *temp;
+	int ret = 0;
+
+	index = PHYSMEM_HASH(curproc);
+
+insert:
+	rw_enter(&pph_rwlock, RW_WRITER);
+	iterator = pph[index];
+	while (iterator != NULL) {
+		if (iterator->pph_proc == curproc) {
+			/*
+			 * Check to make sure a single process does not try to
+			 * map the same region twice.
+			 */
+			for (temp = iterator->pph_hash; temp != NULL;
+			    temp = temp->ph_next) {
+				if ((php->ph_base_pa >= temp->ph_base_pa &&
+				    php->ph_base_pa < temp->ph_base_pa +
+				    temp->ph_seg_len) ||
+				    (temp->ph_base_pa >= php->ph_base_pa &&
+				    temp->ph_base_pa < php->ph_base_pa +
+				    php->ph_seg_len)) {
+					ret = ERANGE;
+					break;
+				}
+			}
+			if (ret == 0) {
+				php->ph_next = iterator->pph_hash;
+				iterator->pph_hash = php;
+			}
+			rw_exit(&pph_rwlock);
+			/* Need to check for two threads in sync */
+			if (newp != NULL)
+				kmem_free(newp, sizeof (*newp));
+			return (ret);
+		}
+		iterator = iterator->pph_next;
+	}
+
+	if (newp != NULL) {
+		newp->pph_proc = curproc;
+		newp->pph_next = pph[index];
+		newp->pph_hash = php;
+		php->ph_next = NULL;
+		pph[index] = newp;
+		rw_exit(&pph_rwlock);
+		return (0);
+	}
+
+	rw_exit(&pph_rwlock);
+	/* Dropped the lock so we could use KM_SLEEP */
+	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
+	goto insert;
+}
+
+/*
+ * Will return the pointer to the physmem_hash struct if the setup routine
+ * has previously been called for this memory.
+ * Returns NULL on failure.
+ */
+struct physmem_hash *
+physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
+{
+	int index;
+	struct physmem_proc_hash *proc_hp;
+	struct physmem_hash *php;
+
+	ASSERT(rw_lock_held(&pph_rwlock));
+
+	index = PHYSMEM_HASH(procp);
+	proc_hp = pph[index];
+	while (proc_hp != NULL) {
+		if (proc_hp->pph_proc == procp) {
+			php = proc_hp->pph_hash;
+			while (php != NULL) {
+				if ((req_paddr >= php->ph_base_pa) &&
+				    (req_paddr + len <=
+				    php->ph_base_pa + php->ph_seg_len)) {
+					return (php);
+				}
+				php = php->ph_next;
+			}
+		}
+		proc_hp = proc_hp->pph_next;
+	}
+	return (NULL);
+}
+
+int
+physmem_validate_cookie(uint64_t p_cookie)
+{
+	int index;
+	struct physmem_proc_hash *proc_hp;
+	struct physmem_hash *php;
+
+	ASSERT(rw_lock_held(&pph_rwlock));
+
+	index = PHYSMEM_HASH(curproc);
+	proc_hp = pph[index];
+	while (proc_hp != NULL) {
+		if (proc_hp->pph_proc == curproc) {
+			php = proc_hp->pph_hash;
+			while (php != NULL) {
+				if ((uint64_t)(uintptr_t)php == p_cookie) {
+					return (1);
+				}
+				php = php->ph_next;
+			}
+		}
+		proc_hp = proc_hp->pph_next;
+	}
+	return (0);
+}
+
+/*
+ * Remove the given vnode from the pph hash.  If it exists in the hash the
+ * process still has to be around as the vnode is obviously still around and
+ * since it's a physmem vnode, it must be in the hash.
+ * If it is not in the hash that must mean that the setup ioctl failed.
+ * Return 0 in this instance, 1 if it is in the hash.
+ */
+int
+physmem_remove_vnode_hash(vnode_t *vp)
+{
+	int index;
+	struct physmem_proc_hash *proc_hp;
+	struct physmem_hash **phpp;
+	struct physmem_hash *victim;
+
+	index = PHYSMEM_HASH(curproc);
+	/* synchronize with the map routine */
+	rw_enter(&pph_rwlock, RW_WRITER);
+	proc_hp = pph[index];
+	while (proc_hp != NULL) {
+		if (proc_hp->pph_proc == curproc) {
+			phpp = &proc_hp->pph_hash;
+			while (*phpp != NULL) {
+				if ((*phpp)->ph_vnode == vp) {
+					victim = *phpp;
+					*phpp = victim->ph_next;
+
+					rw_exit(&pph_rwlock);
+					kmem_free(victim, sizeof (*victim));
+					return (1);
+				}
+				phpp = &(*phpp)->ph_next;
+			}
+		}
+		proc_hp = proc_hp->pph_next;
+	}
+	rw_exit(&pph_rwlock);
+
+	/* not found */
+	return (0);
+}
+
+int
+physmem_setup_vnops()
+{
+	int error;
+	char *name = "physmem";
+	if (physmem_vnodeops != NULL)
+		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
+	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
+	if (error != 0) {
+		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
+	}
+	return (error);
+}
+
+/*
+ * The guts of the PHYSMEM_SETUP ioctl.
+ * Create a segment in the address space with the specified parameters.
+ * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
+ * We do not do bounds checking on the requested physical addresses; if they
+ * do not exist in the system, they will not be mappable.
+ * Returns 0 on success with the following error codes on failure:
+ *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
+ *		non-NULL or the system was unable to find enough VA space for
+ *		the desired length if user_va was NULL.
+ *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
+ */
+int
+physmem_setup_addrs(struct physmem_setup_param *pspp)
+{
+	struct as *as = curproc->p_as;
+	struct segvn_crargs vn_a;
+	int ret = 0;
+	uint64_t base_pa;
+	size_t len;
+	caddr_t uvaddr;
+	struct vnode *vp;
+	struct physmem_hash *php;
+
+	ASSERT(pspp != NULL);
+	base_pa = pspp->req_paddr;
+	len = pspp->len;
+	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;
+
+	/* Sanity checking */
+	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
+		return (EINVAL);
+	if (!IS_P2ALIGNED(len, PAGESIZE))
+		return (EINVAL);
+	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
+		return (EINVAL);
+
+	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);
+
+	/* Need to bump vnode count so that the driver can not be unloaded */
+	mutex_enter(&physmem_mutex);
+	physmem_vnodecnt++;
+	mutex_exit(&physmem_mutex);
+
+	vp = vn_alloc(KM_SLEEP);
+	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
+	vn_setops(vp, physmem_vnodeops);
+
+	php->ph_vnode = vp;
+
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)base_pa;
+	vn_a.type = MAP_SHARED;
+	vn_a.prot = PROT_ALL;
+	vn_a.maxprot = PROT_ALL;
+	vn_a.flags = 0;
+	vn_a.cred = NULL;
+	vn_a.amp = NULL;
+	vn_a.szc = 0;
+	vn_a.lgrp_mem_policy_flags = 0;
+
+	as_rangelock(as);
+	if (uvaddr != NULL) {
+		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
+			ret = ENOMEM;
+fail:
+			as_rangeunlock(as);
+			vn_free(vp);
+			kmem_free(php, sizeof (*php));
+			mutex_enter(&physmem_mutex);
+			physmem_vnodecnt--;
+			mutex_exit(&physmem_mutex);
+			return (ret);
+		}
+	} else {
+		/* We pick the address for the user */
+		map_addr(&uvaddr, len, 0, 1, 0);
+		if (uvaddr == NULL) {
+			ret = ENOMEM;
+			goto fail;
+		}
+	}
+	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	if (ret == 0) {
+		php->ph_base_pa = base_pa;
+		php->ph_base_va = uvaddr;
+		php->ph_seg_len = len;
+		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
+		pspp->cookie = (uint64_t)(uintptr_t)php;
+		ret = physmem_add_hash(php);
+		if (ret == 0)
+			return (0);
+		(void) as_unmap(as, uvaddr, len);
+		return (ret);
+	}
+
+	goto fail;
+	/*NOTREACHED*/
+}
+
+/*
+ * The guts of the PHYSMEM_MAP ioctl.
+ * Map the given PA to the appropriate VA if the PHYSMEM_SETUP ioctl has
+ * already been called for this PA range.
+ * Returns 0 on success with the following error codes on failure:
+ *	EPERM - The requested page is long term locked, and thus repeated
+ *		requests to allocate this page will likely fail.
+ *	EAGAIN - The requested page could not be allocated, but it is believed
+ *		that future attempts could succeed.
+ *	ENOMEM - There was not enough free memory in the system to safely
+ *		map the requested page.
+ *	EINVAL - The requested paddr was not PAGESIZE aligned or the
+ *		PHYSMEM_SETUP ioctl was not called for this page.
+ *	ENOENT - The requested page was inside the kernel cage, and the
+ *		PHYSMEM_CAGE flag was not set.
+ *	EBUSY - The requested page is retired and the PHYSMEM_RETIRED flag
+ *		was not set.
+ */
+static int
+physmem_map_addrs(struct physmem_map_param *pmpp)
+{
+	caddr_t uvaddr;
+	page_t *pp;
+	uint64_t req_paddr;
+	struct vnode *vp;
+	int ret = 0;
+	struct physmem_hash *php;
+	uint_t flags = 0;
+
+	ASSERT(pmpp != NULL);
+	req_paddr = pmpp->req_paddr;
+
+	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
+		return (EINVAL);
+	/* Find the vnode for this map request */
+	rw_enter(&pph_rwlock, RW_READER);
+	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
+	if (php == NULL) {
+		rw_exit(&pph_rwlock);
+		return (EINVAL);
+	}
+	vp = php->ph_vnode;
+	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
+	rw_exit(&pph_rwlock);
+
+	pp = page_numtopp_nolock(btop((size_t)req_paddr));
+	if (pp == NULL) {
+		pmpp->ret_va = NULL;
+		return (EPERM);
+	}
+
+	/*
+	 * Check to see if page already mapped correctly.  This can happen
+	 * when we failed to capture a page previously and it was captured
+	 * asynchronously for us.  Return success in this case.
+	 */
+	if (pp->p_vnode == vp) {
+		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
+		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
+		return (0);
+	}
+
+	/*
+	 * physmem should be responsible for checking for cage
+	 * and prom pages.
+	 */
+	if (pmpp->flags & PHYSMEM_CAGE)
+		flags = CAPTURE_GET_CAGE;
+	if (pmpp->flags & PHYSMEM_RETIRED)
+		flags |= CAPTURE_GET_RETIRED;
+
+	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);
+
+	if (ret != 0) {
+		pmpp->ret_va = NULL;
+		return (ret);
+	} else {
+		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
+		return (0);
+	}
+}
+
+/*
+ * Map the given page into the process's address space if possible.
+ * We actually only hash the page in on the correct vnode as the page
+ * will be mapped via segvn_pagefault.
+ * returns 0 on success
+ * returns 1 if there is no need to map this page anymore (process exited)
+ * returns -1 if we failed to map the page.
+ */
+int
+map_page_proc(page_t *pp, void *arg, uint_t flags)
+{
+	struct vnode *vp;
+	proc_t *procp = (proc_t *)arg;
+	int ret;
+	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
+	struct physmem_hash *php;
+
+	ASSERT(pp != NULL);
+
+	/*
+	 * Check against availrmem to make sure that we're not low on memory.
+	 * We check again here as ASYNC requests do not do this check
+	 * elsewhere.  We return 1 as we don't want the page to have the
+	 * PR_CAPTURE bit set or be on the page capture hash.
+	 */
+	if (swapfs_minfree > availrmem + 1) {
+		page_free(pp, 1);
+		return (1);
+	}
+
+	/*
+	 * If this is an asynchronous request for the current process,
+	 * we can not map the page as it's possible that we are also in the
+	 * process of unmapping the page which could result in a deadlock
+	 * with the as lock.
+	 */
+	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
+		page_free(pp, 1);
+		return (-1);
+	}
+
+	/* only return zeroed out pages */
+	pagezero(pp, 0, PAGESIZE);
+
+	rw_enter(&pph_rwlock, RW_READER);
+	php = physmem_get_hash(paddr, PAGESIZE, procp);
+	if (php == NULL) {
+		rw_exit(&pph_rwlock);
+		/*
+		 * Free the page as there is no longer a valid outstanding
+		 * request for this page.
+		 */
+		page_free(pp, 1);
+		return (1);
+	}
+
+	vp = php->ph_vnode;
+
+	/*
+	 * We need to protect against a possible deadlock here where we own
+	 * the vnode page hash mutex and want to acquire it again as there
+	 * are locations in the code, where we unlock a page while holding
+	 * the mutex which can lead to the page being captured and eventually
+	 * end up here.
+	 */
+	if (mutex_owned(page_vnode_mutex(vp))) {
+		rw_exit(&pph_rwlock);
+		page_free(pp, 1);
+		return (-1);
+	}
+
+	ret = page_hashin(pp, vp, paddr, NULL);
+	rw_exit(&pph_rwlock);
+	if (ret == 0) {
+		page_free(pp, 1);
+		return (-1);
+	}
+
+	page_downgrade(pp);
+
+	mutex_enter(&freemem_lock);
+	availrmem--;
+	mutex_exit(&freemem_lock);
+
+	return (0);
+}
+
+/*
+ * The guts of the PHYSMEM_DESTROY ioctl.
+ * The cookie passed in will provide all of the information needed to
+ * free up the address space and physical memory associated with the
+ * corresponding PHYSMEM_SETUP ioctl.
+ * Returns 0 on success with the following error codes on failure:
+ *	EINVAL - The cookie supplied is not valid.
+ */
+int
+physmem_destroy_addrs(uint64_t p_cookie)
+{
+	struct as *as = curproc->p_as;
+	size_t len;
+	caddr_t uvaddr;
+
+	rw_enter(&pph_rwlock, RW_READER);
+	if (physmem_validate_cookie(p_cookie) == 0) {
+		rw_exit(&pph_rwlock);
+		return (EINVAL);
+	}
+
+	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
+	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
+	rw_exit(&pph_rwlock);
+
+	(void) as_unmap(as, uvaddr, len);
+
+	return (0);
+}
+
+/*
+ * If the page has been hashed into the physmem vnode, then just look it up
+ * and return it via pl, otherwise return ENOMEM as the map ioctl has not
+ * succeeded on the given page.
+ */
+/*ARGSUSED*/
+static int
+physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
+    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+    struct cred *cr)
+{
+	page_t *pp;
+
+	ASSERT(len == PAGESIZE);
+	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
+
+	/*
+	 * If the page is in the hash, then we successfully claimed this
+	 * page earlier, so return it to the caller.
+	 */
+	pp = page_lookup(vp, off, SE_SHARED);
+	if (pp != NULL) {
+		pl[0] = pp;
+		pl[1] = NULL;
+		*protp = PROT_ALL;
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * We can not allow a process mapping /dev/physmem pages to fork as there can
+ * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
+ * return of EINVAL when we are not working on our own address space.
+ * Otherwise we return zero as this function is required for normal operation.
+ */
+/*ARGSUSED*/
+static int
+physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
+    struct cred *cred)
+{
+	if (curproc->p_as != as) {
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/* Will always get called for removing a whole segment. */
+/*ARGSUSED*/
+static int
+physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
+    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
+    struct cred *cred)
+{
+	/*
+	 * Release our hold on the vnode so that the final VN_RELE will
+	 * call physmem_inactive to clean things up.
+	 */
+	VN_RELE(vp);
+
+	return (0);
+}
+
+/*
+ * Clean up all the pages belonging to this vnode and then free it.
+ */
+/*ARGSUSED*/
+static void
+physmem_inactive(vnode_t *vp, cred_t *crp)
+{
+	page_t *pp;
+
+	/*
+	 * Remove the vnode from the hash now, to prevent asynchronous
+	 * attempts to map into this vnode.  This avoids a deadlock
+	 * where two threads try to get into this logic at the same
+	 * time and try to map the pages they are destroying into the
+	 * other's address space.
+	 * If it's not in the hash, just free it.
+	 */
+	if (physmem_remove_vnode_hash(vp) == 0) {
+		ASSERT(vp->v_pages == NULL);
+		vn_free(vp);
+		physmem_remove_hash_proc();
+		mutex_enter(&physmem_mutex);
+		physmem_vnodecnt--;
+		mutex_exit(&physmem_mutex);
+		return;
+	}
+
+	/*
+	 * At this point in time, no other logic can be adding or removing
+	 * pages from the vnode, otherwise the v_pages list could be
+	 * inaccurate.
+	 */
+
+	while ((pp = vp->v_pages) != NULL) {
+		page_t *rpp;
+		if (page_tryupgrade(pp)) {
+			/*
+			 * set lckcnt for page_destroy to do availrmem
+			 * accounting
+			 */
+			pp->p_lckcnt = 1;
+			page_destroy(pp, 0);
+		} else {
+			/* failure to lock should be transient */
+			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
+			if (rpp != pp) {
+				page_unlock(rpp);
+				continue;
+			}
+			page_unlock(pp);
+		}
+	}
+	vn_free(vp);
+	physmem_remove_hash_proc();
+	mutex_enter(&physmem_mutex);
+	physmem_vnodecnt--;
+	mutex_exit(&physmem_mutex);
+}
+
+/*ARGSUSED*/
+static int
+physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	int ret;
+
+	switch (cmd) {
+	case PHYSMEM_SETUP:
+		{
+			struct physmem_setup_param psp;
+			if (ddi_copyin((void *)arg, &psp,
+			    sizeof (struct physmem_setup_param), 0))
+				return (EFAULT);
+			ret = physmem_setup_addrs(&psp);
+			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
+				return (EFAULT);
+		}
+		break;
+	case PHYSMEM_MAP:
+		{
+			struct physmem_map_param pmp;
+			if (ddi_copyin((void *)arg, &pmp,
+			    sizeof (struct physmem_map_param), 0))
+				return (EFAULT);
+			ret = physmem_map_addrs(&pmp);
+			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
+				return (EFAULT);
+		}
+		break;
+	case PHYSMEM_DESTROY:
+		{
+			uint64_t cookie;
+			if (ddi_copyin((void *)arg, &cookie,
+			    sizeof (uint64_t), 0))
+				return (EFAULT);
+			ret = physmem_destroy_addrs(cookie);
+		}
+		break;
+	default:
+		return (ENOTSUP);
+	}
+	return (ret);
+}
+
+/*ARGSUSED*/
+static int
+physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	int ret;
+	static int msg_printed = 0;
+
+	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
+		return (EINVAL);
+	}
+
+	/* need to make sure we have the right privileges */
+	if ((ret = secpolicy_resource(credp)) != 0)
+		return (ret);
+	if ((ret = secpolicy_lock_memory(credp)) != 0)
+		return (ret);
+
+	if (msg_printed == 0) {
+		cmn_err(CE_NOTE, "!driver has been opened.  This driver may "
+		    "take out long term locks on pages which may impact "
+		    "dynamic reconfiguration events");
+		msg_printed = 1;
+	}
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
+    void *arg, void **resultp)
+{
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*resultp = physmem_dip;
+		return (DDI_SUCCESS);
+
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
+		return (DDI_SUCCESS);
+
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+static int
+physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int i;
+
+	if (cmd == DDI_RESUME) {
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
+	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	physmem_dip = dip;
+
+	/* Initialize driver specific data */
+	if (physmem_setup_vnops()) {
+		ddi_remove_minor_node(dip, ddi_get_name(dip));
+		return (DDI_FAILURE);
+	}
+
+	for (i = 0; i < PPH_SIZE; i++)
+		pph[i] = NULL;
+
+	page_capture_register_callback(PC_PHYSMEM, 10000,
+	    map_page_proc);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int ret = DDI_SUCCESS;
+
+	if (cmd == DDI_SUSPEND) {
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ASSERT(physmem_dip == dip);
+
+	mutex_enter(&physmem_mutex);
+	if (physmem_vnodecnt == 0) {
+		if (physmem_vnodeops != NULL) {
+			vn_freevnodeops(physmem_vnodeops);
+			physmem_vnodeops = NULL;
+			page_capture_unregister_callback(PC_PHYSMEM);
+		}
+	} else {
+		ret = EBUSY;
+	}
+	mutex_exit(&physmem_mutex);
+	if (ret == DDI_SUCCESS)
+		ddi_remove_minor_node(dip, ddi_get_name(dip));
+	return (ret);
+}
+
+static struct cb_ops physmem_cb_ops = {
+	physmem_open,	/* open */
+	physmem_close,	/* close */
+	nodev,		/* strategy */
+	nodev,		/* print */
+	nodev,		/* dump */
+	nodev,		/* read */
+	nodev,		/* write */
+	physmem_ioctl,	/* ioctl */
+	nodev,		/* devmap */
+	nodev,		/* mmap */
+	nodev,		/* segmap */
+	nochpoll,	/* chpoll */
+	ddi_prop_op,	/* prop_op */
+	NULL,		/* cb_str */
+	D_NEW | D_MP | D_DEVMAP,
+	CB_REV,
+	NULL,
+	NULL
+};
+
+static struct dev_ops physmem_ops = {
+	DEVO_REV,
+	0,
+	physmem_getinfo,
+	nulldev,
+	nulldev,
+	physmem_attach,
+	physmem_detach,
+	nodev,
+	&physmem_cb_ops,
+	NULL,
+	NULL
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"physmem driver %I%",
+	&physmem_ops
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,
+	NULL
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/physmem.conf	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+name="physmem" parent="pseudo" instance=0;
--- a/usr/src/uts/common/os/mem_cage.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/os/mem_cage.c	Thu Dec 14 17:27:13 2006 -0800
@@ -262,6 +262,11 @@
 #define	KCAGEPAGETS_INC()
 #endif
 
+/* kstats to export what pages are currently caged */
+kmutex_t kcage_kstat_lock;
+static int kcage_kstat_update(kstat_t *ksp, int rw);
+static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
+
 /*
  * Startup and Dynamic Reconfiguration interfaces.
  * kcage_range_lock()
@@ -873,6 +878,8 @@
 	pgcnt_t wanted;
 	pfn_t pfn;
 	page_t *pp;
+	kstat_t *ksp;
+
 	extern struct vnode kvp;
 	extern void page_list_noreloc_startup(page_t *);
 
@@ -981,6 +988,83 @@
 			page_freelist_coalesce_all(mnode);
 		}
 	}
+
+	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
+	if (ksp != NULL) {
+		ksp->ks_update = kcage_kstat_update;
+		ksp->ks_snapshot = kcage_kstat_snapshot;
+		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
+		kstat_install(ksp);
+	}
+
+}
+
+static int
+kcage_kstat_update(kstat_t *ksp, int rw)
+{
+	struct kcage_glist *lp;
+	uint_t count;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	count = 0;
+	kcage_range_lock();
+	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
+		if (lp->decr) {
+			if (lp->curr != lp->lim) {
+				count++;
+			}
+		} else {
+			if (lp->curr != lp->base) {
+				count++;
+			}
+		}
+	}
+	kcage_range_unlock();
+
+	ksp->ks_ndata = count;
+	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
+
+	return (0);
+}
+
+static int
+kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
+{
+	struct kcage_glist *lp;
+	struct memunit {
+		uint64_t address;
+		uint64_t size;
+	} *kspmem;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	ksp->ks_snaptime = gethrtime();
+
+	kspmem = (struct memunit *)buf;
+	kcage_range_lock();
+	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
+		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
+			break;
+
+		if (lp->decr) {
+			if (lp->curr != lp->lim) {
+				kspmem->address = ptob(lp->curr);
+				kspmem->size = ptob(lp->lim - lp->curr);
+			}
+		} else {
+			if (lp->curr != lp->base) {
+				kspmem->address = ptob(lp->base);
+				kspmem->size = ptob(lp->curr - lp->base);
+			}
+		}
+	}
+	kcage_range_unlock();
+
+	return (0);
 }
 
 void
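The snapshot routine above exports each caged range as a raw pair of 64-bit values (base physical address, size in bytes). A minimal userland sketch of how such a raw kstat could be read with libkstat follows; it is not part of this changeset, and the memunit struct simply mirrors the layout used by kcage_kstat_snapshot(). Compile with -lkstat.

#include <sys/types.h>
#include <stdio.h>
#include <kstat.h>

/* must match the layout written by kcage_kstat_snapshot() */
struct memunit {
	uint64_t address;
	uint64_t size;
};

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	struct memunit *mu;
	uint_t i;

	if ((kc = kstat_open()) == NULL)
		return (1);
	ksp = kstat_lookup(kc, "kcage", 0, "kcage_page_list");
	if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}
	/* ks_ndata spans of caged memory were snapshotted into ks_data */
	mu = (struct memunit *)ksp->ks_data;
	for (i = 0; i < ksp->ks_ndata; i++, mu++)
		(void) printf("caged: 0x%llx, %llu bytes\n",
		    (u_longlong_t)mu->address, (u_longlong_t)mu->size);
	(void) kstat_close(kc);
	return (0);
}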
--- a/usr/src/uts/common/os/mem_config.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/os/mem_config.c	Thu Dec 14 17:27:13 2006 -0800
@@ -2250,7 +2250,8 @@
 				 */
 				mhp->mh_hold_todo++;
 			} else {
-				(void) page_unretire_pp(pp, 0);
+				(void) page_unretire_pp(pp,
+				    PR_UNR_CLEAN);
 			}
 		}
 	}
--- a/usr/src/uts/common/sys/Makefile	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/sys/Makefile	Thu Dec 14 17:27:13 2006 -0800
@@ -370,6 +370,7 @@
 	pctypes.h \
 	pem.h \
 	pfmod.h \
+	physmem.h \
 	pm.h \
 	policy.h \
 	poll.h \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/physmem.h	Thu Dec 14 17:27:13 2006 -0800
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#ifndef	_PHYSMEM_H
+#define	_PHYSMEM_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* ioctl values */
+#define	PHYSMEM_SETUP	1
+#define	PHYSMEM_MAP	2
+#define	PHYSMEM_DESTROY	3
+
+/* flags values */
+#define	PHYSMEM_CAGE	(1 << 0)
+#define	PHYSMEM_RETIRED	(1 << 1)
+
+struct physmem_setup_param {
+	uint64_t req_paddr;	/* requested physical address */
+	uint64_t len;		/* length of memory to be allocated */
+	uint64_t user_va;	/* VA to associate with req_paddr */
+	uint64_t cookie;	/* cookie returned for destroy function */
+};
+
+struct physmem_map_param {
+	uint64_t req_paddr;	/* requested physical address */
+	uint64_t ret_va;	/* VA which mapped req_paddr */
+	uint32_t flags;		/* flags for cage or retired pages */
+};
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _PHYSMEM_H */
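Together with the driver above, this header defines the whole user interface: PHYSMEM_SETUP reserves a VA range and returns a cookie, PHYSMEM_MAP captures one page at a time into that range, and PHYSMEM_DESTROY tears the mapping down. The following userland sketch is illustrative only and not part of this changeset; the physical address is hypothetical, and the open fails without the privileges checked in physmem_open().

#include <sys/types.h>
#include <sys/physmem.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <stropts.h>
#include <unistd.h>

int
main(void)
{
	struct physmem_setup_param psp;
	struct physmem_map_param pmp;
	uint64_t cookie;
	int fd;

	/* the driver requires FREAD|FREAD, i.e. O_RDWR */
	if ((fd = open("/dev/physmem", O_RDWR)) == -1)
		return (1);

	/* reserve VA space for two pages of physical memory */
	(void) memset(&psp, 0, sizeof (psp));
	psp.req_paddr = 0x10000000ULL;	/* hypothetical test address */
	psp.len = 0x2000;		/* must be PAGESIZE aligned */
	psp.user_va = 0;		/* let the driver pick the VA */
	if (ioctl(fd, PHYSMEM_SETUP, &psp) == -1)
		return (1);
	cookie = psp.cookie;

	/* try to capture the first page; EAGAIN means retry later */
	(void) memset(&pmp, 0, sizeof (pmp));
	pmp.req_paddr = psp.req_paddr;
	pmp.flags = 0;	/* or PHYSMEM_CAGE / PHYSMEM_RETIRED as needed */
	if (ioctl(fd, PHYSMEM_MAP, &pmp) == 0)
		(void) printf("page mapped at va 0x%llx\n",
		    (u_longlong_t)pmp.ret_va);

	/* tear down the mapping set up above */
	(void) ioctl(fd, PHYSMEM_DESTROY, &cookie);
	(void) close(fd);
	return (0);
}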
--- a/usr/src/uts/common/sys/thread.h	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/sys/thread.h	Thu Dec 14 17:27:13 2006 -0800
@@ -359,6 +359,7 @@
 #define	T_WATCHPT	0x0400	/* thread undergoing a watchpoint emulation */
 #define	T_PANIC		0x0800	/* thread initiated a system panic */
 #define	T_DFLTSTK	0x1000	/* stack is default size */
+#define	T_CAPTURING	0x2000	/* thread is in page capture logic */
 
 /*
  * Flags in t_proc_flag.
--- a/usr/src/uts/common/vm/page.h	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/page.h	Thu Dec 14 17:27:13 2006 -0800
@@ -667,6 +667,7 @@
 void page_free_pages(page_t *);
 void free_vp_pages(struct vnode *, u_offset_t, size_t);
 int page_reclaim(page_t *, kmutex_t *);
+int page_reclaim_pages(page_t *, kmutex_t *, uint_t);
 void page_destroy(page_t *, int);
 void page_destroy_pages(page_t *);
 void page_destroy_free(page_t *);
@@ -702,8 +703,9 @@
 int page_tryupgrade(page_t *);
 void page_downgrade(page_t *);
 void page_unlock(page_t *);
-void page_unlock_noretire(page_t *);
+void page_unlock_nocapture(page_t *);
 void page_lock_delete(page_t *);
+int page_deleted(page_t *);
 int page_pp_lock(page_t *, int, int);
 void page_pp_unlock(page_t *, int, int);
 int page_resv(pgcnt_t, uint_t);
@@ -725,7 +727,7 @@
 page_t *page_next_scan_init(void **);
 page_t *page_next_scan_large(page_t *, ulong_t *, void **);
 void    prefetch_page_r(void *);
-void	ppcopy(page_t *, page_t *);
+int	ppcopy(page_t *, page_t *);
 void	page_relocate_hash(page_t *, page_t *);
 void	pagezero(page_t *, uint_t, uint_t);
 void	pagescrub(page_t *, uint_t, uint_t);
@@ -750,8 +752,7 @@
 int	page_unretire(uint64_t);
 int	page_unretire_pp(page_t *, int);
 void	page_tryretire(page_t *);
-void	page_retire_hunt(void (*)(page_t *));
-void	page_retire_mdboot_cb(page_t *);
+void	page_retire_mdboot();
 void	page_clrtoxic(page_t *, uchar_t);
 void	page_settoxic(page_t *, uchar_t);
@@ -910,6 +911,15 @@
  *
  * Note that, while p_toxic bits can be set without holding any locks, they
  * should only be cleared while holding the page exclusively locked.
+ * There is one exception to this, the PR_CAPTURE bit is protected by a mutex
+ * within the page capture logic and thus to set or clear the bit, that mutex
+ * needs to be held.  The page does not need to be locked but the page_clrtoxic
+ * function must be used as we need an atomic operation.
+ * Also note that there is what amounts to a hack to prevent recursion with
+ * large pages such that if we are unlocking a page and the PR_CAPTURE bit is
+ * set, we will only try to capture the page if the current thread's T_CAPTURING
+ * flag is not set.  If the flag is set, the unlock will not try to capture
+ * the page even though the PR_CAPTURE bit is set.
  *
  * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
  * with PR_MCE are retired if the system has not retired too many of them.
@@ -931,15 +941,15 @@
 #define	PR_UE		0x02	/* page has an unhandled UE */
 #define	PR_UE_SCRUBBED	0x04	/* page has seen a UE but was cleaned */
 #define	PR_FMA		0x08	/* A DE wants this page retired */
-#define	PR_RESV		0x10	/* Reserved for future use */
-#define	PR_BUSY		0x20	/* Page retire is in progress */
+#define	PR_CAPTURE	0x10	/* Generic page capture flag */
+#define	PR_RESV		0x20	/* Reserved for future use */
 #define	PR_MSG		0x40	/* message(s) already printed for this page */
 #define	PR_RETIRED	0x80	/* This page has been retired */
 
 #define	PR_REASONS	(PR_UE | PR_MCE | PR_FMA)
 #define	PR_TOXIC	(PR_UE)
 #define	PR_ERRMASK	(PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA)
-#define	PR_ALLFLAGS	(0xFF)
+#define	PR_TOXICFLAGS	(0xCF)
 
 #define	PP_RETIRED(pp)	((pp)->p_toxic & PR_RETIRED)
 #define	PP_TOXIC(pp)	((pp)->p_toxic & PR_TOXIC)
@@ -949,6 +959,13 @@
 	!PP_ISKVP(pp))
 
 /*
+ * Flags for page_unretire_pp
+ */
+#define	PR_UNR_FREE	0x1
+#define	PR_UNR_CLEAN	0x2
+#define	PR_UNR_TEMP	0x4
+
+/*
  * kpm large page description.
  * The virtual address range of segkpm is divided into chunks of
  * kpm_pgsz.  Each chunk is controlled by a kpm_page_t.  The ushort
@@ -1064,6 +1081,57 @@
 void build_pfn_hash();
 extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum);
 
+/*
+ * page capture related info:
+ * The page capture routines allow us to asynchronously capture given pages
+ * for the explicit use of the requestor.  New requestors can be added by
+ * explicitly adding themselves to the PC_* flags below and incrementing
+ * PC_NUM_CALLBACKS as necessary.
+ *
+ * Subsystems using page capture must register a callback before attempting
+ * to capture a page.  A duration of -1 will indicate that we will never give
+ * up while trying to capture a page and will only stop trying to capture the
+ * given page once we have successfully captured it.  Thus the user needs to be
+ * aware of the behavior of all callers who have a duration of -1.
+ *
+ * For now, only /dev/physmem and page retire use the page capture interface
+ * and only a single request can be outstanding for a given page.  Thus, if
+ * /dev/physmem wants a page and page retire also wants the same page, only
+ * the page retire request will be honored until the point in time that the
+ * page is actually retired, at which point in time, subsequent requests by
+ * /dev/physmem will succeed if the CAPTURE_GET_RETIRED flag was set.
+ */
+
+#define	PC_RETIRE		(0)
+#define	PC_PHYSMEM		(1)
+#define	PC_NUM_CALLBACKS	(2)
+#define	PC_MASK			((1 << PC_NUM_CALLBACKS) - 1)
+
+#define	CAPTURE_RETIRE		(1 << PC_RETIRE)
+#define	CAPTURE_PHYSMEM		(1 << PC_PHYSMEM)
+
+#define	CAPTURE_ASYNC		(0x0200)
+
+#define	CAPTURE_GET_RETIRED	(0x1000)
+#define	CAPTURE_GET_CAGE	(0x2000)
+
+struct page_capture_callback {
+	int cb_active;		/* 1 means active, 0 means inactive */
+	clock_t duration;	/* the length in time that we'll attempt to */
+				/* capture this page asynchronously. (in HZ) */
+	krwlock_t cb_rwlock;
+	int (*cb_func)(page_t *, void *, uint_t);	/* callback function */
+};
+
+extern kcondvar_t pc_cv;
+
+void page_capture_register_callback(uint_t index, clock_t duration,
+    int (*cb_func)(page_t *, void *, uint_t));
+void page_capture_unregister_callback(uint_t index);
+int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
+void page_unlock_capture(page_t *pp);
+int page_capture_unretire_pp(page_t *);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/vm/page_lock.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/page_lock.c	Thu Dec 14 17:27:13 2006 -0800
@@ -585,7 +585,7 @@
  * freelist manager; please don't call it.
  */
 void
-page_unlock_noretire(page_t *pp)
+page_unlock_nocapture(page_t *pp)
 {
 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
 	selock_t old;
@@ -598,7 +598,7 @@
 		if (CV_HAS_WAITERS(&pp->p_cv))
 			cv_broadcast(&pp->p_cv);
 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
-		panic("page_unlock_noretire: page %p is deleted", pp);
+		panic("page_unlock_nocapture: page %p is deleted", pp);
 	} else if (old < 0) {
 		THREAD_KPRI_RELEASE();
 		pp->p_selock &= SE_EWANTED;
@@ -607,7 +607,7 @@
 	} else if ((old & ~SE_EWANTED) > SE_READER) {
 		pp->p_selock = old - SE_READER;
 	} else {
-		panic("page_unlock_noretire: page %p is not locked", pp);
+		panic("page_unlock_nocapture: page %p is not locked", pp);
 	}
 
 	mutex_exit(pse);
@@ -643,23 +643,21 @@
 		panic("page_unlock: page %p is not locked", pp);
 	}
 
-	if (pp->p_selock == 0 && PP_PR_REQ(pp)) {
+	if (pp->p_selock == 0) {
 		/*
-		 * Try to retire the page. If it retires, great.
-		 * If not, oh well, we'll get it in the next unlock
-		 * request, and repeat the cycle.  Regardless,
-		 * page_tryretire() will drop the page lock.
+		 * If the T_CAPTURING bit is set, that means that we should
+		 * not try and capture the page again as we could recurse
+		 * which could lead to a stack overflow panic or spending a
+		 * relatively long time in the kernel making no progress.
 		 */
-		if ((pp->p_toxic & PR_BUSY) == 0) {
+		if ((pp->p_toxic & PR_CAPTURE) &&
+		    !(curthread->t_flag & T_CAPTURING) &&
+		    !PP_RETIRED(pp)) {
 			THREAD_KPRI_REQUEST();
 			pp->p_selock = SE_WRITER;
-			page_settoxic(pp, PR_BUSY);
 			mutex_exit(pse);
-			page_tryretire(pp);
+			page_unlock_capture(pp);
 		} else {
-			pp->p_selock = SE_WRITER;
-			page_clrtoxic(pp, PR_BUSY);
-			pp->p_selock = 0;
 			mutex_exit(pse);
 		}
 	} else {
@@ -736,6 +734,12 @@
 	mutex_exit(pse);
 }
 
+int
+page_deleted(page_t *pp)
+{
+	return (pp->p_selock == SE_DELETED);
+}
+
 /*
  * Implement the io lock for pages
  */
--- a/usr/src/uts/common/vm/page_retire.c	Thu Dec 14 16:42:14 2006 -0800
+++ b/usr/src/uts/common/vm/page_retire.c	Thu Dec 14 17:27:13 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -85,28 +84,24 @@
 /*
  * Things to fix:
  *
- * 	1. Cleanup SE_EWANTED.  Since we're aggressive about trying to retire
- *	pages, we can use page_retire_pp() to replace SE_EWANTED and all
- *	the special delete_memory_thread() code just goes away.
- *
- * 	2. Trying to retire non-relocatable kvp pages may result in a
+ * 	1. Trying to retire non-relocatable kvp pages may result in a
  * 	quagmire. This is because seg_kmem() no longer keeps its pages locked,
  * 	and calls page_lookup() in the free path; since kvp pages are modified
  * 	and don't have a usable backing store, page_retire() can't do anything
  * 	with them, and we'll keep denying the lock to seg_kmem_free() in a
  * 	vicious cycle. To prevent that, we don't deny locks to kvp pages, and
- * 	hence only call page_retire_pp() from page_unlock() in the free path.
+ * 	hence only try to retire a page from page_unlock() in the free path.
 * 	Since most kernel pages are indefinitely held anyway, and don't
 * 	participate in I/O, this is of little consequence.
 *
- * 	3. Low memory situations will be interesting.  If we don't have
+ * 	2. Low memory situations will be interesting.  If we don't have
 * 	enough memory for page_relocate() to succeed, we won't be able to
 * 	retire dirty pages; nobody will be able to push them out to disk
 * 	either, since we aggressively deny the page lock.  We could change
 * 	fsflush so it can recognize this situation, grab the lock, and push
 * 	the page out, where we'll catch it in the free path and retire it.
 *
- * 	4. Beware of places that have code like this in them:
+ * 	3. Beware of places that have code like this in them:
 *
 *	if (! page_tryupgrade(pp)) {
 *		page_unlock(pp);
@@ -125,7 +120,7 @@
 * page, and then unlock the page.  Page_free() will then go castors
 * up.  So if anybody is doing this, it's already a bug.
 *
- * 	5. mdboot()'s call into page_retire_hunt() should probably be
+ * 	4. mdboot()'s call into page_retire_mdboot() should probably be
 * 	moved lower.  Where the call is made now, we can get into trouble
 * 	by scrubbing a kernel page that is then accessed later.
 */
@@ -154,18 +149,7 @@
 */
 vnode_t *retired_pages;
 
-/*
- * Background thread that wakes up periodically to try to retire pending
- * pages.  This prevents threads from becoming blocked indefinitely in
- * page_lookup() or some other routine should the page(s) they are waiting
- * on become eligible for social security.
- */
-static void page_retire_thread(void);
-static kthread_t *pr_thread_id;
-static kcondvar_t pr_cv;
-static kmutex_t pr_thread_mutex;
-static clock_t pr_thread_shortwait;
-static clock_t pr_thread_longwait;
+static int page_retire_pp_finish(page_t *, void *, uint_t);
 
 /*
  * Make a list of all of the pages that have been marked for retirement
@@ -243,6 +227,13 @@
 #define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)
 
 /*
+ * page retire kstats to list all retired pages
+ */
+static int pr_list_kstat_update(kstat_t *ksp, int rw);
+static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
+kmutex_t pr_list_kstat_mutex;
+
+/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
@@ -473,11 +464,13 @@
 * Note that multiple bits may be cleared in a single clrtoxic operation.
 * Must be called with the page exclusively locked to prevent races which
 * may attempt to retire a page without any toxic bits set.
+ * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
+ * being held as there is a separate mutex which protects that bit.
 */
 void
 page_clrtoxic(page_t *pp, uchar_t bits)
 {
-	ASSERT(PAGE_EXCL(pp));
+	ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
 	atomic_and_8(&pp->p_toxic, ~bits);
 }
@@ -523,82 +516,6 @@
 }
 
 /*
- * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages
- * that we were not able to retire.  On large machines, walking the complete
- * page_t array and looking at every page_t takes too long.  So, as a page is
- * marked toxic, we track it using a list that can be processed at reboot
- * time.  page_retire_enqueue() will do its best to try to avoid duplicate
- * entries, but if we get too many errors at once the queue can overflow,
- * in which case we will end up walking every page_t as a last resort.
- * The background thread also makes use of this queue to find which pages
- * are pending retirement.
- */
-static void
-page_retire_enqueue(page_t *pp)
-{
-	int	nslot = -1;
-	int	i;
-
-	mutex_enter(&pr_q_mutex);
-
-	/*
-	 * Check to make sure retire hasn't already dequeued it.
-	 * In the meantime if the page was cleaned up, no need
-	 * to enqueue it.
-	 */
-	if (PP_RETIRED(pp) || pp->p_toxic == 0) {
-		mutex_exit(&pr_q_mutex);
-		PR_DEBUG(prd_noaction);
-		return;
-	}
-
-	for (i = 0; i < PR_PENDING_QMAX; i++) {
-		if (pr_pending_q[i] == pp) {
-			mutex_exit(&pr_q_mutex);
-			PR_DEBUG(prd_qdup);
-			return;
-		} else if (nslot == -1 && pr_pending_q[i] == NULL) {
-			nslot = i;
-		}
-	}
-
-	PR_INCR_KSTAT(pr_pending);
-
-	if (nslot != -1) {
-		pr_pending_q[nslot] = pp;
-		PR_DEBUG(prd_queued);
-	} else {
-		PR_INCR_KSTAT(pr_enqueue_fail);
-		PR_DEBUG(prd_notqueued);
-	}
-	mutex_exit(&pr_q_mutex);
-}
-
-static void
-page_retire_dequeue(page_t *pp)
-{
-	int i;
-
-	mutex_enter(&pr_q_mutex);
-
-	for (i = 0; i < PR_PENDING_QMAX; i++) {
-		if (pr_pending_q[i] == pp) {
-			pr_pending_q[i] = NULL;
-			break;
-		}
-	}
-
-	if (i == PR_PENDING_QMAX) {
-		PR_INCR_KSTAT(pr_dequeue_fail);
-	}
-
-	PR_DECR_KSTAT(pr_pending);
-	PR_DEBUG(prd_dequeue);
-
-	mutex_exit(&pr_q_mutex);
-}
-
-/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
@@ -626,8 +543,6 @@
 	}
 
 	page_settoxic(pp, PR_RETIRED);
-	page_clrtoxic(pp, PR_BUSY);
-	page_retire_dequeue(pp);
 	PR_INCR_KSTAT(pr_retired);
 
 	if (pp->p_toxic & PR_FMA) {
@@ -784,8 +699,7 @@
 	} else {
 		PR_INCR_KSTAT(pr_ue_cleared_free);
 
-		page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
-		page_retire_dequeue(pp);
+		page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);
 
 		/* LINTED: CONSTCOND */
 		VN_DISPOSE(pp, B_FREE, 1, kcred);
@@ -825,6 +739,83 @@
 	/*NOTREACHED*/
 }
 
+static int
+pr_list_kstat_update(kstat_t *ksp, int rw)
+{
+	uint_t count;
+	page_t *pp;
+	kmutex_t *vphm;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	vphm = page_vnode_mutex(retired_pages);
+	mutex_enter(vphm);
+	/* Needs to be under a lock so that the for loop will work right */
+	if (retired_pages->v_pages == NULL) {
+		mutex_exit(vphm);
+		ksp->ks_ndata = 0;
+		ksp->ks_data_size = 0;
+		return (0);
+	}
+
+	count = 1;
+	for (pp = retired_pages->v_pages->p_vpnext;
+	    pp != retired_pages->v_pages; pp = pp->p_vpnext) {
+		count++;
+	}
+	mutex_exit(vphm);
+
+	ksp->ks_ndata = count;
+	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
+
+	return (0);
+}
+
+/*
+ * all spans will be pagesize and no coalescing will be done with the
+ * list produced.
+ */
+static int
+pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
+{
+	kmutex_t *vphm;
+	page_t *pp;
+	struct memunit {
+		uint64_t address;
+		uint64_t size;
+	} *kspmem;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	ksp->ks_snaptime = gethrtime();
+
+	kspmem = (struct memunit *)buf;
+
+	vphm = page_vnode_mutex(retired_pages);
+	mutex_enter(vphm);
+	pp = retired_pages->v_pages;
+	if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
+	    (pp == NULL)) {
+		mutex_exit(vphm);
+		return (0);
+	}
+	kspmem->address = ptob(pp->p_pagenum);
+	kspmem->size = PAGESIZE;
+	kspmem++;
+	for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
+	    pp = pp->p_vpnext, kspmem++) {
+		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
+			break;
+		kspmem->address = ptob(pp->p_pagenum);
+		kspmem->size = PAGESIZE;
+	}
+	mutex_exit(vphm);
+
+	return (0);
+}
+
 /*
 * Initialize the page retire mechanism:
@@ -833,13 +824,14 @@
 *  - Build the retired_pages vnode.
 *  - Set up the kstats.
 *  - Fire off the background thread.
- *  - Tell page_tryretire() it's OK to start retiring pages.
+ *  - Tell page_retire() it's OK to start retiring pages.
 */
 void
 page_retire_init(void)
 {
 	const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL};
 	struct vnodeops *vops;
+	kstat_t *ksp;
 
 	const uint_t page_retire_ndata =
 	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);
@@ -869,13 +861,17 @@
 		kstat_install(page_retire_ksp);
 	}
 
-	pr_thread_shortwait = 23 * hz;
-	pr_thread_longwait = 1201 * hz;
-	mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&pr_cv, NULL, CV_DEFAULT, NULL);
-	pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0,
-	    TS_RUN, minclsyspri);
+	mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
+	ksp = kstat_create("unix", 0, "page_retire_list", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
+	if (ksp != NULL) {
+		ksp->ks_update = pr_list_kstat_update;
+		ksp->ks_snapshot = pr_list_kstat_snapshot;
+		ksp->ks_lock = &pr_list_kstat_mutex;
+		kstat_install(ksp);
+	}
 
+	page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
 	pr_enable = 1;
 }
@@ -914,122 +910,17 @@
 	pp->p_toxic = 0;
 }
 
-/*
- * Hunt down any pages in the system that have not yet been retired, invoking
- * the provided callback function on each of them.
- */
-void
-page_retire_hunt(void (*callback)(page_t *))
-{
-	page_t *pp;
-	page_t *first;
-	uint64_t tbr, found;
-	int i;
-
-	PR_DEBUG(prd_hunt);
-
-	if (PR_KSTAT_PENDING == 0) {
-		return;
-	}
-
-	PR_DEBUG(prd_dohunt);
-
-	found = 0;
-	mutex_enter(&pr_q_mutex);
-
-	tbr = PR_KSTAT_PENDING;
-
-	for (i = 0; i < PR_PENDING_QMAX; i++) {
-		if ((pp = pr_pending_q[i]) != NULL) {
-			mutex_exit(&pr_q_mutex);
-			callback(pp);
-			mutex_enter(&pr_q_mutex);
-			found++;
-		}
-	}
-
-	if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == tbr) {
-		mutex_exit(&pr_q_mutex);
-		PR_DEBUG(prd_earlyhunt);
-		return;
-	}
-	mutex_exit(&pr_q_mutex);
-
-	PR_DEBUG(prd_latehunt);
-
-	/*
-	 * We've lost track of a page somewhere. Hunt it down.
-	 */
-	memsegs_lock(0);
-	pp = first = page_first();
-	do {
-		if (PP_PR_REQ(pp)) {
-			callback(pp);
-			if (++found == tbr) {
-				break;	/* got 'em all */
-			}
-		}
-	} while ((pp = page_next(pp)) != first);
-	memsegs_unlock(0);
-}
 
 /*
- * The page_retire_thread loops forever, looking to see if there are
- * pages still waiting to be retired.
+ * Callback used by page_trycapture() to finish off retiring a page.
+ * The page has already been cleaned and we've been given sole access to
+ * it.
+ * Always returns 0 to indicate that the callback succeeded as the callback
+ * never fails to finish retiring the given page.
 */
-static void
-page_retire_thread(void)
-{
-	callb_cpr_t c;
-
-	CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire");
-
-	mutex_enter(&pr_thread_mutex);
-	for (;;) {
-		if (pr_enable && PR_KSTAT_PENDING) {
-			/*
-			 * Sigh. It's SO broken how we have to try to shake
-			 * loose the holder of the page. Since we have no
-			 * idea who or what has it locked, we go bang on
-			 * every door in the city to try to locate it.
-			 */
-			kmem_reap();
-			seg_preap();
-			page_retire_hunt(page_retire_thread_cb);
-			CALLB_CPR_SAFE_BEGIN(&c);
-			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
-			    lbolt + pr_thread_shortwait);
-			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
-		} else {
-			CALLB_CPR_SAFE_BEGIN(&c);
-			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
-			    lbolt + pr_thread_longwait);
-			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
-		}
-	}
-	/*NOTREACHED*/
-}
-
-/*
- * page_retire_pp() decides what to do with a failing page.
- *
- * When we get a free page (e.g. the scrubber or in the free path) life is
- * nice because the page is clean and marked free -- those always retire
- * nicely. From there we go by order of difficulty. If the page has data,
- * we attempt to relocate its contents to a suitable replacement page. If
- * that does not succeed, we look to see if it is clean. If after all of
- * this we have a clean, unmapped page (which we usually do!), we retire it.
- * If the page is not clean, we still process it regardless on a UE; for
- * CEs or FMA requests, we fail leaving the page in service. The page will
- * eventually be tried again later. We always return with the page unlocked
- * since we are called from page_unlock().
- *
- * We don't call panic or do anything fancy down in here.  Our boss the DE
- * gets paid handsomely to do his job of figuring out what to do when errors
- * occur. We just do what he tells us to do.
- */
+/*ARGSUSED*/
 static int
-page_retire_pp(page_t *pp)
+page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
 {
 	int		toxic;
@@ -1037,102 +928,7 @@
 	ASSERT(pp->p_iolock_state == 0);
 	ASSERT(pp->p_szc == 0);
 
-	PR_DEBUG(prd_top);
-	PR_TYPES(pp);
-
 	toxic = pp->p_toxic;
-	ASSERT(toxic & PR_REASONS);
-
-	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
-	    page_retire_limit()) {
-		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
-		page_retire_dequeue(pp);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_LIMIT));
-	}
-
-	if (PP_ISFREE(pp)) {
-		int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0;
-
-		PR_DEBUG(prd_free);
-
-		if (dbgnoreclaim || !page_reclaim(pp, NULL)) {
-			PR_DEBUG(prd_noreclaim);
-			PR_INCR_KSTAT(pr_failed);
-			/*
-			 * page_reclaim() returns with `pp' unlocked when
-			 * it fails.
-			 */
-			if (dbgnoreclaim)
-				page_unlock(pp);
-			return (page_retire_done(pp, PRD_FAILED));
-		}
-	}
-	ASSERT(!PP_ISFREE(pp));
-
-	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
-	    MTBF(reloc_calls, reloc_mtbf)) {
-		page_t *newpp;
-		spgcnt_t count;
-
-		/*
-		 * If we can relocate the page, great! newpp will go
-		 * on without us, and everything is fine.  Regardless
-		 * of whether the relocation succeeds, we are still
-		 * going to take `pp' around back and shoot it.
-		 */
-		newpp = NULL;
-		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
-			PR_DEBUG(prd_reloc);
-			page_unlock(newpp);
-			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
-		} else {
-			PR_DEBUG(prd_relocfail);
-		}
-	}
-
-	if (hat_ismod(pp)) {
-		PR_DEBUG(prd_mod);
-		PR_INCR_KSTAT(pr_failed);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	if (PP_ISKVP(pp)) {
-		PR_DEBUG(prd_kern);
-		PR_INCR_KSTAT(pr_failed_kernel);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	if (pp->p_lckcnt || pp->p_cowcnt) {
-		PR_DEBUG(prd_locked);
-		PR_INCR_KSTAT(pr_failed);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
-	ASSERT(!hat_page_is_mapped(pp));
-
-	/*
-	 * If the page is modified, and was not relocated; we can't
-	 * retire it without dropping data on the floor. We have to
-	 * recheck after unloading since the dirty bit could have been
-	 * set since we last checked.
-	 */
-	if (hat_ismod(pp)) {
-		PR_DEBUG(prd_mod_late);
-		PR_INCR_KSTAT(pr_failed);
-		page_unlock(pp);
-		return (page_retire_done(pp, PRD_FAILED));
-	}
-
-	if (pp->p_vnode) {
-		PR_DEBUG(prd_hashout);
-		page_hashout(pp, NULL);
-	}
-	ASSERT(!pp->p_vnode);
 
 	/*
 	 * The problem page is locked, demoted, unmapped, not free,
@@ -1141,62 +937,45 @@
 	 * Now we select our ammunition, take it around back, and shoot it.
 	 */
 	if (toxic & PR_UE) {
+ue_error:
 		if (page_retire_transient_ue(pp)) {
 			PR_DEBUG(prd_uescrubbed);
-			return (page_retire_done(pp, PRD_UE_SCRUBBED));
+			(void) page_retire_done(pp, PRD_UE_SCRUBBED);
 		} else {
 			PR_DEBUG(prd_uenotscrubbed);
 			page_retire_destroy(pp);
-			return (page_retire_done(pp, PRD_SUCCESS));
+			(void) page_retire_done(pp, PRD_SUCCESS);
 		}
+		return (0);
 	} else if (toxic & PR_FMA) {
 		PR_DEBUG(prd_fma);
 		page_retire_destroy(pp);
-		return (page_retire_done(pp, PRD_SUCCESS));
+		(void) page_retire_done(pp, PRD_SUCCESS);
+		return (0);
 	} else if (toxic & PR_MCE) {
 		PR_DEBUG(prd_mce);
 		page_retire_destroy(pp);
-		return (page_retire_done(pp, PRD_SUCCESS));
-	}
-	panic("page_retire_pp: bad toxic flags %d", toxic);
-	/*NOTREACHED*/
-}
-
-/*
- * Try to retire a page when we stumble onto it in the page lock routines.
- */
-void
-page_tryretire(page_t *pp)
-{
-	ASSERT(PAGE_EXCL(pp));
-
-	if (!pr_enable) {
-		page_unlock(pp);
-		return;
+		(void) page_retire_done(pp, PRD_SUCCESS);
+		return (0);
 	}
 
 	/*
-	 * If the page is a big page, try to break it up.
-	 *
-	 * If there are other bad pages besides `pp', they will be
-	 * recursively retired for us thanks to a bit of magic.
-	 * If the page is a small page with errors, try to retire it.
+	 * When page_retire_first_ue is set to zero and a UE occurs which is
+	 * transient, it's possible that we clear some flags set by a second
+	 * UE error on the page which occurs while the first is currently being
+	 * handled and thus we need to handle the case where none of the above
+	 * are set.  In this instance, PR_UE_SCRUBBED should be set and thus
+	 * we should execute the UE code above.
 	 */
-	if (pp->p_szc > 0) {
-		if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
-			page_unlock(pp);
-			PR_DEBUG(prd_nofreedemote);
-			return;
-		} else if (!page_try_demote_pages(pp)) {
-			page_unlock(pp);
-			PR_DEBUG(prd_nodemote);
-			return;
-		}
-		PR_DEBUG(prd_demoted);
-		page_unlock(pp);
-	} else {
-		(void) page_retire_pp(pp);
+	if (toxic & PR_UE_SCRUBBED) {
+		goto ue_error;
 	}
+
+	/*
+	 * It's impossible to get here.
+	 */
+	panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
+	return (0);
 }
 
 /*
@@ -1204,12 +983,10 @@
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
- * retired right away.  To deal with that, bits are set in p_toxic of the
- * page_t.  An attempt is made to lock the page; if the attempt is successful,
- * we instantly unlock the page counting on page_unlock() to notice p_toxic
- * is nonzero and to call back into page_retire_pp().  Success is determined
- * by looking to see whether the page has been retired once it has been
- * unlocked.
+ * retired right away.  We use the page capture logic to deal with this
+ * situation as it will continuously try to retire the page in the background
+ * if the first attempt fails.  Success is determined by looking to see whether
+ * the page has been retired after the page_trycapture() attempt.
 *
 * Returns:
 *
@@ -1247,22 +1024,20 @@
 		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
 		    " page 0x%08x.%08x", pa);
 	}
-	page_settoxic(pp, reason);
-	page_retire_enqueue(pp);
+
+	/* Avoid setting toxic bits in the first place */
+	if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
+	    page_retire_limit()) {
+		return (page_retire_done(pp, PRD_LIMIT));
+	}
 
-	/*
-	 * And now for some magic.
-	 *
-	 * We marked this page toxic up above.  All there is left to do is
-	 * to try to lock the page and then unlock it.  The page lock routines
-	 * will intercept the page and retire it if they can.  If the page
-	 * cannot be locked, 's okay -- page_unlock() will eventually get it,
-	 * or the background thread, until then the lock routines will deny
-	 * further locks on it.
-	 */
-	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
-		PR_DEBUG(prd_prlocked);
-		page_unlock(pp);
+	if (MTBF(pr_calls, pr_mtbf)) {
+		page_settoxic(pp, reason);
+		if (page_trycapture(pp, 0, CAPTURE_RETIRE, NULL) == 0) {
+			PR_DEBUG(prd_prlocked);
+		} else {
+			PR_DEBUG(prd_prnotlocked);
+		}
 	} else {
 		PR_DEBUG(prd_prnotlocked);
 	}
@@ -1271,7 +1046,7 @@
 		PR_DEBUG(prd_prretired);
 		return (0);
 	} else {
-		cv_signal(&pr_cv);
+		cv_signal(&pc_cv);
 		PR_INCR_KSTAT(pr_failed);
 
 		if (pp->p_toxic & PR_MSG) {
@@ -1291,15 +1066,24 @@
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
+ *
+ * If flags is:
+ *	PR_UNR_FREE - lock the page, clear the toxic flags and free it
+ *	    to the freelist.
+ *	PR_UNR_TEMP - lock the page, unretire it, leave the toxic
+ *	    bits set as is and return it to the caller.
+ *	PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
+ *	    toxic flags and return it to caller as is.
 */
 int
-page_unretire_pp(page_t *pp, int free)
+page_unretire_pp(page_t *pp, int flags)
 {
 	/*
 	 * To be retired, a page has to be hashed onto the retired_pages vnode
 	 * and have PR_RETIRED set in p_toxic.
 	 */
-	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
+	if (flags == PR_UNR_CLEAN ||
+	    page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
 		ASSERT(PAGE_EXCL(pp));
 		PR_DEBUG(prd_ulocked);
 		if (!PP_RETIRED(pp)) {
@@ -1317,9 +1101,13 @@
 		} else {
 			PR_DECR_KSTAT(pr_mce);
 		}
-		page_clrtoxic(pp, PR_ALLFLAGS);
 
-		if (free) {
+		if (flags == PR_UNR_TEMP)
+			page_clrtoxic(pp, PR_RETIRED);
+		else
+			page_clrtoxic(pp, PR_TOXICFLAGS);
+
+		if (flags == PR_UNR_FREE) {
 			PR_DEBUG(prd_udestroy);
 			page_destroy(pp, 0);
 		} else {
@@ -1363,7 +1151,7 @@
 		return (page_retire_done(pp, PRD_INVALID_PA));
 	}
 
-	return (page_unretire_pp(pp, 1));
+	return (page_unretire_pp(pp, PR_UNR_FREE));
 }
 
 /*
@@ -1462,12 +1250,14 @@
 			page_unlock(lpp);
 			continue;
 		}
-		page_settoxic(cpp, PR_FMA | PR_BUSY);
-		page_settoxic(cpp2, PR_FMA);
-		page_tryretire(cpp);	/* will fail */
+
+		/* fails */
+		(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
+
 		page_unlock(lpp);
-		(void) page_retire(cpp->p_pagenum, PR_FMA);
-		(void) page_retire(cpp2->p_pagenum, PR_FMA);
+		page_unlock(cpp);
+		(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
+		(void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
 	}
 } while ((pp = page_next(pp)) != first);
 memsegs_unlock(0);
* * Returns 0 if page pp was unretired; else an error code. + * + * If flags is: + * PR_UNR_FREE - lock the page, clear the toxic flags and free it + * to the freelist. + * PR_UNR_TEMP - lock the page, unretire it, leave the toxic + * bits set as is and return it to the caller. + * PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the + * toxic flags and return it to caller as is. */ int -page_unretire_pp(page_t *pp, int free) +page_unretire_pp(page_t *pp, int flags) { /* * To be retired, a page has to be hashed onto the retired_pages vnode * and have PR_RETIRED set in p_toxic. */ - if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) { + if (flags == PR_UNR_CLEAN || + page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) { ASSERT(PAGE_EXCL(pp)); PR_DEBUG(prd_ulocked); if (!PP_RETIRED(pp)) { @@ -1317,9 +1101,13 @@ } else { PR_DECR_KSTAT(pr_mce); } - page_clrtoxic(pp, PR_ALLFLAGS); - if (free) { + if (flags == PR_UNR_TEMP) + page_clrtoxic(pp, PR_RETIRED); + else + page_clrtoxic(pp, PR_TOXICFLAGS); + + if (flags == PR_UNR_FREE) { PR_DEBUG(prd_udestroy); page_destroy(pp, 0); } else { @@ -1363,7 +1151,7 @@ return (page_retire_done(pp, PRD_INVALID_PA)); } - return (page_unretire_pp(pp, 1)); + return (page_unretire_pp(pp, PR_UNR_FREE)); } /* @@ -1462,12 +1250,14 @@ page_unlock(lpp); continue; } - page_settoxic(cpp, PR_FMA | PR_BUSY); - page_settoxic(cpp2, PR_FMA); - page_tryretire(cpp); /* will fail */ + + /* fails */ + (void) page_retire(ptob(cpp->p_pagenum), PR_FMA); + page_unlock(lpp); - (void) page_retire(cpp->p_pagenum, PR_FMA); - (void) page_retire(cpp2->p_pagenum, PR_FMA); + page_unlock(cpp); + (void) page_retire(ptob(cpp->p_pagenum), PR_FMA); + (void) page_retire(ptob(cpp2->p_pagenum), PR_FMA); } } while ((pp = page_next(pp)) != first); memsegs_unlock(0);
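The reworked retire path hands the hard work to the page capture framework: page_retire() marks the page toxic and calls page_trycapture() with CAPTURE_RETIRE, and once the framework owns a clean, demoted, unmapped page it invokes page_retire_pp_finish() as the registered callback. A minimal sketch of how such a callback would be wired up at init time; the PC_RETIRE index and the -1 (never expires) duration follow the conventions described in the vm_page.c hunks below, but this exact registration call is an assumption for illustration, not part of this changeset:

	/*
	 * Sketch only: hypothetical init-time hookup of the retire
	 * callback into the page capture framework.
	 */
	void
	page_retire_init_sketch(void)
	{
		/* retire requests never expire, hence the -1 duration */
		page_capture_register_callback(PC_RETIRE, (clock_t)-1,
		    page_retire_pp_finish);
	}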
--- a/usr/src/uts/common/vm/vm_anon.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/common/vm/vm_anon.c Thu Dec 14 17:27:13 2006 -0800 @@ -2314,7 +2314,16 @@ * which is locked and loaded in the MMU by * the caller to prevent yet another page fault. */ - ppcopy(opp, pp); /* XXX - should set mod bit in here */ + /* XXX - should set mod bit in here */ + if (ppcopy(opp, pp) == 0) { + /* + * Before ppcopy could handle UE or other faults, we + * would have panicked here, and still have no option + * but to do so now. + */ + panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p", + opp, pp); + } hat_setrefmod(pp); /* mark as modified */ @@ -2557,7 +2566,14 @@ /* * Now copy the contents from the original page. */ - ppcopy(ppa[pg_idx], pp); + if (ppcopy(ppa[pg_idx], pp) == 0) { + /* + * Before ppcopy could handle UE or other faults, we + * would have panicked here, and still have no option + * but to do so now. + */ + panic("anon_map_privatepages, ppcopy failed"); + } hat_setrefmod(pp); /* mark as modified */
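Since ppcopy() can now report a copy fault instead of completing silently, every caller must choose between unwinding and panicking. The anon COW paths above have no way to back out once the new page is committed, hence the panics; page_relocate() in the vm_page.c hunk below shows the recoverable alternative. A sketch of the two idioms, using a hypothetical helper name (copy_for_cow() is illustrative only, not changeset code):

	static int
	copy_for_cow(page_t *opp, page_t *npp, int can_recover)
	{
		if (ppcopy(opp, npp) == 0) {
			if (can_recover)
				return (EIO);	/* caller unwinds and retries */
			panic("copy_for_cow: unrecoverable copy fault, opp = 0x%p",
			    (void *)opp);
		}
		hat_setrefmod(npp);	/* mark as modified */
		return (0);
	}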
--- a/usr/src/uts/common/vm/vm_page.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/common/vm/vm_page.c Thu Dec 14 17:27:13 2006 -0800 @@ -329,6 +329,8 @@ static void page_init_mem_config(void); static int page_do_hashin(page_t *, vnode_t *, u_offset_t); static void page_do_hashout(page_t *); +static void page_capture_init(); +int page_capture_take_action(page_t *, uint_t, void *); static void page_demote_vp_pages(page_t *); @@ -344,6 +346,7 @@ page_init_mem_config(); page_retire_init(); vm_usage_init(); + page_capture_init(); } /* @@ -4439,7 +4442,7 @@ top: /* - * Flush dirty pages and destory the clean ones. + * Flush dirty pages and destroy the clean ones. */ nbusypages = 0; @@ -4778,6 +4781,7 @@ * EBUSY : failure to get locks on the page/pages * ENOMEM : failure to obtain replacement pages * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel + * EIO : An error occurred while trying to copy the page data * * Return with all constituent members of target and replacement * SE_EXCL locked. It is the callers responsibility to drop the @@ -4791,9 +4795,7 @@ spgcnt_t *nrelocp, lgrp_t *lgrp) { -#ifdef DEBUG page_t *first_repl; -#endif /* DEBUG */ page_t *repl; page_t *targ; page_t *pl = NULL; @@ -4921,9 +4923,7 @@ #endif #endif -#ifdef DEBUG first_repl = repl; -#endif /* DEBUG */ for (i = 0; i < npgs; i++) { ASSERT(PAGE_EXCL(targ)); @@ -4942,7 +4942,33 @@ * Copy the page contents and attributes then * relocate the page in the page hash. */ - ppcopy(targ, repl); + if (ppcopy(targ, repl) == 0) { + targ = *target; + repl = first_repl; + VM_STAT_ADD(vmm_vmstats.ppr_copyfail); + if (grouplock != 0) { + group_page_unlock(targ); + } + if (dofree) { + *replacement = NULL; + page_free_replacement_page(repl); + page_create_putback(dofree); + } + return (EIO); + } + + targ++; + if (repl_contig != 0) { + repl++; + } else { + repl = repl->p_next; + } + } + + repl = first_repl; + targ = *target; + + for (i = 0; i < npgs; i++) { ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); page_clr_all_props(repl); page_set_props(repl, ppattr); @@ -6182,3 +6208,1277 @@ { return (hat_page_getattr(pp, P_MOD)); } + +/* + * Reclaim the given constituent page from the freelist, regardless of its + * size. The page will be demoted as required. + * Returns 1 on success or 0 on failure. + * + * The page is unlocked if it can't be reclaimed (when freemem == 0). + * If `lock' is non-null, it will be dropped and re-acquired if + * the routine must wait while freemem is 0. + */ +int +page_reclaim_page(page_t *pp, kmutex_t *lock) +{ + struct pcf *p; + uint_t pcf_index; + struct cpu *cpup; + uint_t i; + pgcnt_t collected = 0; + + ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); + ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); + + /* + * If `freemem' is 0, we cannot reclaim this page from the + * freelist, so release every lock we might hold: the page, + * and the `lock' before blocking. + * + * The only way `freemem' can become 0 while there are pages + * marked free (have their p->p_free bit set) is when the + * system is low on memory and doing a page_create(). In + * order to guarantee that once page_create() starts acquiring + * pages it will be able to get all that it needs since `freemem' + * was decreased by the requested amount. So, we need to release + * this page, and let page_create() have it. + * + * Since `freemem' being zero is not supposed to happen, just + * use the usual hash stuff as a starting point. 
If that bucket + is empty, then assume the worst, and start at the beginning + * of the pcf array. If we always start at the beginning + * when acquiring more than one pcf lock, there won't be any + * deadlock problems. + */ + + /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ + + if (freemem <= throttlefree && !page_create_throttle(1, 0)) { + pcf_acquire_all(); + goto page_reclaim_nomem; + } + + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + mutex_enter(&p->pcf_lock); + if (p->pcf_count > 0) { + collected = 1; + p->pcf_count -= 1; + } + mutex_exit(&p->pcf_lock); + + if (!collected) { + VM_STAT_ADD(page_reclaim_zero); + /* + * Check again. It's possible that some other thread + * could have been right behind us, and added one + * to a list somewhere. Acquire each of the pcf locks + * until we find a page. + */ + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_enter(&p->pcf_lock); + if (p->pcf_count) { + if (p->pcf_count > 0) { + p->pcf_count -= 1; + collected = 1; + break; + } + } + p++; + } + + if (!collected) { +page_reclaim_nomem: + /* + * We really can't have page `pp'. + * Time for the no-memory dance with + * page_free(). This is just like + * page_create_wait(). Plus the added + * attraction of releasing whatever mutex + * we held when we were called with in `lock'. + * Page_unlock() will wake up any thread + * waiting around for this page. + */ + if (lock) { + VM_STAT_ADD(page_reclaim_zero_locked); + mutex_exit(lock); + } + page_unlock(pp); + + /* + * get this before we drop all the pcf locks. + */ + mutex_enter(&new_freemem_lock); + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_wait++; + mutex_exit(&p->pcf_lock); + p++; + } + + freemem_wait++; + cv_wait(&freemem_cv, &new_freemem_lock); + freemem_wait--; + + mutex_exit(&new_freemem_lock); + + if (lock) { + mutex_enter(lock); + } + return (0); + } + + /* + * We beat the PCF bins over the head until + * we got the memory that we wanted. + * The pcf accounting has been done, + * though none of the pcf_wait flags have been set, + * drop the locks and continue on. + */ + ASSERT(collected == 1); + while (p >= pcf) { + mutex_exit(&p->pcf_lock); + p--; + } + } + + /* + * freemem is not protected by any lock. Thus, we cannot + * have any assertion containing freemem here. + */ + freemem -= 1; + + VM_STAT_ADD(pagecnt.pc_reclaim); + if (PP_ISAGED(pp)) { + page_list_sub(pp, PG_FREE_LIST); + TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, + "page_reclaim_page_free:pp %p", pp); + } else { + page_list_sub(pp, PG_CACHE_LIST); + TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, + "page_reclaim_page_cache:pp %p", pp); + } + + /* + * The page we took off the freelist must be szc 0 as + * we used page_list_sub which will demote the page if needed. + */ + ASSERT(pp->p_szc == 0); + + /* + * clear the p_free & p_age bits since this page is no longer + * on the free list. Notice that there was a brief time where + * a page is marked as free, but is not on the list. + * + * Set the reference bit to protect against immediate pageout. + */ + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_set_props(pp, P_REF); + + CPU_STATS_ENTER_K(); + cpup = CPU; /* get cpup now that CPU cannot change */ + CPU_STATS_ADDQ(cpup, vm, pgrec, 1); + CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); + CPU_STATS_EXIT_K(); + + return (1); +} + +/* + * The following code all currently relates to the page capture logic: + * + * This logic is used for cases where there is a desire to claim a certain + * physical page in the system for the caller. 
As it may not be possible + to capture the page immediately, the p_toxic bits are used in the page + * structure to indicate that someone wants to capture this page. When the + * page gets unlocked, the toxic flag will be noted and an attempt to capture + * the page will be made. If it is successful, the original caller's callback + * will be called with the page to do with it what they please. + * + * There is also an async thread which wakes up occasionally to attempt to + * capture pages which have the capture bit set. All of the pages which + * need to be captured asynchronously have been inserted into the + * page_capture_hash and thus this thread walks that hash list. Items in the + * hash have an expiration time so this thread handles that as well by removing + * the item from the hash if it has expired. + * + * Some important things to note are: + * - if the PR_CAPTURE bit is set on a page, then the page is in the + * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed + * to set and clear this bit, and must be held whenever an entry is + * added to or removed from the hash. + * - the PR_CAPTURE bit can only be set and cleared while holding the + * page_capture_hash_head.pchh_mutex + * - the t_flag field of the thread struct is used with the T_CAPTURING + * flag to prevent recursion while dealing with large pages. + * - pages which need to be retired never expire on the page_capture_hash. + */ + +static void page_capture_thread(void); +static kthread_t *pc_thread_id; +kcondvar_t pc_cv; +static kmutex_t pc_thread_mutex; +static clock_t pc_thread_shortwait; +static clock_t pc_thread_longwait; + +struct page_capture_callback pc_cb[PC_NUM_CALLBACKS]; + +/* Note that this is a circular linked list */ +typedef struct page_capture_hash_bucket { + page_t *pp; + uint_t szc; + uint_t flags; + clock_t expires; /* lbolt at which this request expires. */ + void *datap; /* Cached data passed in for callback */ + struct page_capture_hash_bucket *next; + struct page_capture_hash_bucket *prev; +} page_capture_hash_bucket_t; + +/* + * Each hash bucket will have its own mutex and two lists which are: + * active (0): represents requests which have not been processed by + * the page_capture async thread yet. + * walked (1): represents requests which have been processed by the + * page_capture async thread within its given walk of this bucket. + * + * These are all needed so that we can synchronize all async page_capture + * events. When the async thread moves to a new bucket, it will append the + * walked list to the active list and walk each item one at a time, moving it + * from the active list to the walked list. Thus if there is an async request + * outstanding for a given page, it will always be in one of the two lists. + * New requests will always be added to the active list. + * If we were not able to capture a page before the request expired, we'd free + * up the request structure which would indicate to page_capture that there is + * no longer a need for the given page, and clear the PR_CAPTURE flag if + * possible. 
+ */ +typedef struct page_capture_hash_head { + kmutex_t pchh_mutex; + uint_t num_pages; + page_capture_hash_bucket_t lists[2]; /* sentinel nodes */ +} page_capture_hash_head_t; + +#ifdef DEBUG +#define NUM_PAGE_CAPTURE_BUCKETS 4 +#else +#define NUM_PAGE_CAPTURE_BUCKETS 64 +#endif + +page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS]; + +/* for now use a very simple hash based upon the size of a page struct */ +#define PAGE_CAPTURE_HASH(pp) \ + ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1))) + +extern pgcnt_t swapfs_minfree; + +int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap); + +/* + * a callback function is required for page capture requests. + */ +void +page_capture_register_callback(uint_t index, clock_t duration, + int (*cb_func)(page_t *, void *, uint_t)) +{ + ASSERT(pc_cb[index].cb_active == 0); + ASSERT(cb_func != NULL); + rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER); + pc_cb[index].duration = duration; + pc_cb[index].cb_func = cb_func; + pc_cb[index].cb_active = 1; + rw_exit(&pc_cb[index].cb_rwlock); +} + +void +page_capture_unregister_callback(uint_t index) +{ + int i, j; + struct page_capture_hash_bucket *bp1; + struct page_capture_hash_bucket *bp2; + struct page_capture_hash_bucket *head = NULL; + uint_t flags = (1 << index); + + rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER); + ASSERT(pc_cb[index].cb_active == 1); + pc_cb[index].duration = 0; /* Paranoia */ + pc_cb[index].cb_func = NULL; /* Paranoia */ + pc_cb[index].cb_active = 0; + rw_exit(&pc_cb[index].cb_rwlock); + + /* + * Just move all the entries to a private list which we can walk + * through without the need to hold any locks. + * No more requests can get added to the hash lists for this consumer + * as the cb_active field for the callback has been cleared. + */ + for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { + mutex_enter(&page_capture_hash[i].pchh_mutex); + for (j = 0; j < 2; j++) { + bp1 = page_capture_hash[i].lists[j].next; + /* walk through all but first (sentinel) element */ + while (bp1 != &page_capture_hash[i].lists[j]) { + bp2 = bp1; + if (bp2->flags & flags) { + bp1 = bp2->next; + bp1->prev = bp2->prev; + bp2->prev->next = bp1; + bp2->next = head; + head = bp2; + /* + * Clear the PR_CAPTURE bit as we + * hold appropriate locks here. + */ + page_clrtoxic(head->pp, PR_CAPTURE); + page_capture_hash[i].num_pages--; + continue; + } + bp1 = bp1->next; + } + } + mutex_exit(&page_capture_hash[i].pchh_mutex); + } + + while (head != NULL) { + bp1 = head; + head = head->next; + kmem_free(bp1, sizeof (*bp1)); + } +} + + +/* + * Find pp in the active list and move it to the walked list if it + * exists. + * Note that most often pp should be at the front of the active list + * as it is currently used and thus there is no other sort of optimization + * being done here as this is a linked list data structure. + * Returns 1 on successful move or 0 if page could not be found. 
+ */ +static int +page_capture_move_to_walked(page_t *pp) +{ + page_capture_hash_bucket_t *bp; + int index; + + index = PAGE_CAPTURE_HASH(pp); + + mutex_enter(&page_capture_hash[index].pchh_mutex); + bp = page_capture_hash[index].lists[0].next; + while (bp != &page_capture_hash[index].lists[0]) { + if (bp->pp == pp) { + /* Remove from old list */ + bp->next->prev = bp->prev; + bp->prev->next = bp->next; + + /* Add to new list */ + bp->next = page_capture_hash[index].lists[1].next; + bp->prev = &page_capture_hash[index].lists[1]; + page_capture_hash[index].lists[1].next = bp; + bp->next->prev = bp; + mutex_exit(&page_capture_hash[index].pchh_mutex); + + return (1); + } + bp = bp->next; + } + mutex_exit(&page_capture_hash[index].pchh_mutex); + return (0); +} + +/* + * Add a new entry to the page capture hash. The only case where a new + * entry is not added is when the page capture consumer is no longer registered. + * In this case, we'll silently not add the page to the hash. We know that + * page retire will always be registered for the case where we are currently + * unretiring a page and thus there are no conflicts. + */ +static void +page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap) +{ + page_capture_hash_bucket_t *bp1; + page_capture_hash_bucket_t *bp2; + int index; + int cb_index; + int i; +#ifdef DEBUG + page_capture_hash_bucket_t *tp1; + int l; +#endif + + ASSERT(!(flags & CAPTURE_ASYNC)); + + bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP); + + bp1->pp = pp; + bp1->szc = szc; + bp1->flags = flags; + bp1->datap = datap; + + for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { + if ((flags >> cb_index) & 1) { + break; + } + } + + ASSERT(cb_index != PC_NUM_CALLBACKS); + + rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER); + if (pc_cb[cb_index].cb_active) { + if (pc_cb[cb_index].duration == -1) { + bp1->expires = (clock_t)-1; + } else { + bp1->expires = lbolt + pc_cb[cb_index].duration; + } + } else { + /* There's no callback registered so don't add to the hash */ + rw_exit(&pc_cb[cb_index].cb_rwlock); + kmem_free(bp1, sizeof (*bp1)); + return; + } + + index = PAGE_CAPTURE_HASH(pp); + + /* + * Only allow capture flag to be modified under this mutex. + * Prevents multiple entries for same page getting added. + */ + mutex_enter(&page_capture_hash[index].pchh_mutex); + + /* + * if not already on the hash, set capture bit and add to the hash + */ + if (!(pp->p_toxic & PR_CAPTURE)) { +#ifdef DEBUG + /* Check for duplicate entries */ + for (l = 0; l < 2; l++) { + tp1 = page_capture_hash[index].lists[l].next; + while (tp1 != &page_capture_hash[index].lists[l]) { + if (tp1->pp == pp) { + panic("page pp 0x%p already on hash " + "at 0x%p\n", pp, tp1); + } + tp1 = tp1->next; + } + } + +#endif + page_settoxic(pp, PR_CAPTURE); + bp1->next = page_capture_hash[index].lists[0].next; + bp1->prev = &page_capture_hash[index].lists[0]; + bp1->next->prev = bp1; + page_capture_hash[index].lists[0].next = bp1; + page_capture_hash[index].num_pages++; + mutex_exit(&page_capture_hash[index].pchh_mutex); + rw_exit(&pc_cb[cb_index].cb_rwlock); + cv_signal(&pc_cv); + return; + } + + /* + * A page retire request will replace any other request. + * A second physmem request which is for a different process than + * the currently registered one will be dropped as there is + * no way to hold the private data for both calls. 
+ * In the future, once there are more callers, this will have to + * be worked out better as there needs to be private storage for + * at least each type of caller (maybe have datap be an array of + * *void's so that we can index based upon the caller's index). + */ + + /* walk hash list to update expire time */ + for (i = 0; i < 2; i++) { + bp2 = page_capture_hash[index].lists[i].next; + while (bp2 != &page_capture_hash[index].lists[i]) { + if (bp2->pp == pp) { + if (flags & CAPTURE_RETIRE) { + if (!(bp2->flags & CAPTURE_RETIRE)) { + bp2->flags = flags; + bp2->expires = bp1->expires; + bp2->datap = datap; + } + } else { + ASSERT(flags & CAPTURE_PHYSMEM); + if (!(bp2->flags & CAPTURE_RETIRE) && + (datap == bp2->datap)) { + bp2->expires = bp1->expires; + } + } + mutex_exit(&page_capture_hash[index]. + pchh_mutex); + rw_exit(&pc_cb[cb_index].cb_rwlock); + kmem_free(bp1, sizeof (*bp1)); + return; + } + bp2 = bp2->next; + } + } + + /* + * the PR_CAPTURE flag is protected by the page_capture_hash mutexes + * and thus it either has to be set or not set and can't change + * while holding the mutex above. + */ + panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n", pp); +} + +/* + * We have a page in our hands, let's try and make it ours by turning + * it into a clean page like it had just come off the freelists. + * + * Returns 0 on success, with the page still EXCL locked. + * On failure, the page will be unlocked and EAGAIN returned. + */ +static int +page_capture_clean_page(page_t *pp) +{ + page_t *newpp; + int skip_unlock = 0; + spgcnt_t count; + page_t *tpp; + int ret = 0; + int extra; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_RETIRED(pp)); + ASSERT(curthread->t_flag & T_CAPTURING); + + if (PP_ISFREE(pp)) { + if (!page_reclaim_page(pp, NULL)) { + skip_unlock = 1; + ret = EAGAIN; + goto cleanup; + } + if (pp->p_vnode != NULL) { + /* + * Since this page came from the + * cachelist, we must destroy the + * old vnode association. + */ + page_hashout(pp, NULL); + } + goto cleanup; + } + + /* + * If we know page_relocate will fail, skip it. + * It could still fail due to a UE on another page but we + * can't do anything about that. + */ + if (pp->p_toxic & PR_UE) { + goto skip_relocate; + } + + /* + * It's possible that pages cannot have a vnode as fsflush comes + * through and cleans up these pages. It's ugly but that's how it is. + */ + if (pp->p_vnode == NULL) { + goto skip_relocate; + } + + /* + * Page was not free, so let's try to relocate it. + * page_relocate only works with root pages, so if this is not a root + * page, we need to demote it to try and relocate it. + * Unfortunately this is the best we can do right now. + */ + newpp = NULL; + if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) { + if (page_try_demote_pages(pp) == 0) { + ret = EAGAIN; + goto cleanup; + } + } + ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL); + if (ret == 0) { + page_t *npp; + /* unlock the new page(s) */ + while (count-- > 0) { + ASSERT(newpp != NULL); + npp = newpp; + page_sub(&newpp, npp); + page_unlock(npp); + } + ASSERT(newpp == NULL); + /* + * Check to see if the page we have is too large. + * If so, demote it freeing up the extra pages. 
+ */ + if (pp->p_szc > 0) { + /* For now demote extra pages to szc == 0 */ + extra = page_get_pagecnt(pp->p_szc) - 1; + while (extra > 0) { + tpp = pp->p_next; + page_sub(&pp, tpp); + tpp->p_szc = 0; + page_free(tpp, 1); + extra--; + } + /* Make sure to set our page to szc 0 as well */ + ASSERT(pp->p_next == pp && pp->p_prev == pp); + pp->p_szc = 0; + } + goto cleanup; + } else if (ret == EIO) { + ret = EAGAIN; + goto cleanup; + } else { + /* + * Need to reset return type as we failed to relocate the page + * but that does not mean that some of the next steps will not + * work. + */ + ret = 0; + } + +skip_relocate: + + if (pp->p_szc > 0) { + if (page_try_demote_pages(pp) == 0) { + ret = EAGAIN; + goto cleanup; + } + } + + ASSERT(pp->p_szc == 0); + + if (hat_ismod(pp)) { + ret = EAGAIN; + goto cleanup; + } + if (PP_ISKVP(pp)) { + ret = EAGAIN; + goto cleanup; + } + if (pp->p_lckcnt || pp->p_cowcnt) { + ret = EAGAIN; + goto cleanup; + } + + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + ASSERT(!hat_page_is_mapped(pp)); + + if (hat_ismod(pp)) { + /* + * This is a semi-odd case as the page is now modified but not + * mapped as we just unloaded the mappings above. + */ + ret = EAGAIN; + goto cleanup; + } + if (pp->p_vnode != NULL) { + page_hashout(pp, NULL); + } + + /* + * At this point, the page should be in a clean state and + * we can do whatever we want with it. + */ + +cleanup: + if (ret != 0) { + if (!skip_unlock) { + page_unlock(pp); + } + } else { + ASSERT(pp->p_szc == 0); + ASSERT(PAGE_EXCL(pp)); + + pp->p_next = pp; + pp->p_prev = pp; + } + return (ret); +} + +/* + * Various callers of page_trycapture() can have different restrictions upon + * what memory they have access to. + * Returns 0 on success, with the following error codes on failure: + * EPERM - The requested page is long-term locked, and thus repeated + * requests to capture this page will likely fail. + * ENOMEM - There was not enough free memory in the system to safely + * map the requested page. + * ENOENT - The requested page was inside the kernel cage, and the + * CAPTURE_GET_CAGE flag was not set. + */ +int +page_capture_pre_checks(page_t *pp, uint_t flags) +{ +#if defined(__sparc) + extern struct vnode prom_ppages; +#endif /* __sparc */ + + ASSERT(pp != NULL); + + /* only physmem currently has restrictions */ + if (!(flags & CAPTURE_PHYSMEM)) { + return (0); + } + +#if defined(__sparc) + if (pp->p_vnode == &prom_ppages) { + return (EPERM); + } + + if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE)) { + return (ENOENT); + } + + if (PP_ISNORELOCKERNEL(pp)) { + return (EPERM); + } +#else + if (PP_ISKVP(pp)) { + return (EPERM); + } +#endif /* __sparc */ + + if (availrmem < swapfs_minfree) { + /* + * We won't try to capture this page as we are + * running low on memory. + */ + return (ENOMEM); + } + return (0); +} + +/* + * Once we have a page in our mitts, go ahead and complete the capture + * operation. + * Returns 1 on failure where the page is no longer needed. + * Returns 0 on success. + * Returns -1 if there was a transient failure. + * Failure cases must release the SE_EXCL lock on pp (usually via page_free). 
+ */ +int +page_capture_take_action(page_t *pp, uint_t flags, void *datap) +{ + int cb_index; + int ret = 0; + page_capture_hash_bucket_t *bp1; + page_capture_hash_bucket_t *bp2; + int index; + int found = 0; + int i; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(curthread->t_flag & T_CAPTURING); + + for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { + if ((flags >> cb_index) & 1) { + break; + } + } + ASSERT(cb_index < PC_NUM_CALLBACKS); + + /* + * Remove the entry from the page_capture hash, but don't free it yet + * as we may need to put it back. + * Since we own the page at this point in time, we should find it + * in the hash if this is an ASYNC call. If we don't it's likely + * that the page_capture_async() thread decided that this request + * had expired, in which case we just continue on. + */ + if (flags & CAPTURE_ASYNC) { + + index = PAGE_CAPTURE_HASH(pp); + + mutex_enter(&page_capture_hash[index].pchh_mutex); + for (i = 0; i < 2 && !found; i++) { + bp1 = page_capture_hash[index].lists[i].next; + while (bp1 != &page_capture_hash[index].lists[i]) { + if (bp1->pp == pp) { + bp1->next->prev = bp1->prev; + bp1->prev->next = bp1->next; + page_capture_hash[index].num_pages--; + page_clrtoxic(pp, PR_CAPTURE); + found = 1; + break; + } + bp1 = bp1->next; + } + } + mutex_exit(&page_capture_hash[index].pchh_mutex); + } + + /* Synchronize with the unregister func. */ + rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER); + if (!pc_cb[cb_index].cb_active) { + page_free(pp, 1); + rw_exit(&pc_cb[cb_index].cb_rwlock); + if (found) { + kmem_free(bp1, sizeof (*bp1)); + } + return (1); + } + + /* + * We need to remove the entry from the page capture hash and turn off + * the PR_CAPTURE bit before calling the callback. We'll need to cache + * the entry here, and then based upon the return value, cleanup + * appropriately or re-add it to the hash, making sure that someone else + * hasn't already done so. + * It should be rare for the callback to fail and thus it's ok for + * the failure path to be a bit complicated as the success path is + * cleaner and the locking rules are easier to follow. + */ + + ret = pc_cb[cb_index].cb_func(pp, datap, flags); + + rw_exit(&pc_cb[cb_index].cb_rwlock); + + /* + * If this was an ASYNC request, we need to cleanup the hash if the + * callback was successful or if the request was no longer valid. + * For non-ASYNC requests, we return failure to map and the caller + * will take care of adding the request to the hash. + * Note also that the callback itself is responsible for the page + * at this point in time in terms of locking ... The most common + * case for the failure path should just be a page_free. + */ + if (ret >= 0) { + if (found) { + kmem_free(bp1, sizeof (*bp1)); + } + return (ret); + } + if (!found) { + return (ret); + } + + ASSERT(flags & CAPTURE_ASYNC); + + /* + * Check for expiration time first as we can just free it up if it's + * expired. + */ + if (lbolt > bp1->expires && bp1->expires != -1) { + kmem_free(bp1, sizeof (*bp1)); + return (ret); + } + + /* + * The callback failed and there used to be an entry in the hash for + * this page, so we need to add it back to the hash. 
+ */ + mutex_enter(&page_capture_hash[index].pchh_mutex); + if (!(pp->p_toxic & PR_CAPTURE)) { + /* just add bp1 back to head of walked list */ + page_settoxic(pp, PR_CAPTURE); + bp1->next = page_capture_hash[index].lists[1].next; + bp1->prev = &page_capture_hash[index].lists[1]; + bp1->next->prev = bp1; + page_capture_hash[index].lists[1].next = bp1; + page_capture_hash[index].num_pages++; + mutex_exit(&page_capture_hash[index].pchh_mutex); + return (ret); + } + + /* + * Otherwise there was a new capture request added to the list. + * Need to make sure that our original data is represented if + * appropriate. + */ + for (i = 0; i < 2; i++) { + bp2 = page_capture_hash[index].lists[i].next; + while (bp2 != &page_capture_hash[index].lists[i]) { + if (bp2->pp == pp) { + if (bp1->flags & CAPTURE_RETIRE) { + if (!(bp2->flags & CAPTURE_RETIRE)) { + bp2->szc = bp1->szc; + bp2->flags = bp1->flags; + bp2->expires = bp1->expires; + bp2->datap = bp1->datap; + } + } else { + ASSERT(bp1->flags & CAPTURE_PHYSMEM); + if (!(bp2->flags & CAPTURE_RETIRE)) { + bp2->szc = bp1->szc; + bp2->flags = bp1->flags; + bp2->expires = bp1->expires; + bp2->datap = bp1->datap; + } + } + mutex_exit(&page_capture_hash[index]. + pchh_mutex); + kmem_free(bp1, sizeof (*bp1)); + return (ret); + } + bp2 = bp2->next; + } + } + panic("PR_CAPTURE set but not on hash for pp 0x%p\n", pp); + /*NOTREACHED*/ +} + +/* + * Try to capture the given page for the caller specified in the flags + * parameter. The page will either be captured and handed over to the + * appropriate callback, or will be queued up in the page capture hash + * to be captured asynchronously. + * If the current request is due to an async capture, the page must be + * exclusively locked before calling this function. + * Currently szc must be 0 but in the future this should be expandable to + * other page sizes. + * Returns 0 on success, with the following error codes on failure: + * EPERM - The requested page is long-term locked, and thus repeated + * requests to capture this page will likely fail. + * ENOMEM - There was not enough free memory in the system to safely + * map the requested page. + * ENOENT - The requested page was inside the kernel cage, and the + * CAPTURE_GET_CAGE flag was not set. + * EAGAIN - The requested page could not be captured at this point in + * time but future requests will likely work. + * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag + * was not set. + */ +int +page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap) +{ + int ret; + int cb_index; + + if (flags & CAPTURE_ASYNC) { + ASSERT(PAGE_EXCL(pp)); + goto async; + } + + /* Make sure there's enough availrmem ... */ + ret = page_capture_pre_checks(pp, flags); + if (ret != 0) { + return (ret); + } + + if (!page_trylock(pp, SE_EXCL)) { + for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { + if ((flags >> cb_index) & 1) { + break; + } + } + ASSERT(cb_index < PC_NUM_CALLBACKS); + ret = EAGAIN; + /* Special case for retired pages */ + if (PP_RETIRED(pp)) { + if (flags & CAPTURE_GET_RETIRED) { + if (!page_unretire_pp(pp, PR_UNR_TEMP)) { + /* + * Need to set capture bit and add to + * hash so that the page will be + * retired when freed. 
+ */ + page_capture_add_hash(pp, szc, + CAPTURE_RETIRE, NULL); + ret = 0; + goto own_page; + } + } else { + return (EBUSY); + } + } + page_capture_add_hash(pp, szc, flags, datap); + return (ret); + } + +async: + ASSERT(PAGE_EXCL(pp)); + + /* For physmem async requests, check that availrmem is still sane */ + if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) == + (CAPTURE_ASYNC | CAPTURE_PHYSMEM) && + (availrmem < swapfs_minfree)) { + page_unlock(pp); + return (ENOMEM); + } + + ret = page_capture_clean_page(pp); + + if (ret != 0) { + /* We failed to get the page, so let's add it to the hash */ + if (!(flags & CAPTURE_ASYNC)) { + page_capture_add_hash(pp, szc, flags, datap); + } + return (ret); + } + +own_page: + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == 0); + + /* Call the callback */ + ret = page_capture_take_action(pp, flags, datap); + + if (ret == 0) { + return (0); + } + + /* + * Note that in the failure cases from page_capture_take_action, the + * EXCL lock will have already been dropped. + */ + if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) { + page_capture_add_hash(pp, szc, flags, datap); + } + return (EAGAIN); +} + +int +page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap) +{ + int ret; + + curthread->t_flag |= T_CAPTURING; + ret = page_itrycapture(pp, szc, flags, datap); + curthread->t_flag &= ~T_CAPTURING; /* clearing is safe as we know it's set */ + return (ret); +} + +/* + * When unlocking a page which has the PR_CAPTURE bit set, this routine + * gets called to try and capture the page. + */ +void +page_unlock_capture(page_t *pp) +{ + page_capture_hash_bucket_t *bp; + int index; + int i; + uint_t szc; + uint_t flags = 0; + void *datap; + kmutex_t *mp; + extern vnode_t retired_pages; + + /* + * We need to protect against a possible deadlock here where we own + * the vnode page hash mutex and want to acquire it again as there + * are locations in the code where we unlock a page while holding + * the mutex which can lead to the page being captured and eventually + * end up here. As we may be hashing out the old page and hashing into + * the retire vnode, we need to make sure we don't own them. + * Other callbacks that do hash operations also need to make sure that + * before they hashin to a vnode that they do not currently own the + * vphm mutex otherwise there will be a panic. + */ + if (mutex_owned(page_vnode_mutex(&retired_pages))) { + page_unlock(pp); + return; + } + if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) { + page_unlock(pp); + return; + } + + index = PAGE_CAPTURE_HASH(pp); + + mp = &page_capture_hash[index].pchh_mutex; + mutex_enter(mp); + for (i = 0; i < 2; i++) { + bp = page_capture_hash[index].lists[i].next; + while (bp != &page_capture_hash[index].lists[i]) { + if (bp->pp == pp) { + szc = bp->szc; + flags = bp->flags | CAPTURE_ASYNC; + datap = bp->datap; + mutex_exit(mp); + (void) page_trycapture(pp, szc, flags, datap); + return; + } + bp = bp->next; + } + } + + /* Failed to find page in hash so clear flags and unlock it. 
*/ + page_clrtoxic(pp, PR_CAPTURE); + page_unlock(pp); + + mutex_exit(mp); +} + +void +page_capture_init() +{ + int i; + for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { + page_capture_hash[i].lists[0].next = + &page_capture_hash[i].lists[0]; + page_capture_hash[i].lists[0].prev = + &page_capture_hash[i].lists[0]; + page_capture_hash[i].lists[1].next = + &page_capture_hash[i].lists[1]; + page_capture_hash[i].lists[1].prev = + &page_capture_hash[i].lists[1]; + } + + pc_thread_shortwait = 23 * hz; + pc_thread_longwait = 1201 * hz; + mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pc_cv, NULL, CV_DEFAULT, NULL); + pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +/* + * It is necessary to scrub any failing pages prior to reboot in order to + * prevent a latent error trap from occurring on the next boot. + */ +void +page_retire_mdboot() +{ + page_t *pp; + int i, j; + page_capture_hash_bucket_t *bp; + + /* walk lists looking for pages to scrub */ + for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { + if (page_capture_hash[i].num_pages == 0) + continue; + + mutex_enter(&page_capture_hash[i].pchh_mutex); + + for (j = 0; j < 2; j++) { + bp = page_capture_hash[i].lists[j].next; + while (bp != &page_capture_hash[i].lists[j]) { + pp = bp->pp; + if (!PP_ISKVP(pp) && PP_TOXIC(pp)) { + pp->p_selock = -1; /* pacify ASSERTs */ + PP_CLRFREE(pp); + pagescrub(pp, 0, PAGESIZE); + pp->p_selock = 0; + } + bp = bp->next; + } + } + mutex_exit(&page_capture_hash[i].pchh_mutex); + } +} + +/* + * Walk the page_capture_hash trying to capture pages and also clean up old + * entries which have expired. + */ +void +page_capture_async() +{ + page_t *pp; + int i; + int ret; + page_capture_hash_bucket_t *bp1, *bp2; + uint_t szc; + uint_t flags; + void *datap; + + /* If there are outstanding pages to be captured, get to work */ + for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { + if (page_capture_hash[i].num_pages == 0) + continue; + /* Append list 1 to list 0 and then walk through list 0 */ + mutex_enter(&page_capture_hash[i].pchh_mutex); + bp1 = &page_capture_hash[i].lists[1]; + bp2 = bp1->next; + if (bp1 != bp2) { + bp1->prev->next = page_capture_hash[i].lists[0].next; + bp2->prev = &page_capture_hash[i].lists[0]; + page_capture_hash[i].lists[0].next->prev = bp1->prev; + page_capture_hash[i].lists[0].next = bp2; + bp1->next = bp1; + bp1->prev = bp1; + } + + /* list[1] will be empty now */ + + bp1 = page_capture_hash[i].lists[0].next; + while (bp1 != &page_capture_hash[i].lists[0]) { + /* Check expiration time */ + if ((lbolt > bp1->expires && bp1->expires != -1) || + page_deleted(bp1->pp)) { + page_capture_hash[i].lists[0].next = bp1->next; + bp1->next->prev = + &page_capture_hash[i].lists[0]; + page_capture_hash[i].num_pages--; + + /* + * We can safely remove the PR_CAPTURE bit + * without holding the EXCL lock on the page + * as the PR_CAPTURE bit requires that the + * page_capture_hash[].pchh_mutex be held + * to modify it. 
+ */ + page_clrtoxic(bp1->pp, PR_CAPTURE); + mutex_exit(&page_capture_hash[i].pchh_mutex); + kmem_free(bp1, sizeof (*bp1)); + mutex_enter(&page_capture_hash[i].pchh_mutex); + bp1 = page_capture_hash[i].lists[0].next; + continue; + } + pp = bp1->pp; + szc = bp1->szc; + flags = bp1->flags; + datap = bp1->datap; + mutex_exit(&page_capture_hash[i].pchh_mutex); + if (page_trylock(pp, SE_EXCL)) { + ret = page_trycapture(pp, szc, + flags | CAPTURE_ASYNC, datap); + } else { + ret = 1; /* move to walked hash */ + } + + if (ret != 0) { + /* Move to walked hash */ + (void) page_capture_move_to_walked(pp); + } + mutex_enter(&page_capture_hash[i].pchh_mutex); + bp1 = page_capture_hash[i].lists[0].next; + } + + mutex_exit(&page_capture_hash[i].pchh_mutex); + } +} + +/* + * The page_capture_thread loops forever, looking to see if there are + * pages still waiting to be captured. + */ +static void +page_capture_thread(void) +{ + callb_cpr_t c; + int outstanding; + int i; + + CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture"); + + mutex_enter(&pc_thread_mutex); + for (;;) { + outstanding = 0; + for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) + outstanding += page_capture_hash[i].num_pages; + if (outstanding) { + /* + * Do we really want to be this aggressive for things + * other than page_retire? + * Maybe have a counter for each callback type to + * guide how aggressive we should be here. + * Thus if there's at least one page for page_retire + * we go ahead and reap like this. + */ + kmem_reap(); + seg_preap(); + page_capture_async(); + CALLB_CPR_SAFE_BEGIN(&c); + (void) cv_timedwait(&pc_cv, &pc_thread_mutex, + lbolt + pc_thread_shortwait); + CALLB_CPR_SAFE_END(&c, &pc_thread_mutex); + } else { + CALLB_CPR_SAFE_BEGIN(&c); + (void) cv_timedwait(&pc_cv, &pc_thread_mutex, + lbolt + pc_thread_longwait); + CALLB_CPR_SAFE_END(&c, &pc_thread_mutex); + } + } + /*NOTREACHED*/ +}
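Taken together, the new vm_page.c code gives consumers a two-step interface: register a callback once, then call page_trycapture() per page and let the async thread retry transient failures. A hedged consumer sketch in the style of the physmem driver follows; the PC_PHYSMEM index, my_physmem_cb() and my_data are illustrative assumptions, not changeset code:

	/* Callback: 0 = success, 1 = page no longer needed, -1 = transient. */
	static int
	my_physmem_cb(page_t *pp, void *datap, uint_t flags)
	{
		/* take ownership of pp here; on failure, page_free() it */
		return (0);
	}

	static void
	my_consumer_init(void)
	{
		/* one-time setup; requests expire after roughly 10 seconds */
		page_capture_register_callback(PC_PHYSMEM, 10 * hz,
		    my_physmem_cb);
	}

	static int
	capture_one_page(uint64_t pa, void *my_data)
	{
		page_t *pp = page_numtopp_nolock(btop(pa));

		if (pp == NULL)
			return (EINVAL);
		/*
		 * EAGAIN means the request was queued on the capture hash;
		 * the async thread and page_unlock_capture() keep retrying
		 * it until the request expires.
		 */
		return (page_trycapture(pp, 0, CAPTURE_PHYSMEM, my_data));
	}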
--- a/usr/src/uts/common/vm/vm_pagelist.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/common/vm/vm_pagelist.c Thu Dec 14 17:27:13 2006 -0800 @@ -1487,7 +1487,7 @@ kcage_freemem_add(pgcnt); #endif for (i = 0; i < pgcnt; i++, pp++) - page_unlock_noretire(pp); + page_unlock_nocapture(pp); } } @@ -1935,7 +1935,7 @@ index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); phm = PAGE_HASH_MUTEX(index); if (!mutex_tryenter(phm)) { - page_unlock_noretire(pp); + page_unlock_nocapture(pp); goto fail_promote; } @@ -1943,7 +1943,7 @@ page_hashout(pp, phm); mutex_exit(phm); PP_SETAGED(pp); - page_unlock_noretire(pp); + page_unlock_nocapture(pp); which_list = PG_CACHE_LIST; } page_ctr_sub(mnode, mtype, pp, which_list); @@ -2496,7 +2496,6 @@ return (ret_pp); } - /* * Helper routine used only by the freelist code to lock * a page. If the page is a large page then it succeeds in @@ -2529,11 +2528,13 @@ while (tpp != pp) { if (!page_trylock(tpp, se)) { /* - * On failure unlock what we - * have locked so far. + * On failure unlock what we have locked so far. + * We want to avoid attempting to capture these + * pages as the pcm mutex may be held which could + * lead to a recursive mutex panic. */ while (first_pp != tpp) { - page_unlock_noretire(first_pp); + page_unlock_nocapture(first_pp); first_pp = first_pp->p_next; } return (0); @@ -2976,7 +2977,7 @@ while (--i != (pgcnt_t)-1) { pp = &spp[i]; ASSERT(PAGE_EXCL(pp)); - page_unlock_noretire(pp); + page_unlock_nocapture(pp); } return (0); } @@ -2985,7 +2986,7 @@ !PP_ISFREE(pp)) { VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); ASSERT(i == 0); - page_unlock_noretire(pp); + page_unlock_nocapture(pp); return (0); } if (PP_ISNORELOC(pp)) { @@ -2993,7 +2994,7 @@ while (i != (pgcnt_t)-1) { pp = &spp[i]; ASSERT(PAGE_EXCL(pp)); - page_unlock_noretire(pp); + page_unlock_nocapture(pp); i--; } return (0); @@ -3088,7 +3089,7 @@ */ while (pgcnt--) { ASSERT(PAGE_EXCL(pp)); - page_unlock_noretire(pp); + page_unlock_nocapture(pp); pp++; } /* @@ -3103,7 +3104,7 @@ ASSERT(PP_ISAGED(pp)); pp->p_szc = 0; page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); - page_unlock_noretire(pp); + page_unlock_nocapture(pp); } if (replpp != NULL) @@ -3135,7 +3136,7 @@ page_sub(&replpp, rpp); ASSERT(PAGE_EXCL(rpp)); ASSERT(!PP_ISFREE(rpp)); - page_unlock_noretire(rpp); + page_unlock_nocapture(rpp); } ASSERT(targpp == hpp); ASSERT(replpp == NULL); @@ -3149,7 +3150,6 @@ * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code * of 0 means nothing left after trim. */ - int trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) {
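The rename from page_unlock_noretire() to page_unlock_nocapture() matters here because the freelist code runs with page-counter (pcm) mutexes held; an ordinary page_unlock() on a page with PR_CAPTURE set would re-enter the capture path, which can call back into the freelist code and try to take the same mutex again. An illustrative fragment of the recursion being avoided (assumed call chain, not code from the changeset):

	mutex_enter(pcm);
	/*
	 * page_unlock(pp) here could recurse:
	 * page_unlock() -> page_unlock_capture() -> page_trycapture()
	 *   -> page_capture_clean_page() -> page_reclaim_page()
	 *   -> page_list_sub() -> mutex_enter(pcm)  [recursive mutex panic]
	 */
	page_unlock_nocapture(pp);	/* skips the capture hook */
	mutex_exit(pcm);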
--- a/usr/src/uts/i86pc/os/machdep.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/i86pc/os/machdep.c Thu Dec 14 17:27:13 2006 -0800 @@ -200,7 +200,10 @@ if (invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); - page_retire_hunt(page_retire_mdboot_cb); + /* + * Clear any unresolved UEs from memory. + */ + page_retire_mdboot(); /* * stop other cpus and raise our priority. since there is only
--- a/usr/src/uts/i86pc/vm/vm_dep.h Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/i86pc/vm/vm_dep.h Thu Dec 14 17:27:13 2006 -0800 @@ -686,6 +686,7 @@ ulong_t ppr_relocnolock[MMU_PAGE_SIZES]; ulong_t ppr_relocnomem[MMU_PAGE_SIZES]; ulong_t ppr_relocok[MMU_PAGE_SIZES]; + ulong_t ppr_copyfail; /* page coalesce counter */ ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; /* candidates useful */
--- a/usr/src/uts/i86pc/vm/vm_machdep.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/i86pc/vm/vm_machdep.c Thu Dec 14 17:27:13 2006 -0800 @@ -1992,7 +1992,7 @@ * Note that the ref/mod bits in the page_t's are not affected by * this operation, hence it is up to the caller to update them appropriately. */ -void +int ppcopy(page_t *frompp, page_t *topp) { caddr_t pp_addr1; @@ -2000,6 +2000,8 @@ void *pte1; void *pte2; kmutex_t *ppaddr_mutex; + label_t ljb; + int ret = 1; ASSERT_STACK_ALIGNED(); ASSERT(PAGE_LOCKED(frompp)); @@ -2030,14 +2032,21 @@ HAT_LOAD_NOCONSIST); } + if (on_fault(&ljb)) { + ret = 0; + goto faulted; + } if (use_sse_pagecopy) hwblkpagecopy(pp_addr1, pp_addr2); else bcopy(pp_addr1, pp_addr2, PAGESIZE); + no_fault(); +faulted: if (!kpm_enable) mutex_exit(ppaddr_mutex); kpreempt_enable(); + return (ret); } /*
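The heart of this change is the on_fault()/no_fault() pair: on_fault() registers a setjmp-style recovery label on the thread, so a trap taken during the copy (for example a UE on the source page) resumes at the label with a nonzero return instead of panicking, and the trap path disarms the handler itself, which is why the fault branch skips no_fault(). Reduced to a standalone sketch (guarded_copy() is a hypothetical name, not changeset code):

	static int
	guarded_copy(caddr_t src, caddr_t dst)
	{
		label_t ljb;
		int ret = 1;

		if (on_fault(&ljb)) {
			/* we arrive here via the trap path if the copy faults */
			ret = 0;
			goto done;
		}
		bcopy(src, dst, PAGESIZE);
		no_fault();	/* disarm the handler on the success path */
	done:
		return (ret);
	}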
--- a/usr/src/uts/intel/Makefile.intel.shared Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/intel/Makefile.intel.shared Thu Dec 14 17:27:13 2006 -0800 @@ -258,6 +258,7 @@ DRV_KMODS += mouse8042 DRV_KMODS += nca DRV_KMODS += openeepr +DRV_KMODS += physmem DRV_KMODS += pm DRV_KMODS += poll DRV_KMODS += pool
--- a/usr/src/uts/intel/os/minor_perm Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/intel/os/minor_perm Thu Dec 14 17:27:13 2006 -0800 @@ -139,3 +139,4 @@ pcn:* 0666 root sys rtls:* 0666 root sys ath:* 0666 root sys +physmem:* 0600 root sys
--- a/usr/src/uts/intel/os/name_to_major Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/intel/os/name_to_major Thu Dec 14 17:27:13 2006 -0800 @@ -123,3 +123,4 @@ lx_ptm 240 lx_systrace 241 lx_audio 242 +physmem 243
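With the driver bound to a major number and its minor nodes restricted to 0600 root sys, the new device is root-only. A hedged userland sketch; the /dev/physmem path is an assumption here, and the driver's ioctl interface (declared in physmem.h) is not part of these hunks, so only the open is shown:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/dev/physmem", O_RDWR);

		if (fd < 0) {
			perror("open /dev/physmem");	/* EACCES unless root */
			return (1);
		}
		/* ... driver-specific ioctls would go here ... */
		(void) close(fd);
		return (0);
	}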
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/intel/physmem/Makefile Thu Dec 14 17:27:13 2006 -0800 @@ -0,0 +1,84 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/intel/physmem/Makefile +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the physmem driver +# +# intel implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = physmem +OBJECTS = $(PHYSMEM_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(PHYSMEM_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ
--- a/usr/src/uts/sparc/Makefile.sparc.shared Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sparc/Makefile.sparc.shared Thu Dec 14 17:27:13 2006 -0800 @@ -227,7 +227,7 @@ DRV_KMODS += fssnap icmp icmp6 ip ip6 ipsecah DRV_KMODS += ipsecesp iwscn keysock kmdb kstat ksyms llc1 DRV_KMODS += lofi -DRV_KMODS += log logindmux kssl mm nca pm poll pool +DRV_KMODS += log logindmux kssl mm nca physmem pm poll pool DRV_KMODS += pseudo ptc ptm pts ptsl ramdisk random rsm rts sad DRV_KMODS += sppp sppptun sy sysevent sysmsg DRV_KMODS += spdsock
--- a/usr/src/uts/sparc/os/minor_perm Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sparc/os/minor_perm Thu Dec 14 17:27:13 2006 -0800 @@ -166,3 +166,4 @@ chxge:* 0666 root sys vsw:* 0666 root sys vnet:* 0666 root sys +physmem:* 0600 root sys
--- a/usr/src/uts/sparc/os/name_to_major Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sparc/os/name_to_major Thu Dec 14 17:27:13 2006 -0800 @@ -215,3 +215,4 @@ pxb_bcm 267 pxb_plx 268 n2rng 269 +physmem 270
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/sparc/physmem/Makefile Thu Dec 14 17:27:13 2006 -0800 @@ -0,0 +1,88 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/sparc/physmem/Makefile +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the physmem driver +# +# sparc implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = physmem +OBJECTS = $(PHYSMEM_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(PHYSMEM_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ
--- a/usr/src/uts/sun4/vm/vm_dep.h Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sun4/vm/vm_dep.h Thu Dec 14 17:27:13 2006 -0800 @@ -630,6 +630,7 @@ ulong_t ppr_relocnolock[MMU_PAGE_SIZES]; ulong_t ppr_relocnomem[MMU_PAGE_SIZES]; ulong_t ppr_krelocfail[MMU_PAGE_SIZES]; + ulong_t ppr_copyfail; /* page coalesce counter */ ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; /* candidates useful */
--- a/usr/src/uts/sun4u/os/mach_cpu_states.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sun4u/os/mach_cpu_states.c Thu Dec 14 17:27:13 2006 -0800 @@ -104,8 +104,7 @@ /* * Clear any unresolved UEs from memory. */ - if (memsegs != NULL) - page_retire_hunt(page_retire_mdboot_cb); + page_retire_mdboot(); /* * stop other cpus which also raise our priority. since there is only
--- a/usr/src/uts/sun4u/os/ppage.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sun4u/os/ppage.c Thu Dec 14 17:27:13 2006 -0800 @@ -366,6 +366,8 @@ caddr_t fm_va, to_va; caddr_t *fm_slot, *to_slot; processorid_t cpu; + label_t ljb; + int ret = 1; ASSERT(PAGE_LOCKED(fm_pp)); ASSERT(PAGE_LOCKED(to_pp)); @@ -391,12 +393,18 @@ kpreempt_enable(); return (0); } + if (on_fault(&ljb)) { + ret = 0; + goto faulted; + } hwblkpagecopy(fm_va, to_va); + no_fault(); +faulted: ASSERT(CPU->cpu_id == cpu); pp_unload_tlb(fm_slot, fm_va); pp_unload_tlb(to_slot, to_va); kpreempt_enable(); - return (1); + return (ret); } /* @@ -425,22 +433,33 @@ * * Try to use per cpu mapping first, if that fails then call pp_mapin * to load it. + * + * Returns one on success or zero on some sort of fault while doing the copy. */ -void +int ppcopy(page_t *fm_pp, page_t *to_pp) { caddr_t fm_va, to_va; + label_t ljb; + int ret = 1; /* Try the fast path first */ if (ppcopy_common(fm_pp, to_pp)) - return; + return (1); /* Fast path failed, so we need to do the slow path. */ fm_va = ppmapin(fm_pp, PROT_READ, (caddr_t)-1); to_va = ppmapin(to_pp, PROT_READ | PROT_WRITE, fm_va); + if (on_fault(&ljb)) { + ret = 0; + goto faulted; + } bcopy(fm_va, to_va, PAGESIZE); + no_fault(); +faulted: ppmapout(fm_va); ppmapout(to_va); + return (ret); } /*
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sun4v/os/mach_cpu_states.c Thu Dec 14 17:27:13 2006 -0800 @@ -134,8 +134,7 @@ /* * Clear any unresolved UEs from memory. */ - if (memsegs != NULL) - page_retire_hunt(page_retire_mdboot_cb); + page_retire_mdboot(); /* * stop other cpus which also raise our priority. since there is only
--- a/usr/src/uts/sun4v/os/ppage.c Thu Dec 14 16:42:14 2006 -0800 +++ b/usr/src/uts/sun4v/os/ppage.c Thu Dec 14 17:27:13 2006 -0800 @@ -253,13 +253,16 @@ * * Try to use per cpu mapping first, if that fails then call pp_mapin * to load it. + * Returns one on success or zero on some sort of fault while doing the copy. */ -void +int ppcopy(page_t *fm_pp, page_t *to_pp) { caddr_t fm_va; caddr_t to_va; boolean_t fast; + label_t ljb; + int ret = 1; ASSERT(PAGE_LOCKED(fm_pp)); ASSERT(PAGE_LOCKED(to_pp)); @@ -278,7 +281,13 @@ } else fast = B_TRUE; + if (on_fault(&ljb)) { + ret = 0; + goto faulted; + } bcopy(fm_va, to_va, PAGESIZE); + no_fault(); +faulted: /* Unmap */ if (fast) { @@ -288,6 +297,7 @@ ppmapout(fm_va); ppmapout(to_va); } + return (ret); } /*