Mercurial > illumos > illumos-gate
changeset 6695:12d7dd4459fd
6423097 segvn_pagelock() may perform very poorly
6526804 DR delete_memory_thread, AIO, and segvn deadlock
6557794 segspt_dismpagelock() and segspt_shmadvise(MADV_FREE) may deadlock
6557813 seg_ppurge_seg() shouldn't flush all unrelated ISM/DISM segments
6557891 softlocks/pagelocks of anon pages should not decrement availrmem for memory swapped pages
6559612 multiple softlocks on a DISM segment should decrement availrmem just once
6562291 page_mem_avail() is stuck due to availrmem overaccounting and lack of seg_preap() calls
6596555 locked anonymous pages should not have assigned disk swap slots
6639424 hat_sfmmu.c:hat_pagesync() doesn't handle well HAT_SYNC_STOPON_REF and HAT_SYNC_STOPON_MOD flags
6639425 optimize checkpage() optimizations
6662927 page_llock contention during I/O
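Most of the fixes above concern how softlocked and pagelocked anonymous pages are accounted. The changeset stops charging availrmem under freemem_lock for softlocks on swap-backed anon pages, drops the global segvn_pages_locked counter, and instead has segvn and seg_spt track outstanding locks with a per-segment counter updated atomically; the drop to zero is where unmap waiters get woken. The sketch below illustrates only that counting pattern. It is user-land C, and the names seg_counts, seg_softlock and seg_softunlock are illustrative, not taken from the changeset.

/*
 * Sketch of the accounting change: instead of updating a global counter
 * under a global lock on every softlock, each segment keeps its own
 * atomic count of outstanding softlocks, and the transition to zero is
 * the point where waiters may be woken.
 */
#include <stdatomic.h>
#include <stdio.h>

struct seg_counts {
        atomic_long softlockcnt;        /* outstanding softlocks on this segment */
};

static void
seg_softlock(struct seg_counts *s, long pages)
{
        /* one atomic add; no global lock, no availrmem adjustment */
        atomic_fetch_add(&s->softlockcnt, pages);
}

static void
seg_softunlock(struct seg_counts *s, long pages)
{
        if (atomic_fetch_sub(&s->softlockcnt, pages) - pages == 0) {
                /* last outstanding lock is gone; wake any unmap waiters */
                printf("softlockcnt dropped to 0\n");
        }
}

int
main(void)
{
        struct seg_counts s = { 0 };

        seg_softlock(&s, 8);
        seg_softunlock(&s, 8);
        return (0);
}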
--- a/usr/src/uts/common/fs/swapfs/swap_vnops.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/fs/swapfs/swap_vnops.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -167,7 +167,7 @@ int upgrade = 0; SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n", - vp, off, len, 0, 0); + vp, off, len, 0, 0); /* * Until there is a call-back mechanism to cause SEGKP @@ -247,8 +247,10 @@ mutex_enter(ahm); ap = swap_anon(vp, off); - if (ap == NULL) - panic("swap_getapage: null anon"); + if (ap == NULL) { + panic("swap_getapage:" + " null anon"); + } if (ap->an_pvp == pvp && ap->an_poff == poff) { @@ -298,7 +300,7 @@ pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw); } TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE, - "swapfs getapage:pp %p vp %p off %llx", pp, vp, off); + "swapfs getapage:pp %p vp %p off %llx", pp, vp, off); return (err); } @@ -340,7 +342,7 @@ ASSERT(nreloc != NULL); ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */ SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n", - vp, off, len, 0, 0); + vp, off, len, 0, 0); /* * If we are not using a preallocated page then we know one already @@ -384,7 +386,7 @@ pl[1] = NULL; if (page_pptonum(pp) & (page_get_pagecnt(conpp->p_szc) - 1)) - cmn_err(CE_PANIC, "swap_getconpage: no root"); + cmn_err(CE_PANIC, "swap_getconpage: no root"); } return (err); } @@ -415,9 +417,27 @@ "swap_getconpage: swap_getphysname failed!"); } - if (pvp) { - err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ, cr, - NULL); + if (pvp != NULL) { + err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ, + cr, NULL); + if (err == 0) { + struct anon *ap; + kmutex_t *ahm; + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + ap = swap_anon(vp, off); + if (ap == NULL) + panic("swap_getconpage: null anon"); + if (ap->an_pvp != pvp || ap->an_poff != poff) + panic("swap_getconpage: bad anon"); + + swap_phys_free(pvp, poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = NULL; + hat_setmod(pp); + mutex_exit(ahm); + } } else { pagezero(pp, 0, PAGESIZE); } @@ -435,7 +455,7 @@ ASSERT(pp->p_prev == pp); TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE, - "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off); + "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off); pl[0] = pp; pl[1] = NULL; @@ -552,7 +572,7 @@ pp = page_lookup(vp, io_off, SE_EXCL); else pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); + (flags & B_FREE) ? SE_EXCL : SE_SHARED); if (pp == NULL || pvn_getdirty(pp, flags) == 0) io_len = PAGESIZE; @@ -628,8 +648,8 @@ } SWAPFS_PRINT(SWAP_PUTP, - "swap_putapage: pp %p, vp %p, off %llx, flags %x\n", - pp, vp, pp->p_offset, flags, 0); + "swap_putapage: pp %p, vp %p, off %llx, flags %x\n", + pp, vp, pp->p_offset, flags, 0); ASSERT(PAGE_LOCKED(pp)); @@ -683,7 +703,7 @@ doff = off; dlen = PAGESIZE; if (err = swap_newphysname(vp, off, &doff, &dlen, - &pvp, &poff)) { + &pvp, &poff)) { swap_otherfail++; swap_otherpages += btop(klsz); hat_setmod(pp); @@ -715,7 +735,7 @@ } err = VOP_PAGEIO(klvp, pplist, klstart, klsz, - B_WRITE | flags, cr, NULL); + B_WRITE | flags, cr, NULL); if ((flags & B_ASYNC) == 0) pvn_write_done(pp, ((err) ? 
B_ERROR : 0) | B_WRITE | flags); @@ -727,8 +747,8 @@ } out: TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE, - "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx", - vp, klvp, klstart, klsz); + "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx", + vp, klvp, klstart, klsz); if (err && err != ENOMEM) cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err); if (lenp)
--- a/usr/src/uts/common/io/dump.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/io/dump.c	Thu May 22 22:23:49 2008 -0700
@@ -116,13 +116,12 @@
 	 * of these counters.
 	 */
 	dumpsize_in_pages = (physinstalled - obp_pages -
-			availrmem -
-			anon_segkp_pages_locked -
-			k_anoninfo.ani_mem_resv -
-			segvn_pages_locked -
-			pages_locked -
-			pages_claimed -
-			pages_useclaim);
+	    availrmem -
+	    anon_segkp_pages_locked -
+	    k_anoninfo.ani_mem_resv -
+	    pages_locked -
+	    pages_claimed -
+	    pages_useclaim);
 
 	/*
 	 * Protect against vm vagaries.
--- a/usr/src/uts/common/os/kstat_fr.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/os/kstat_fr.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -494,7 +494,7 @@ */ kstat_chain_id = 0; ksp = kstat_create("unix", 0, "kstat_headers", "kstat", KSTAT_TYPE_RAW, - 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE); + 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE); if (ksp) { ksp->ks_lock = &kstat_chain_lock; ksp->ks_update = header_kstat_update; @@ -505,35 +505,35 @@ } ksp = kstat_create("unix", 0, "kstat_types", "kstat", - KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0); + KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0); if (ksp) { int i; kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); for (i = 0; i < KSTAT_NUM_TYPES; i++) { kstat_named_init(&kn[i], kstat_data_type[i].name, - KSTAT_DATA_ULONG); + KSTAT_DATA_ULONG); kn[i].value.ul = i; } kstat_install(ksp); } ksp = kstat_create("unix", 0, "sysinfo", "misc", KSTAT_TYPE_RAW, - sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL); + sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &sysinfo; kstat_install(ksp); } ksp = kstat_create("unix", 0, "vminfo", "vm", KSTAT_TYPE_RAW, - sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL); + sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &vminfo; kstat_install(ksp); } ksp = kstat_create("unix", 0, "segmap", "vm", KSTAT_TYPE_NAMED, - segmapcnt_ndata, KSTAT_FLAG_VIRTUAL); + segmapcnt_ndata, KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) segmapcnt_ptr; ksp->ks_update = segmap_kstat_update; @@ -541,7 +541,7 @@ } ksp = kstat_create("unix", 0, "biostats", "misc", KSTAT_TYPE_NAMED, - biostats_ndata, KSTAT_FLAG_VIRTUAL); + biostats_ndata, KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) biostats_ptr; kstat_install(ksp); @@ -549,7 +549,7 @@ #ifdef VAC ksp = kstat_create("unix", 0, "flushmeter", "hat", KSTAT_TYPE_RAW, - sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL); + sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &flush_cnt; kstat_install(ksp); @@ -557,15 +557,15 @@ #endif /* VAC */ ksp = kstat_create("unix", 0, "var", "misc", KSTAT_TYPE_RAW, - sizeof (struct var), KSTAT_FLAG_VIRTUAL); + sizeof (struct var), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &v; kstat_install(ksp); } ksp = kstat_create("unix", 0, "system_misc", "misc", KSTAT_TYPE_NAMED, - sizeof (system_misc_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); + sizeof (system_misc_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &system_misc_kstat; ksp->ks_update = system_misc_kstat_update; @@ -573,8 +573,8 @@ } ksp = kstat_create("unix", 0, "system_pages", "pages", KSTAT_TYPE_NAMED, - sizeof (system_pages_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); + sizeof (system_pages_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &system_pages_kstat; ksp->ks_update = system_pages_kstat_update; @@ -911,9 +911,9 @@ * user explicit page locking. 
*/ system_pages_kstat.pp_kernel.value.ul = (ulong_t)(physinstalled - - obp_pages - availrmem - k_anoninfo.ani_mem_resv - - anon_segkp_pages_locked - segvn_pages_locked - - pages_locked - pages_claimed - pages_useclaim); + obp_pages - availrmem - k_anoninfo.ani_mem_resv - + anon_segkp_pages_locked - pages_locked - + pages_claimed - pages_useclaim); return (0); } @@ -923,7 +923,7 @@ const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags) { return (kstat_create_zone(ks_module, ks_instance, ks_name, ks_class, - ks_type, ks_ndata, ks_flags, ALL_ZONES)); + ks_type, ks_ndata, ks_flags, ALL_ZONES)); } /* @@ -966,8 +966,8 @@ */ if (ks_type >= KSTAT_NUM_TYPES) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "invalid kstat type %d", - ks_module, ks_instance, ks_name, ks_type); + "invalid kstat type %d", + ks_module, ks_instance, ks_name, ks_type); return (NULL); } @@ -978,8 +978,8 @@ if ((ks_flags & KSTAT_FLAG_PERSISTENT) && (ks_flags & KSTAT_FLAG_VIRTUAL)) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "cannot create persistent virtual kstat", - ks_module, ks_instance, ks_name); + "cannot create persistent virtual kstat", + ks_module, ks_instance, ks_name); return (NULL); } @@ -990,8 +990,8 @@ if ((ks_flags & KSTAT_FLAG_VAR_SIZE) && !(ks_flags & KSTAT_FLAG_VIRTUAL)) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "cannot create variable-size physical kstat", - ks_module, ks_instance, ks_name); + "cannot create variable-size physical kstat", + ks_module, ks_instance, ks_name); return (NULL); } @@ -1001,10 +1001,10 @@ if (ks_ndata < kstat_data_type[ks_type].min_ndata || ks_ndata > kstat_data_type[ks_type].max_ndata) { cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "ks_ndata=%d out of range [%d, %d]", - ks_module, ks_instance, ks_name, (int)ks_ndata, - kstat_data_type[ks_type].min_ndata, - kstat_data_type[ks_type].max_ndata); + "ks_ndata=%d out of range [%d, %d]", + ks_module, ks_instance, ks_name, (int)ks_ndata, + kstat_data_type[ks_type].min_ndata, + kstat_data_type[ks_type].max_ndata); return (NULL); } @@ -1036,8 +1036,8 @@ */ kstat_rele(ksp); cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " - "invalid reactivation of dormant kstat", - ks_module, ks_instance, ks_name); + "invalid reactivation of dormant kstat", + ks_module, ks_instance, ks_name); return (NULL); } /* @@ -1056,8 +1056,8 @@ e = kstat_alloc(ks_flags & KSTAT_FLAG_VIRTUAL ? 0 : ks_data_size); if (e == NULL) { cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): " - "insufficient kernel memory", - ks_module, ks_instance, ks_name); + "insufficient kernel memory", + ks_module, ks_instance, ks_name); return (NULL); }
--- a/usr/src/uts/common/os/mem_cage.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/mem_cage.c	Thu May 22 22:23:49 2008 -0700
@@ -1271,6 +1271,11 @@
 				}
 			}
 		}
+
+		if (NOMEMWAIT() && freemem < minfree) {
+			return (KCT_CRIT);
+		}
+
 	}
 	return (KCT_NONCRIT);
 }
--- a/usr/src/uts/common/os/schedctl.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/schedctl.c	Thu May 22 22:23:49 2008 -0700
@@ -676,6 +676,7 @@
 	 * we have to free everything rather than letting as_free
 	 * do the work.
 	 */
+	anonmap_purge(amp);
 	anon_free(amp->ahp, 0, PAGESIZE);
 	ANON_LOCK_EXIT(&amp->a_rwlock);
 	anonmap_free(amp);
--- a/usr/src/uts/common/os/shm.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/shm.c	Thu May 22 22:23:49 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -718,9 +718,8 @@
 
 	if (error = shmem_lock(sp, sp->shm_amp)) {
 		ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
-		cmn_err(CE_NOTE,
-		    "shmctl - couldn't lock %ld pages into "
-		    "memory", sp->shm_amp->size);
+		cmn_err(CE_NOTE, "shmctl - couldn't lock %ld"
+		    " pages into memory", sp->shm_amp->size);
 		ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
 		error = ENOMEM;
 		sp->shm_lkcnt--;
@@ -1253,13 +1252,14 @@
 	 * Free up the anon_map.
 	 */
 	lgrp_shm_policy_fini(amp, NULL);
+	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+	anonmap_purge(amp);
 	if (amp->a_szc != 0) {
-		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 		anon_shmap_free_pages(amp, 0, amp->size);
-		ANON_LOCK_EXIT(&amp->a_rwlock);
 	} else {
 		anon_free(amp->ahp, 0, amp->size);
 	}
+	ANON_LOCK_EXIT(&amp->a_rwlock);
 	anon_unresv_zone(amp->swresv, zone);
 	anonmap_free(amp);
 }
--- a/usr/src/uts/common/os/vm_pageout.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/os/vm_pageout.c	Thu May 22 22:23:49 2008 -0700
@@ -531,7 +531,7 @@
 	if (freemem < lotsfree + needfree + kmem_reapahead)
 		kmem_reap();
 
-	if (freemem < lotsfree + needfree + seg_preapahead)
+	if (freemem < lotsfree + needfree)
 		seg_preap();
 
 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
@@ -957,9 +957,10 @@
 	 *
 	 * NOTE: These optimizations assume that reads are atomic.
 	 */
-top:
-	if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) ||
-	    hat_page_checkshare(pp, po_share) || PAGE_LOCKED(pp)) {
+
+	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
+	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
+	    hat_page_checkshare(pp, po_share)) {
 		return (-1);
 	}
--- a/usr/src/uts/common/vm/anon.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/anon.h Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -139,7 +139,6 @@ * Declaration for the Global counters to accurately * track the kernel foot print in memory. */ -extern pgcnt_t segvn_pages_locked; extern pgcnt_t pages_locked; extern pgcnt_t pages_claimed; extern pgcnt_t pages_useclaim; @@ -278,7 +277,7 @@ * 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE * the amp->szc could be anything in [0, page_num_pagesizes() - 1]. */ -struct anon_map { +typedef struct anon_map { krwlock_t a_rwlock; /* protect anon_map and anon array */ size_t size; /* size in bytes mapped by the anon array */ struct anon_hdr *ahp; /* anon array header pointer, containing */ @@ -288,7 +287,13 @@ ushort_t a_szc; /* max szc among shared processes */ void *locality; /* lgroup locality info */ struct kshmid *a_sp; /* kshmid if amp backs sysV, or NULL */ -}; + int a_purgewait; /* somebody waits for slocks to go away */ + kcondvar_t a_purgecv; /* cv for waiting for slocks to go away */ + kmutex_t a_purgemtx; /* mutex for anonmap_purge() */ + spgcnt_t a_softlockcnt; /* number of pages locked in pcache */ + kmutex_t a_pmtx; /* protects amp's pcache list */ + pcache_link_t a_phead; /* head of amp's pcache list */ +} amp_t; #ifdef _KERNEL @@ -303,6 +308,9 @@ #define ANON_LOCK_ENTER(lock, type) rw_enter((lock), (type)) #define ANON_LOCK_EXIT(lock) rw_exit((lock)) +#define ANON_LOCK_HELD(lock) RW_LOCK_HELD((lock)) +#define ANON_READ_HELD(lock) RW_READ_HELD((lock)) +#define ANON_WRITE_HELD(lock) RW_WRITE_HELD((lock)) #define ANON_ARRAY_HASH(amp, idx)\ ((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\ @@ -334,9 +342,9 @@ /* * Swap slots currently available for reservation */ -#define CURRENT_TOTAL_AVAILABLE_SWAP \ +#define CURRENT_TOTAL_AVAILABLE_SWAP \ ((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + \ - MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) struct k_anoninfo { pgcnt_t ani_max; /* total reservable slots on phys */ @@ -392,6 +400,8 @@ extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t, int); extern void anonmap_free(struct anon_map *); +extern void anonmap_purge(struct anon_map *); +extern void anon_swap_free(struct anon *, struct page *); extern void anon_decref(struct anon *); extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *); extern pgcnt_t anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
--- a/usr/src/uts/common/vm/as.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/as.h Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -131,23 +131,26 @@ #define AS_CLAIMGAP 0x40 #define AS_UNMAPWAIT 0x20 #define AS_NEEDSPURGE 0x10 /* mostly for seg_nf, see as_purge() */ +#define AS_NOUNMAPWAIT 0x02 #define AS_BUSY 0x01 /* needed by XHAT framework */ #define AS_ISPGLCK(as) ((as)->a_flags & AS_PAGLCK) #define AS_ISCLAIMGAP(as) ((as)->a_flags & AS_CLAIMGAP) #define AS_ISUNMAPWAIT(as) ((as)->a_flags & AS_UNMAPWAIT) #define AS_ISBUSY(as) ((as)->a_flags & AS_BUSY) - +#define AS_ISNOUNMAPWAIT(as) ((as)->a_flags & AS_NOUNMAPWAIT) #define AS_SETPGLCK(as) ((as)->a_flags |= AS_PAGLCK) #define AS_SETCLAIMGAP(as) ((as)->a_flags |= AS_CLAIMGAP) #define AS_SETUNMAPWAIT(as) ((as)->a_flags |= AS_UNMAPWAIT) #define AS_SETBUSY(as) ((as)->a_flags |= AS_BUSY) +#define AS_SETNOUNMAPWAIT(as) ((as)->a_flags |= AS_NOUNMAPWAIT) #define AS_CLRPGLCK(as) ((as)->a_flags &= ~AS_PAGLCK) #define AS_CLRCLAIMGAP(as) ((as)->a_flags &= ~AS_CLAIMGAP) #define AS_CLRUNMAPWAIT(as) ((as)->a_flags &= ~AS_UNMAPWAIT) #define AS_CLRBUSY(as) ((as)->a_flags &= ~AS_BUSY) +#define AS_CLRNOUNMAPWAIT(as) ((as)->a_flags &= ~AS_NOUNMAPWAIT) #define AS_TYPE_64BIT(as) \ (((as)->a_userlimit > (caddr_t)UINT32_MAX) ? 1 : 0) @@ -281,8 +284,6 @@ size_t size, enum seg_rw rw); void as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, enum seg_rw rw); -void as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, - size_t size, enum seg_rw rw); int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, boolean_t wait); int as_set_default_lpsize(struct as *as, caddr_t addr, size_t size);
--- a/usr/src/uts/common/vm/seg.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg.h Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,6 +55,8 @@ * VM - Segments. */ +struct anon_map; + /* * kstat statistics for segment advise */ @@ -93,7 +95,12 @@ * write locked. */ -struct seg { +typedef struct pcache_link { + struct pcache_link *p_lnext; + struct pcache_link *p_lprev; +} pcache_link_t; + +typedef struct seg { caddr_t s_base; /* base virtual address */ size_t s_size; /* size in bytes */ uint_t s_szc; /* max page size code */ @@ -102,7 +109,9 @@ avl_node_t s_tree; /* AVL tree links to segs in this as */ struct seg_ops *s_ops; /* ops vector: see below */ void *s_data; /* private data for instance */ -}; + kmutex_t s_pmtx; /* protects seg's pcache list */ + pcache_link_t s_phead; /* head of seg's pcache list */ +} seg_t; #define S_PURGE (0x01) /* seg should be purged in as_gap() */ @@ -136,6 +145,7 @@ }; #ifdef _KERNEL + /* * Generic segment operations */ @@ -149,28 +159,41 @@ /* * functions for pagelock cache support */ -extern void seg_ppurge(struct seg *seg); -extern void seg_ppurge_seg(int (*callback)()); -extern void seg_pinactive(struct seg *seg, caddr_t addr, size_t len, - struct page **pp, enum seg_rw rw, int (*callback)()); -extern int seg_pinsert_check(struct seg *seg, size_t len, uint_t flags); -extern int seg_pinsert(struct seg *seg, caddr_t addr, size_t len, - struct page **pp, enum seg_rw rw, uint_t flags, - int (*callback)()); -extern struct page **seg_plookup(struct seg *seg, caddr_t addr, - size_t len, enum seg_rw rw); +typedef int (*seg_preclaim_cbfunc_t)(void *, caddr_t, size_t, + struct page **, enum seg_rw, int); + +extern struct page **seg_plookup(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, enum seg_rw rw, uint_t flags); +extern void seg_pinactive(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, struct page **pp, enum seg_rw rw, + uint_t flags, seg_preclaim_cbfunc_t callback); + +extern void seg_ppurge(struct seg *seg, struct anon_map *amp, + uint_t flags); +extern void seg_ppurge_wiredpp(struct page **pp); + +extern int seg_pinsert_check(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, uint_t flags); +extern int seg_pinsert(struct seg *seg, struct anon_map *amp, + caddr_t addr, size_t len, size_t wlen, struct page **pp, enum seg_rw rw, + uint_t flags, seg_preclaim_cbfunc_t callback); + extern void seg_pasync_thread(void); extern void seg_preap(void); extern int seg_p_disable(void); extern void seg_p_enable(void); -extern int seg_preapahead; -extern segadvstat_t segadvstat; +extern segadvstat_t segadvstat; + /* - * Flags for pagelock cache support + * Flags for pagelock cache support. + * Flags argument is passed as uint_t to pcache routines. upper 16 bits of + * the flags argument are reserved for alignment page shift when SEGP_PSHIFT + * is set. */ -#define SEGP_ASYNC_FLUSH 0x1 /* flushed by async thread */ -#define SEGP_FORCE_WIRED 0x2 /* skip check against seg_pwindow */ +#define SEGP_FORCE_WIRED 0x1 /* skip check against seg_pwindow */ +#define SEGP_AMP 0x2 /* anon map's pcache entry */ +#define SEGP_PSHIFT 0x4 /* addr pgsz shift for hash function */ /* * Return values for seg_pinsert and seg_pinsert_check functions.
--- a/usr/src/uts/common/vm/seg_enum.h	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/vm/seg_enum.h	Thu May 22 22:23:49 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -62,8 +61,7 @@
 */
 enum lock_type {
	L_PAGELOCK,		/* lock pages */
-	L_PAGEUNLOCK,		/* unlock pages */
-	L_PAGERECLAIM		/* reclaim pages */
+	L_PAGEUNLOCK		/* unlock pages */
 };
 
 /*
--- a/usr/src/uts/common/vm/seg_kmem.c	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/vm/seg_kmem.c	Thu May 22 22:23:49 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -674,9 +674,6 @@
 	    BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 		return (SEGOP_PAGELOCK(segkp, addr, len, ppp, type, rw));
 
-	if (type == L_PAGERECLAIM)
-		return (ENOTSUP);
-
 	npages = btopr(len);
 	nb = sizeof (page_t *) * npages;
--- a/usr/src/uts/common/vm/seg_spt.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg_spt.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -174,8 +174,8 @@ }; static void segspt_purge(struct seg *seg); -static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **, - enum seg_rw); +static int segspt_reclaim(void *, caddr_t, size_t, struct page **, + enum seg_rw, int); static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, page_t **ppa); @@ -833,6 +833,7 @@ uint_t szc; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); /* * We want to lock/unlock the entire ISM segment. Therefore, @@ -857,8 +858,8 @@ if (type == L_PAGEUNLOCK) { ASSERT(sptd->spt_ppa != NULL); - seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); /* * If someone is blocked while unmapping, we purge @@ -868,17 +869,16 @@ * raw async i/o is still in progress or where a thread * exits on data fault in a multithreaded application. */ - if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { + if ((sptd->spt_flags & DISM_PPA_CHANGED) || + (AS_ISUNMAPWAIT(seg->s_as) && + shmd->shm_softlockcnt > 0)) { segspt_purge(seg); } return (0); - } else if (type == L_PAGERECLAIM) { - ASSERT(sptd->spt_ppa != NULL); - (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot); - return (0); } + /* The L_PAGELOCK case ... */ + if (sptd->spt_flags & DISM_PPA_CHANGED) { segspt_purge(seg); /* @@ -893,17 +893,17 @@ * First try to find pages in segment page cache, without * holding the segment lock. */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa != NULL); ASSERT(sptd->spt_ppa == pplist); ppa = sptd->spt_ppa; for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { - seg_pinactive(seg, seg->s_base, + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, ppa, - sptd->spt_prot, segspt_reclaim); + S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } @@ -923,13 +923,12 @@ return (0); } - /* The L_PAGELOCK case... 
*/ mutex_enter(&sptd->spt_lock); /* * try to find pages in segment page cache with mutex */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa != NULL); ASSERT(sptd->spt_ppa == pplist); @@ -937,9 +936,9 @@ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { mutex_exit(&sptd->spt_lock); - seg_pinactive(seg, seg->s_base, + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, ppa, - sptd->spt_prot, segspt_reclaim); + S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } @@ -959,8 +958,8 @@ *ppp = &(sptd->spt_ppa[pg_idx]); return (0); } - if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == - SEGP_FAIL) { + if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, + SEGP_FORCE_WIRED) == SEGP_FAIL) { mutex_exit(&sptd->spt_lock); *ppp = NULL; return (ENOTSUP); @@ -1038,16 +1037,18 @@ } ANON_LOCK_EXIT(&->a_rwlock); - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + claim_availrmem) { + if (claim_availrmem) { + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + claim_availrmem) { + mutex_exit(&freemem_lock); + ret = ENOTSUP; + claim_availrmem = 0; + goto insert_fail; + } else { + availrmem -= claim_availrmem; + } mutex_exit(&freemem_lock); - ret = FC_MAKE_ERR(ENOMEM); - claim_availrmem = 0; - goto insert_fail; - } else { - availrmem -= claim_availrmem; } - mutex_exit(&freemem_lock); sptd->spt_ppa = pl; } else { @@ -1059,8 +1060,8 @@ ASSERT(pl != NULL); - ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, - pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH, + ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); if (ret == SEGP_FAIL) { /* @@ -1089,8 +1090,9 @@ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { mutex_exit(&sptd->spt_lock); - seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, - pl, sptd->spt_prot, segspt_reclaim); + seg_pinactive(seg, NULL, seg->s_base, + sptd->spt_amp->size, + pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } @@ -1113,7 +1115,7 @@ * to the requested addr, i.e. pg_idx. */ *ppp = &(sptd->spt_ppa[pg_idx]); - return (ret); + return (0); insert_fail: /* @@ -1125,9 +1127,11 @@ mutex_exit(&sptd->spt_lock); if (pl_built) { - mutex_enter(&freemem_lock); - availrmem += claim_availrmem; - mutex_exit(&freemem_lock); + if (claim_availrmem) { + mutex_enter(&freemem_lock); + availrmem += claim_availrmem; + mutex_exit(&freemem_lock); + } /* * We created pl and we need to destroy it. @@ -1184,6 +1188,8 @@ u_offset_t off; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); + /* * We want to lock/unlock the entire ISM segment. 
Therefore, @@ -1213,8 +1219,8 @@ ASSERT(sptd->spt_ppa != NULL); - seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); /* * If someone is blocked while unmapping, we purge @@ -1228,20 +1234,16 @@ segspt_purge(seg); } return (0); - } else if (type == L_PAGERECLAIM) { - ASSERT(sptd->spt_ppa != NULL); + } - (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_ppa, sptd->spt_prot); - return (0); - } + /* The L_PAGELOCK case... */ /* * First try to find pages in segment page cache, without * holding the segment lock. */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa == pplist); ASSERT(sptd->spt_ppa[page_index]); @@ -1254,14 +1256,13 @@ return (0); } - /* The L_PAGELOCK case... */ mutex_enter(&sptd->spt_lock); /* * try to find pages in segment page cache */ - pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, - sptd->spt_prot); + pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, + S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa == pplist); /* @@ -1274,8 +1275,8 @@ return (0); } - if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == - SEGP_FAIL) { + if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, + SEGP_FORCE_WIRED) == SEGP_FAIL) { mutex_exit(&sptd->spt_lock); *ppp = NULL; return (ENOTSUP); @@ -1338,8 +1339,9 @@ ASSERT(pl != NULL); - ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, - pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim); + ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, + sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, + segspt_reclaim); if (ret == SEGP_FAIL) { /* * seg_pinsert failed. We return @@ -1375,7 +1377,7 @@ * to the requested addr, i.e. page_index. */ *ppp = &(sptd->spt_ppa[page_index]); - return (ret); + return (0); insert_fail: /* @@ -1419,13 +1421,14 @@ static void segspt_purge(struct seg *seg) { - seg_ppurge(seg); + seg_ppurge(seg, NULL, SEGP_FORCE_WIRED); } static int -segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, - enum seg_rw rw) +segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw, int async) { + struct seg *seg = (struct seg *)ptag; struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg; struct spt_data *sptd; @@ -1442,6 +1445,8 @@ ASSERT(sptd->spt_pcachecnt != 0); ASSERT(sptd->spt_ppa == pplist); ASSERT(npages == btopr(sptd->spt_amp->size)); + ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + /* * Acquire the lock on the dummy seg and destroy the * ppa array IF this is the last pcachecnt. @@ -1462,7 +1467,7 @@ free_availrmem++; page_unlock(pplist[i]); } - if (sptd->spt_flags & SHM_PAGEABLE) { + if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) { mutex_enter(&freemem_lock); availrmem += free_availrmem; mutex_exit(&freemem_lock); @@ -1482,14 +1487,41 @@ done = 1; } mutex_exit(&sptd->spt_lock); + + /* + * If we are pcache async thread or called via seg_ppurge_wiredpp() we + * may not hold AS lock (in this case async argument is not 0). This + * means if softlockcnt drops to 0 after the decrement below address + * space may get freed. 
We can't allow it since after softlock + * derement to 0 we still need to access as structure for possible + * wakeup of unmap waiters. To prevent the disappearance of as we take + * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes + * this mutex as a barrier to make sure this routine completes before + * segment is freed. + * + * The second complication we have to deal with in async case is a + * possibility of missed wake up of unmap wait thread. When we don't + * hold as lock here we may take a_contents lock before unmap wait + * thread that was first to see softlockcnt was still not 0. As a + * result we'll fail to wake up an unmap wait thread. To avoid this + * race we set nounmapwait flag in as structure if we drop softlockcnt + * to 0 if async is not 0. unmapwait thread + * will not block if this flag is set. + */ + if (async) + mutex_enter(&shmd->shm_segfree_syncmtx); + /* * Now decrement softlockcnt. */ + ASSERT(shmd->shm_softlockcnt > 0); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1); if (shmd->shm_softlockcnt <= 0) { - if (AS_ISUNMAPWAIT(seg->s_as)) { + if (async || AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); + if (async) + AS_SETNOUNMAPWAIT(seg->s_as); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); @@ -1497,6 +1529,10 @@ mutex_exit(&seg->s_as->a_contents); } } + + if (async) + mutex_exit(&shmd->shm_segfree_syncmtx); + return (done); } @@ -1604,6 +1640,7 @@ softlock_decrement: npages = btopr(len); + ASSERT(shmd->shm_softlockcnt >= npages); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); if (shmd->shm_softlockcnt == 0) { /* @@ -1646,6 +1683,8 @@ (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, NULL, 0, seg->s_size); + mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); + seg->s_data = (void *)shmd; seg->s_ops = &segspt_shmops; seg->s_szc = shmd->shm_sptseg->s_szc; @@ -1741,6 +1780,15 @@ kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); shmd->shm_vpage = NULL; } + + /* + * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's + * still working with this segment without holding as lock. 
+ */ + ASSERT(shmd->shm_softlockcnt == 0); + mutex_enter(&shmd->shm_segfree_syncmtx); + mutex_destroy(&shmd->shm_segfree_syncmtx); + kmem_free(shmd, sizeof (*shmd)); } @@ -1834,14 +1882,6 @@ case F_SOFTLOCK: - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + npages) { - mutex_exit(&freemem_lock); - return (FC_MAKE_ERR(ENOMEM)); - } else { - availrmem -= npages; - } - mutex_exit(&freemem_lock); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); /* * Fall through to the F_INVAL case to load up the hat layer @@ -1858,9 +1898,6 @@ err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); if (err != 0) { if (type == F_SOFTLOCK) { - mutex_enter(&freemem_lock); - availrmem += npages; - mutex_exit(&freemem_lock); atomic_add_long((ulong_t *)( &(shmd->shm_softlockcnt)), -npages); } @@ -1934,10 +1971,6 @@ case F_SOFTUNLOCK: - mutex_enter(&freemem_lock); - availrmem += npages; - mutex_exit(&freemem_lock); - /* * This is a bit ugly, we pass in the real seg pointer, * but the segspt_addr is the virtual address within the @@ -2616,6 +2649,7 @@ int kernel; anon_sync_obj_t cookie; rctl_qty_t unlocked = 0; + page_t **ppa; amp = sptd->spt_amp; mutex_enter(&sptd->spt_lock); @@ -2661,12 +2695,15 @@ } } ANON_LOCK_EXIT(&->a_rwlock); - if (sptd->spt_ppa != NULL) + if ((ppa = sptd->spt_ppa) != NULL) sptd->spt_flags |= DISM_PPA_CHANGED; mutex_exit(&sptd->spt_lock); rctl_decr_locked_mem(NULL, proj, unlocked, 0); mutex_exit(&sp->shm_mlock); + + if (ppa != NULL) + seg_ppurge_wiredpp(ppa); } return (sts); } @@ -2748,6 +2785,7 @@ ushort_t gen; clock_t end_lbolt; int writer; + page_t **ppa; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); @@ -2759,7 +2797,7 @@ pg_idx = seg_page(seg, addr); mutex_enter(&sptd->spt_lock); - if (sptd->spt_ppa == NULL) { + if ((ppa = sptd->spt_ppa) == NULL) { mutex_exit(&sptd->spt_lock); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); anon_disclaim(amp, pg_idx, len); @@ -2775,7 +2813,7 @@ /* * Purge all DISM cached pages */ - seg_ppurge_seg(segspt_reclaim); + seg_ppurge_wiredpp(ppa); /* * Drop the AS_LOCK so that other threads can grab it
--- a/usr/src/uts/common/vm/seg_spt.h	Thu May 22 22:08:42 2008 -0700
+++ b/usr/src/uts/common/vm/seg_spt.h	Thu May 22 22:23:49 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -74,7 +74,7 @@
 typedef struct shm_data {
 	struct as	*shm_sptas;
 	struct anon_map	*shm_amp;
-	size_t		shm_softlockcnt; /* # outstanding lock operations */
+	spgcnt_t	shm_softlockcnt; /* # outstanding lock operations */
 	struct seg	*shm_sptseg;	/* pointer to spt segment */
 	char		*shm_vpage;	/* indicating locked pages */
 	spgcnt_t	shm_lckpgs;	/* # of locked pages per attached seg */
@@ -82,6 +82,7 @@
 	 * Memory allocation policy after shmat()
 	 */
 	lgrp_mem_policy_info_t	shm_policy_info;
+	kmutex_t	shm_segfree_syncmtx; /* barrier lock for segspt_shmfree() */
 } shm_data_t;
 
 #define	DISM_PG_LOCKED		0x1	/* DISM page is locked */
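The shm_segfree_syncmtx added above (and the analogous segfree_syncmtx in segvn_data later in this changeset) is used purely as a barrier: the asynchronous pcache reclaim path holds it across its final accesses to the segment data, and the segment-free path, which only runs once softlockcnt has dropped to zero, acquires and releases it before freeing, so a reclaim still running without the address-space lock cannot race with the free. A minimal user-land sketch of that barrier idiom follows; it assumes POSIX threads, and the names obj, reclaim_async and obj_free are illustrative rather than taken from the kernel sources.

/*
 * Barrier-mutex idiom (sketch): the async worker holds the mutex across
 * its last accesses to the object; the destructor takes and releases the
 * same mutex, so any worker already past the zero-count check must finish
 * before the object is freed.
 */
#include <pthread.h>
#include <stdlib.h>

struct obj {
        pthread_mutex_t segfree_syncmtx;        /* barrier for async reclaim */
        long            softlockcnt;            /* outstanding work items */
};

void
reclaim_async(struct obj *o)
{
        pthread_mutex_lock(&o->segfree_syncmtx);
        o->softlockcnt--;                       /* final access to the object */
        pthread_mutex_unlock(&o->segfree_syncmtx);
}

void
obj_free(struct obj *o)
{
        /* wait out any reclaim that is still inside the barrier */
        pthread_mutex_lock(&o->segfree_syncmtx);
        pthread_mutex_unlock(&o->segfree_syncmtx);
        pthread_mutex_destroy(&o->segfree_syncmtx);
        free(o);
}

int
main(void)
{
        struct obj *o = malloc(sizeof (*o));

        pthread_mutex_init(&o->segfree_syncmtx, NULL);
        o->softlockcnt = 1;
        reclaim_async(o);
        obj_free(o);
        return (0);
}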
--- a/usr/src/uts/common/vm/seg_vn.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg_vn.c Thu May 22 22:23:49 2008 -0700 @@ -162,6 +162,11 @@ size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ +size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */ +size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */ +uint_t segvn_pglock_comb_bshift; +size_t segvn_pglock_comb_palign; + static int segvn_concat(struct seg *, struct seg *, int); static int segvn_extend_prev(struct seg *, struct seg *, struct segvn_crargs *, size_t); @@ -180,13 +185,15 @@ caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, u_offset_t, struct vpage *, page_t **, uint_t, - enum fault_type, enum seg_rw, int, int); + enum fault_type, enum seg_rw, int); static void segvn_vpage(struct seg *); static size_t segvn_count_swap_by_vpages(struct seg *); static void segvn_purge(struct seg *seg); -static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, - enum seg_rw); +static int segvn_reclaim(void *, caddr_t, size_t, struct page **, + enum seg_rw, int); +static int shamp_reclaim(void *, caddr_t, size_t, struct page **, + enum seg_rw, int); static int sameprot(struct seg *, caddr_t, size_t); @@ -199,9 +206,6 @@ static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t, size_t, void *, u_offset_t); -static int segvn_slock_anonpages(page_t *, int); -static void segvn_sunlock_anonpages(page_t *, int); - static struct kmem_cache *segvn_cache; static struct kmem_cache **segvn_szc_cache; @@ -212,7 +216,7 @@ ulong_t fullszcpages[10]; ulong_t relocatepages[3]; ulong_t fltanpages[17]; - ulong_t pagelock[3]; + ulong_t pagelock[2]; ulong_t demoterange[3]; } segvnvmstats; #endif /* VM_STATS */ @@ -240,7 +244,7 @@ struct segvn_data *svd = buf; rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); - mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); svd->svn_trnext = svd->svn_trprev = NULL; return (0); } @@ -252,7 +256,7 @@ struct segvn_data *svd = buf; rw_destroy(&svd->lock); - mutex_destroy(&svd->segp_slock); + mutex_destroy(&svd->segfree_syncmtx); } /*ARGSUSED*/ @@ -467,6 +471,13 @@ NULL, 0, &p0, TS_RUN, minclsyspri); } #endif + + if (!ISP2(segvn_pglock_comb_balign) || + segvn_pglock_comb_balign < PAGESIZE) { + segvn_pglock_comb_balign = 1UL << 16; /* 64K */ + } + segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1; + segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign); } #define SEGVN_PAGEIO ((void *)0x1) @@ -786,6 +797,8 @@ svd->pageadvice = 0; svd->flags = (ushort_t)a->flags; svd->softlockcnt = 0; + svd->softlockcnt_sbase = 0; + svd->softlockcnt_send = 0; svd->rcookie = HAT_INVALID_REGION_COOKIE; svd->pageswap = 0; @@ -991,7 +1004,7 @@ (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || incompat(type) || incompat(cred) || incompat(flags) || seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || - (svd2->softlockcnt > 0)) + (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0) return (-1); #undef incompat @@ -1232,7 +1245,7 @@ if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || (!svd1->pageprot && (svd1->prot != a->prot)) || svd1->type != a->type || svd1->flags != a->flags || - seg1->s_szc != a->szc) + seg1->s_szc != a->szc || svd1->softlockcnt_send > 0) return (-1); /* vp == NULL implies zfod, offset doesn't matter */ @@ -1353,7 +1366,7 @@ if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 
(!svd2->pageprot && (svd2->prot != a->prot)) || svd2->type != a->type || svd2->flags != a->flags || - seg2->s_szc != a->szc) + seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0) return (-1); /* vp == NULL implies zfod, offset doesn't matter */ if (svd2->vp != NULL && @@ -1498,6 +1511,8 @@ newsvd->pageswap = svd->pageswap; newsvd->flags = svd->flags; newsvd->softlockcnt = 0; + newsvd->softlockcnt_sbase = 0; + newsvd->softlockcnt_send = 0; newsvd->policy_info = svd->policy_info; newsvd->rcookie = HAT_INVALID_REGION_COOKIE; @@ -1797,6 +1812,15 @@ retry: if (svd->softlockcnt > 0) { ASSERT(svd->tr_state == SEGVN_TR_OFF); + + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + return (EAGAIN); + } + /* * since we do have the writers lock nobody can fill * the cache during the purge. The flush either succeeds @@ -1946,6 +1970,16 @@ ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + if (svd->type == MAP_SHARED) { + ASSERT(amp->refcnt == 1); + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + } + /* * Free up now unused parts of anon_map array. */ if (amp->a_szc == seg->s_szc) { @@ -2040,6 +2074,18 @@ * Free up now unused parts of anon_map array. */ ulong_t an_idx = svd->anon_index + npages; + + /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + if (svd->type == MAP_SHARED) { + ASSERT(amp->refcnt == 1); + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + } + if (amp->a_szc == seg->s_szc) { if (seg->s_szc != 0) { anon_free_pages(amp->ahp, @@ -2123,6 +2169,8 @@ nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); nsvd->swresv = 0; nsvd->softlockcnt = 0; + nsvd->softlockcnt_sbase = 0; + nsvd->softlockcnt_send = 0; ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); if (svd->vp != NULL) { @@ -2173,6 +2221,18 @@ * Free up now unused parts of anon_map array. */ ulong_t an_idx = svd->anon_index + opages; + + /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + if (svd->type == MAP_SHARED) { + ASSERT(amp->refcnt == 1); + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + } + if (amp->a_szc == seg->s_szc) { if (seg->s_szc != 0) { anon_free_pages(amp->ahp, an_idx, len, @@ -2316,6 +2376,15 @@ seg->s_size); } } else { + + /* + * Shared anon map is no longer in use. Before + * freeing its pages purge all entries from + * pcache that belong to this amp. + */ + ASSERT(svd->softlockcnt == 0); + anonmap_purge(amp); + /* * Shared - anon_free the entire * anon_map's worth of stuff and @@ -2380,155 +2449,19 @@ svd->pageswap = 0; svd->cred = NULL; + /* + * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's + * still working with this segment without holding as lock (in case + * it's called by pcache async thread). 
+ */ + ASSERT(svd->softlockcnt == 0); + mutex_enter(&svd->segfree_syncmtx); + mutex_exit(&svd->segfree_syncmtx); + seg->s_data = NULL; kmem_cache_free(segvn_cache, svd); } -#ifdef DEBUG -uint32_t segvn_slock_mtbf = 0; -#endif - -ulong_t segvn_lpglck_limit = 0; - -/* - * Support routines used by segvn_pagelock() and softlock faults for anonymous - * pages to implement availrmem accounting in a way that makes sure the - * same memory is accounted just once for all softlock/pagelock purposes. - * This prevents a bug when availrmem is quickly incorrectly exhausted from - * several pagelocks to different parts of the same large page since each - * pagelock has to decrement availrmem by the size of the entire large - * page. Note those pages are not COW shared until softunlock/pageunlock so - * we don't need to use cow style accounting here. We also need to make sure - * the entire large page is accounted even if softlock range is less than the - * entire large page because large anon pages can't be demoted when any of - * constituent pages is locked. The caller calls this routine for every page_t - * it locks. The very first page in the range may not be the root page of a - * large page. For all other pages it's guaranteed we are going to visit the - * root of a particular large page before any other constituent page as we are - * locking sequential pages belonging to the same anon map. So we do all the - * locking when the root is encountered except for the very first page. Since - * softlocking is not supported (except S_READ_NOCOW special case) for vmpss - * segments and since vnode pages can be demoted without locking all - * constituent pages vnode pages don't come here. Unlocking relies on the - * fact that pagesize can't change whenever any of constituent large pages is - * locked at least SE_SHARED. This allows unlocking code to find the right - * root and decrement availrmem by the same amount it was incremented when the - * page was locked. - */ -static int -segvn_slock_anonpages(page_t *pp, int first) -{ - pgcnt_t pages; - pfn_t pfn; - uchar_t szc = pp->p_szc; - - ASSERT(PAGE_LOCKED(pp)); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - - /* - * pagesize won't change as long as any constituent page is locked. - */ - pages = page_get_pagecnt(pp->p_szc); - pfn = page_pptonum(pp); - - if (!first) { - if (!IS_P2ALIGNED(pfn, pages)) { -#ifdef DEBUG - pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; - pfn = page_pptonum(pp); - ASSERT(IS_P2ALIGNED(pfn, pages)); - ASSERT(pp->p_szc == szc); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - ASSERT(pp->p_slckcnt != 0); -#endif /* DEBUG */ - return (1); - } - } else if (!IS_P2ALIGNED(pfn, pages)) { - pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; -#ifdef DEBUG - pfn = page_pptonum(pp); - ASSERT(IS_P2ALIGNED(pfn, pages)); - ASSERT(pp->p_szc == szc); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); -#endif /* DEBUG */ - } - -#ifdef DEBUG - if (segvn_slock_mtbf && !(gethrtime() % segvn_slock_mtbf)) { - return (0); - } -#endif /* DEBUG */ - - /* - * pp is a root page. - * We haven't locked this large page yet. 
- */ - page_struct_lock(pp); - if (pp->p_slckcnt != 0) { - if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { - pp->p_slckcnt++; - page_struct_unlock(pp); - return (1); - } - page_struct_unlock(pp); - segvn_lpglck_limit++; - return (0); - } - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + pages) { - mutex_exit(&freemem_lock); - page_struct_unlock(pp); - return (0); - } - pp->p_slckcnt++; - availrmem -= pages; - mutex_exit(&freemem_lock); - page_struct_unlock(pp); - return (1); -} - -static void -segvn_sunlock_anonpages(page_t *pp, int first) -{ - pgcnt_t pages; - pfn_t pfn; - - ASSERT(PAGE_LOCKED(pp)); - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - - /* - * pagesize won't change as long as any constituent page is locked. - */ - pages = page_get_pagecnt(pp->p_szc); - pfn = page_pptonum(pp); - - if (!first) { - if (!IS_P2ALIGNED(pfn, pages)) { - return; - } - } else if (!IS_P2ALIGNED(pfn, pages)) { - pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; -#ifdef DEBUG - pfn = page_pptonum(pp); - ASSERT(IS_P2ALIGNED(pfn, pages)); -#endif /* DEBUG */ - } - ASSERT(pp->p_vnode != NULL); - ASSERT(IS_SWAPFSVP(pp->p_vnode)); - ASSERT(pp->p_slckcnt != 0); - page_struct_lock(pp); - if (--pp->p_slckcnt == 0) { - mutex_enter(&freemem_lock); - availrmem += pages; - mutex_exit(&freemem_lock); - } - page_struct_unlock(pp); -} - /* * Do a F_SOFTUNLOCK call over the range requested. The range must have * already been F_SOFTLOCK'ed. @@ -2601,19 +2534,10 @@ } TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); - if (svd->vp == NULL) { - segvn_sunlock_anonpages(pp, adr == addr); - } page_unlock(pp); } - mutex_enter(&freemem_lock); /* for availrmem */ - if (svd->vp != NULL) { - availrmem += btop(len); - } - segvn_pages_locked -= btop(len); - svd->softlockcnt -= btop(len); - mutex_exit(&freemem_lock); - if (svd->softlockcnt == 0) { + ASSERT(svd->softlockcnt >= btop(len)); + if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) { /* * All SOFTLOCKS are gone. Wakeup any waiting * unmappers so they can try again to unmap. 
@@ -2691,8 +2615,7 @@ uint_t vpprot, /* access allowed to object pages */ enum fault_type type, /* type of fault */ enum seg_rw rw, /* type of access at fault */ - int brkcow, /* we may need to break cow */ - int first) /* first page for this fault if 1 */ + int brkcow) /* we may need to break cow */ { struct segvn_data *svd = (struct segvn_data *)seg->s_data; page_t *pp, **ppp; @@ -2749,17 +2672,8 @@ prot = svd->prot; } - if (type == F_SOFTLOCK && svd->vp != NULL) { - mutex_enter(&freemem_lock); - if (availrmem <= tune.t_minarmem) { - mutex_exit(&freemem_lock); - return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ - } else { - availrmem--; - svd->softlockcnt++; - segvn_pages_locked++; - } - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long((ulong_t *)&svd->softlockcnt, 1); } /* @@ -2809,19 +2723,6 @@ if (lgrp_optimizations()) page_migrate(seg, addr, &pp, 1); - if (type == F_SOFTLOCK) { - if (!segvn_slock_anonpages(pp, first)) { - page_unlock(pp); - err = ENOMEM; - goto out; - } else { - mutex_enter(&freemem_lock); - svd->softlockcnt++; - segvn_pages_locked++; - mutex_exit(&freemem_lock); - } - } - if (enable_mbit_wa) { if (rw == S_WRITE) hat_setmod(pp); @@ -2981,23 +2882,6 @@ if (lgrp_optimizations()) page_migrate(seg, addr, &opp, 1); - if (type == F_SOFTLOCK && svd->vp == NULL) { - - ASSERT(opp->p_szc == 0 || - (svd->type == MAP_SHARED && - amp != NULL && amp->a_szc != 0)); - - if (!segvn_slock_anonpages(opp, first)) { - page_unlock(opp); - err = ENOMEM; - goto out; - } else { - mutex_enter(&freemem_lock); - svd->softlockcnt++; - segvn_pages_locked++; - mutex_exit(&freemem_lock); - } - } if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { if (rw == S_WRITE) hat_setmod(opp); @@ -3124,18 +3008,6 @@ page_migrate(seg, addr, &pp, 1); ASSERT(pp->p_szc == 0); - if (type == F_SOFTLOCK && svd->vp == NULL) { - if (!segvn_slock_anonpages(pp, first)) { - page_unlock(pp); - err = ENOMEM; - goto out; - } else { - mutex_enter(&freemem_lock); - svd->softlockcnt++; - segvn_pages_locked++; - mutex_exit(&freemem_lock); - } - } ASSERT(!IS_VMODSORT(pp->p_vnode)); if (enable_mbit_wa) { @@ -3158,12 +3030,8 @@ if (anon_lock) anon_array_exit(&cookie); - if (type == F_SOFTLOCK && svd->vp != NULL) { - mutex_enter(&freemem_lock); - availrmem++; - segvn_pages_locked--; - svd->softlockcnt--; - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long((ulong_t *)&svd->softlockcnt, -1); } return (FC_MAKE_ERR(err)); } @@ -3819,13 +3687,10 @@ int segvn_anypgsz = 0; -#define SEGVN_RESTORE_SOFTLOCK(type, pages) \ - if ((type) == F_SOFTLOCK) { \ - mutex_enter(&freemem_lock); \ - availrmem += (pages); \ - segvn_pages_locked -= (pages); \ - svd->softlockcnt -= (pages); \ - mutex_exit(&freemem_lock); \ +#define SEGVN_RESTORE_SOFTLOCK_VP(type, pages) \ + if ((type) == F_SOFTLOCK) { \ + atomic_add_long((ulong_t *)&(svd)->softlockcnt, \ + -(pages)); \ } #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ @@ -4032,17 +3897,8 @@ } } if (type == F_SOFTLOCK) { - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + pages) { - mutex_exit(&freemem_lock); - err = FC_MAKE_ERR(ENOMEM); - goto out; - } else { - availrmem -= pages; - segvn_pages_locked += pages; - svd->softlockcnt += pages; - } - mutex_exit(&freemem_lock); + atomic_add_long((ulong_t *)&svd->softlockcnt, + pages); } pplist = NULL; @@ -4123,7 +3979,7 @@ page_free_replacement_page(pplist); page_create_putback(pages); } - SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_RESTORE_SOFTLOCK_VP(type, pages); if (a + pgsz <= 
eaddr) { SEGVN_VMSTAT_FLTVNPAGES(19); err = FC_MAKE_ERR(ierr); @@ -4179,7 +4035,7 @@ page_free_replacement_page(pplist); page_create_putback(pages); } - SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_RESTORE_SOFTLOCK_VP(type, pages); if (szc < seg->s_szc) { SEGVN_VMSTAT_FLTVNPAGES(26); /* @@ -4226,7 +4082,7 @@ SEGVN_VMSTAT_FLTVNPAGES(28); anon_array_exit(&an_cookie); ANON_LOCK_EXIT(&->a_rwlock); - SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_RESTORE_SOFTLOCK_VP(type, pages); err = FC_MAKE_ERR(ierr); goto out; } @@ -4724,9 +4580,7 @@ ulong_t i; int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; anon_sync_obj_t cookie; - int first = 1; int adjszc_chk; - int purged = 0; int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0; ASSERT(szc != 0); @@ -4794,18 +4648,9 @@ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); } - if (type == F_SOFTLOCK && svd->vp != NULL) { - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + pages) { - mutex_exit(&freemem_lock); - err = FC_MAKE_ERR(ENOMEM); - goto error; - } else { - availrmem -= pages; - segvn_pages_locked += pages; - svd->softlockcnt += pages; - } - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long((ulong_t *)&svd->softlockcnt, + pages); } anon_array_enter(amp, aindx, &cookie); ppa_szc = (uint_t)-1; @@ -4815,13 +4660,10 @@ if (ierr != 0) { anon_array_exit(&cookie); VM_STAT_ADD(segvnvmstats.fltanpages[4]); - if (type == F_SOFTLOCK && svd->vp != NULL) { - VM_STAT_ADD(segvnvmstats.fltanpages[5]); - mutex_enter(&freemem_lock); - availrmem += pages; - segvn_pages_locked -= pages; - svd->softlockcnt -= pages; - mutex_exit(&freemem_lock); + if (type == F_SOFTLOCK) { + atomic_add_long( + (ulong_t *)&svd->softlockcnt, + -pages); } if (ierr > 0) { VM_STAT_ADD(segvnvmstats.fltanpages[6]); @@ -4845,41 +4687,6 @@ page_migrate(seg, a, ppa, pages); ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); - if (type == F_SOFTLOCK && svd->vp == NULL) { - /* - * If all pages in ppa array belong to the same - * large page call segvn_slock_anonpages() - * just for ppa[0]. - */ - for (i = 0; i < pages; i++) { - if (!segvn_slock_anonpages(ppa[i], - i == 0 && first)) { - ulong_t j; - for (j = 0; j < i; j++) { - segvn_sunlock_anonpages( - ppa[j], j == 0 && - first); - page_unlock(ppa[j]); - } - for (j = i; j < pages; j++) { - page_unlock(ppa[j]); - } - anon_array_exit(&cookie); - err = FC_MAKE_ERR(ENOMEM); - goto error; - } - if (i == 0 && ppa[0]->p_szc >= szc) { - ASSERT(!(page_pptonum(ppa[0]) & - (pages - 1))); - break; - } - } - first = 0; - mutex_enter(&freemem_lock); - svd->softlockcnt += pages; - segvn_pages_locked += pages; - mutex_exit(&freemem_lock); - } if (segtype == MAP_SHARED) { vpprot |= PROT_WRITE; @@ -4920,17 +4727,6 @@ * have relocated locked pages. */ ASSERT(ierr == -1 || ierr == -2); - /* - * For the very first relocation failure try to purge this - * segment's cache so that the relocator can obtain an - * exclusive lock on pages we want to relocate. 
- */ - if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && - svd->softlockcnt != 0) { - purged = 1; - segvn_purge(seg); - continue; - } if (segvn_anypgsz) { ASSERT(ierr == -2 || szc != 0); @@ -5613,7 +5409,7 @@ */ for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, - type, rw, brkcow, a == addr); + type, rw, brkcow); if (err) { if (amp != NULL) ANON_LOCK_EXIT(&->a_rwlock); @@ -5826,6 +5622,16 @@ */ if (svd->softlockcnt > 0) { ASSERT(svd->tr_state == SEGVN_TR_OFF); + + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + /* * Since we do have the segvn writers lock nobody can fill * the cache with entries belonging to this seg during @@ -6084,15 +5890,17 @@ if ((VPP_PROT(svp) ^ prot) & PROT_WRITE) { if (prot & PROT_WRITE) { - if (!page_addclaim(pp)) { - page_unlock(pp); - break; - } + if (!page_addclaim( + pp)) { + page_unlock(pp); + break; + } } else { - if (!page_subclaim(pp)) { - page_unlock(pp); - break; - } + if (!page_subclaim( + pp)) { + page_unlock(pp); + break; + } } } page_unlock(pp); @@ -6257,6 +6065,15 @@ */ if (svd->softlockcnt > 0) { ASSERT(svd->tr_state == SEGVN_TR_OFF); + + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + return (EAGAIN); + } + /* * Since we do have the segvn writers lock nobody can fill * the cache with entries belonging to this seg during @@ -6339,6 +6156,13 @@ } nsvd = (struct segvn_data *)nseg->s_data; if (nsvd->softlockcnt > 0) { + /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (nsvd->type == MAP_SHARED) { + return (EAGAIN); + } segvn_purge(nseg); if (nsvd->softlockcnt > 0) { return (EAGAIN); @@ -6698,6 +6522,8 @@ } ASSERT(svd->softlockcnt == 0); + ASSERT(svd->softlockcnt_sbase == 0); + ASSERT(svd->softlockcnt_send == 0); crhold(svd->cred); if (svd->vpage != NULL) { @@ -7336,11 +7162,20 @@ if (svd->softlockcnt > 0) { /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + + /* * flush all pages from seg cache * otherwise we may deadlock in swap_putpage * for B_INVAL page (4175402). * - * Even if we grab segvn WRITER's lock or segp_slock + * Even if we grab segvn WRITER's lock * here, there might be another thread which could've * successfully performed lookup/insert just before * we acquired the lock here. So, grabbing either @@ -7354,6 +7189,18 @@ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); return (EAGAIN); } + } else if (svd->type == MAP_SHARED && svd->amp != NULL && + svd->amp->a_softlockcnt > 0) { + /* + * Try to purge this amp's entries from pcache. It will + * succeed only if other segments that share the amp have no + * outstanding softlock's. 
+ */ + segvn_purge(seg); + if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } } vpp = svd->vpage; @@ -7904,8 +7751,10 @@ vp = svd->vp; off = offset; } - anon_array_exit(&cookie); - ANON_LOCK_EXIT(&->a_rwlock); + if (op != MC_LOCK || ap == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } } else { vp = svd->vp; off = offset; @@ -7933,6 +7782,11 @@ (uint_t *)NULL, pl, PAGESIZE, seg, addr, S_OTHER, svd->cred, NULL); + if (error && ap != NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + /* * If the error is EDEADLK then we must bounce * up and drop all vm subsystem locks and then @@ -8004,6 +7858,13 @@ ASSERT(!VPP_ISPPLOCK(vpp)); ret = page_pp_lock(pp, claim, 0); + if (ap != NULL) { + if (ap->an_pvp != NULL) { + anon_swap_free(ap, pp); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } if (ret == 0) { /* locking page failed */ page_unlock(pp); @@ -8146,6 +8007,14 @@ */ if (svd->softlockcnt > 0) { /* + * If this is shared segment non 0 softlockcnt + * means locked pages are still in use. + */ + if (svd->type == MAP_SHARED) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + /* * Since we do have the segvn writers lock * nobody can fill the cache with entries * belonging to this seg during the purge. @@ -8164,6 +8033,14 @@ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); return (EAGAIN); } + } else if (svd->type == MAP_SHARED && svd->amp != NULL && + svd->amp->a_softlockcnt > 0) { + /* + * Try to purge this amp's entries from pcache. It + * will succeed only if other segments that share the + * amp have no outstanding softlock's. + */ + segvn_purge(seg); } } @@ -8182,6 +8059,8 @@ return (0); } + segvn_purge(seg); + page = seg_page(seg, addr); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); anon_disclaim(amp, svd->anon_index + page, len); @@ -8623,59 +8502,289 @@ ANON_LOCK_EXIT(&->a_rwlock); } +#ifdef DEBUG +static uint32_t segvn_pglock_mtbf = 0; +#endif + +#define PCACHE_SHWLIST ((page_t *)-2) +#define NOPCACHE_SHWLIST ((page_t *)-1) + /* - * lock/unlock anon pages over a given range. Return shadow list + * Lock/Unlock anon pages over a given range. Return shadow list. This routine + * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages + * to avoid the overhead of per page locking, unlocking for subsequent IOs to + * the same parts of the segment. Currently shadow list creation is only + * supported for pure anon segments. MAP_PRIVATE segment pcache entries are + * tagged with segment pointer, starting virtual address and length. This + * approach for MAP_SHARED segments may add many pcache entries for the same + * set of pages and lead to long hash chains that decrease pcache lookup + * performance. To avoid this issue for shared segments shared anon map and + * starting anon index are used for pcache entry tagging. This allows all + * segments to share pcache entries for the same anon range and reduces pcache + * chain's length as well as memory overhead from duplicate shadow lists and + * pcache entries. + * + * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd + * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock + * part of softlockcnt accounting is done differently for private and shared + * segments. In private segment case softlock is only incremented when a new + * shadow list is created but not when an existing one is found via + * seg_plookup(). 
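/*
 * Illustrative user-space sketch (not illumos code) of the two pcache
 * tagging schemes described above: MAP_PRIVATE entries are keyed by
 * (segment, virtual address) while MAP_SHARED entries are keyed by
 * (anon map, anon byte offset), so every segment sharing the amp
 * resolves to the same pcache entry.  Struct and function names below
 * are hypothetical stand-ins, not kernel definitions.
 */
#include <stdint.h>

#define	PAGESHIFT	12
#define	MAP_PRIVATE	1
#define	MAP_SHARED	2

struct amp_tag { int unused; };
struct seg_tag { uintptr_t s_base; };
struct svd_tag { int type; struct amp_tag *amp; unsigned long anon_index; };

static void
pcache_tag(struct seg_tag *seg, struct svd_tag *svd, uintptr_t lpgaddr,
    void **ptag, uintptr_t *paddr)
{
	if (svd->type == MAP_SHARED) {
		*ptag = svd->amp;
		*paddr = (lpgaddr - seg->s_base) +
		    ((uintptr_t)svd->anon_index << PAGESHIFT);
	} else {
		*ptag = (void *)seg;
		*paddr = lpgaddr;
	}
}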
pcache entries have reference count incremented/decremented + * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0 + * reference count can be purged (and purging is needed before segment can be + * freed). When a private segment pcache entry is purged segvn_reclaim() will + * decrement softlockcnt. Since in private segment case each of its pcache + * entries only belongs to this segment we can expect that when + * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this + * segment purge will succeed and softlockcnt will drop to 0. In shared + * segment case reference count in pcache entry counts active locks from many + * different segments so we can't expect segment purging to succeed even when + * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this + * segment. To be able to determine when there're no pending pagelocks in + * shared segment case we don't rely on purging to make softlockcnt drop to 0 + * but instead softlockcnt is incremented and decremented for every + * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow + * list was created or an existing one was found. When softlockcnt drops to 0 + * this segment no longer has any claims for pcached shadow lists and the + * segment can be freed even if there're still active pcache entries + * shared by this segment anon map. Shared segment pcache entries belong to + * anon map and are typically removed when anon map is freed after all + * processes destroy the segments that use this anon map. */ static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, enum lock_type type, enum seg_rw rw) { struct segvn_data *svd = (struct segvn_data *)seg->s_data; - size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); + size_t np; + pgcnt_t adjustpages; + pgcnt_t npages; ulong_t anon_index; - uint_t protchk; + uint_t protchk = (rw == S_READ) ? PROT_READ : PROT_WRITE; uint_t error; struct anon_map *amp; + pgcnt_t anpgcnt; struct page **pplist, **pl, *pp; caddr_t a; size_t page; caddr_t lpgaddr, lpgeaddr; - pgcnt_t szc0_npages = 0; + anon_sync_obj_t cookie; + int anlock; + struct anon_map *pamp; + caddr_t paddr; + seg_preclaim_cbfunc_t preclaim_callback; + size_t pgsz; + int use_pcache; + size_t wlen; + uint_t pflags = 0; + int sftlck_sbase = 0; + int sftlck_send = 0; + +#ifdef DEBUG + if (type == L_PAGELOCK && segvn_pglock_mtbf) { + hrtime_t ts = gethrtime(); + if ((ts % segvn_pglock_mtbf) == 0) { + return (ENOTSUP); + } + if ((ts % segvn_pglock_mtbf) == 1) { + return (EFAULT); + } + } +#endif TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, "segvn_pagelock: start seg %p addr %p", seg, addr); ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); - if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { + ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + /* + * for now we only support pagelock to anon memory. We would have to + * check protections for vnode objects and call into the vnode driver. + * That's too much for a fast path. Let the fault entry point handle + * it. 
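/*
 * Sketch of the DEBUG-only mean-time-between-failures injection shown in
 * the #ifdef DEBUG block above, recast as a self-contained user-space
 * helper (hypothetical names; assumes POSIX clock_gettime()).  With a
 * non-zero mtbf roughly two out of every mtbf calls fail artificially,
 * which forces callers to exercise their slow-path fallbacks.
 */
#include <errno.h>
#include <stdint.h>
#include <time.h>

static uint32_t pglock_mtbf;		/* 0 disables injection */

static int
pglock_inject_error(void)
{
	struct timespec ts;
	uint64_t now;

	if (pglock_mtbf == 0)
		return (0);
	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	now = (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
	if (now % pglock_mtbf == 0)
		return (ENOTSUP);	/* caller falls back to the fault path */
	if (now % pglock_mtbf == 1)
		return (EFAULT);
	return (0);
}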
+ */ + if (svd->vp != NULL) { + if (type == L_PAGELOCK) { + error = ENOTSUP; + goto out; + } + panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL"); + } + if ((amp = svd->amp) == NULL) { + if (type == L_PAGELOCK) { + error = EFAULT; + goto out; + } + panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL"); + } + if (rw != S_READ && rw != S_WRITE) { + if (type == L_PAGELOCK) { + error = ENOTSUP; + goto out; + } + panic("segvn_pagelock(L_PAGEUNLOCK): bad rw"); + } + + if (seg->s_szc != 0) { /* * We are adjusting the pagelock region to the large page size * boundary because the unlocked part of a large page cannot * be freed anyway unless all constituent pages of a large - * page are locked. Therefore this adjustment allows us to - * decrement availrmem by the right value (note we don't want - * to just decrement availrem by the large page size without - * adjusting addr and len because then we may end up - * decrementing availrmem by large page size for every - * constituent page locked by a new as_pagelock call). - * as_pageunlock caller must always match as_pagelock call's - * addr and len. + * page are locked. Bigger regions reduce pcache chain length + * and improve lookup performance. The tradeoff is that the + * very first segvn_pagelock() call for a given page is more + * expensive if only 1 page_t is needed for IO. This is only + * an issue if pcache entry doesn't get reused by several + * subsequent calls. We optimize here for the case when pcache + * is heavily used by repeated IOs to the same address range. * * Note segment's page size cannot change while we are holding * as lock. And then it cannot change while softlockcnt is * not 0. This will allow us to correctly recalculate large - * page size region for the matching pageunlock/reclaim call. + * page size region for the matching pageunlock/reclaim call + * since as_pageunlock() caller must always match + * as_pagelock() call's addr and len. * - * for pageunlock *ppp points to the pointer of page_t that + * For pageunlock *ppp points to the pointer of page_t that * corresponds to the real unadjusted start address. Similar * for pagelock *ppp must point to the pointer of page_t that * corresponds to the real unadjusted start address. */ - size_t pgsz = page_get_pagesize(seg->s_szc); + pgsz = page_get_pagesize(seg->s_szc); CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); - adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; + adjustpages = btop((uintptr_t)(addr - lpgaddr)); + } else if (len < segvn_pglock_comb_thrshld) { + lpgaddr = addr; + lpgeaddr = addr + len; + adjustpages = 0; + pgsz = PAGESIZE; + } else { + /* + * Align the address range of large enough requests to allow + * combining of different shadow lists into 1 to reduce memory + * overhead from potentially overlapping large shadow lists + * (worst case is we have a 1MB IO into buffers with start + * addresses separated by 4K). Alignment is only possible if + * padded chunks have sufficient access permissions. Note + * permissions won't change between L_PAGELOCK and + * L_PAGEUNLOCK calls since non 0 softlockcnt will force + * segvn_setprot() to wait until softlockcnt drops to 0. This + * allows us to determine in L_PAGEUNLOCK the same range we + * computed in L_PAGELOCK. + * + * If alignment is limited by segment ends set + * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when + * these flags are set bump softlockcnt_sbase/softlockcnt_send + * per segment counters. 
In L_PAGEUNLOCK case decrease + * softlockcnt_sbase/softlockcnt_send counters if + * sftlck_sbase/sftlck_send flags are set. When + * softlockcnt_sbase/softlockcnt_send are non 0 + * segvn_concat()/segvn_extend_prev()/segvn_extend_next() + * won't merge the segments. This restriction combined with + * restriction on segment unmapping and splitting for segments + * that have non 0 softlockcnt allows L_PAGEUNLOCK to + * correctly determine the same range that was previously + * locked by matching L_PAGELOCK. + */ + pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16); + pgsz = PAGESIZE; + if (svd->type == MAP_PRIVATE) { + lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr, + segvn_pglock_comb_balign); + if (lpgaddr < seg->s_base) { + lpgaddr = seg->s_base; + sftlck_sbase = 1; + } + } else { + ulong_t aix = svd->anon_index + seg_page(seg, addr); + ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign); + if (aaix < svd->anon_index) { + lpgaddr = seg->s_base; + sftlck_sbase = 1; + } else { + lpgaddr = addr - ptob(aix - aaix); + ASSERT(lpgaddr >= seg->s_base); + } + } + if (svd->pageprot && lpgaddr != addr) { + struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)]; + struct vpage *evp = &svd->vpage[seg_page(seg, addr)]; + while (vp < evp) { + if ((VPP_PROT(vp) & protchk) == 0) { + break; + } + vp++; + } + if (vp < evp) { + lpgaddr = addr; + pflags = 0; + } + } + lpgeaddr = addr + len; + if (pflags) { + if (svd->type == MAP_PRIVATE) { + lpgeaddr = (caddr_t)P2ROUNDUP( + (uintptr_t)lpgeaddr, + segvn_pglock_comb_balign); + } else { + ulong_t aix = svd->anon_index + + seg_page(seg, lpgeaddr); + ulong_t aaix = P2ROUNDUP(aix, + segvn_pglock_comb_palign); + if (aaix < aix) { + lpgeaddr = 0; + } else { + lpgeaddr += ptob(aaix - aix); + } + } + if (lpgeaddr == 0 || + lpgeaddr > seg->s_base + seg->s_size) { + lpgeaddr = seg->s_base + seg->s_size; + sftlck_send = 1; + } + } + if (svd->pageprot && lpgeaddr != addr + len) { + struct vpage *vp; + struct vpage *evp; + + vp = &svd->vpage[seg_page(seg, addr + len)]; + evp = &svd->vpage[seg_page(seg, lpgeaddr)]; + + while (vp < evp) { + if ((VPP_PROT(vp) & protchk) == 0) { + break; + } + vp++; + } + if (vp < evp) { + lpgeaddr = addr + len; + } + } + adjustpages = btop((uintptr_t)(addr - lpgaddr)); + } + + /* + * For MAP_SHARED segments we create pcache entries tagged by amp and + * anon index so that we can share pcache entries with other segments + * that map this amp. For private segments pcache entries are tagged + * with segment and virtual address. + */ + if (svd->type == MAP_SHARED) { + pamp = amp; + paddr = (caddr_t)((lpgaddr - seg->s_base) + + ptob(svd->anon_index)); + preclaim_callback = shamp_reclaim; + } else { + pamp = NULL; + paddr = lpgaddr; + preclaim_callback = segvn_reclaim; } if (type == L_PAGEUNLOCK) { + VM_STAT_ADD(segvnvmstats.pagelock[0]); /* * update hat ref bits for /proc. We need to make sure @@ -8694,13 +8803,50 @@ } } } - SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); - if (seg->s_szc != 0) { - VM_STAT_ADD(segvnvmstats.pagelock[0]); - seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, - *ppp - adjustpages, rw, segvn_reclaim); + + /* + * Check the shadow list entry after the last page used in + * this IO request. If it's NOPCACHE_SHWLIST the shadow list + * was not inserted into pcache and is not large page + * adjusted. In this case call reclaim callback directly and + * don't adjust the shadow list start and size for large + * pages. 
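/*
 * Sketch (user-space, hypothetical names) of the sentinel convention
 * described above: the shadow list is allocated with one extra slot and
 * the last slot records whether the list was large-page adjusted and
 * inserted into pcache (PCACHE_SHWLIST) or built as-is for a one-shot
 * reclaim (NOPCACHE_SHWLIST), so the unlock side can tell which address
 * range to use without any extra bookkeeping structure.
 */
#include <stdlib.h>

typedef struct page page_t;

#define	PCACHE_SHWLIST		((page_t *)-2)
#define	NOPCACHE_SHWLIST	((page_t *)-1)

static page_t **
shadow_list_alloc(size_t npages, int use_pcache)
{
	page_t **pl = calloc(npages + 1, sizeof (page_t *));

	if (pl != NULL)
		pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST;
	return (pl);
}

static int
shadow_list_was_cached(page_t **pl, size_t npages)
{
	return (pl[npages] == PCACHE_SHWLIST);
}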
+ */ + npages = btop(len); + if ((*ppp)[npages] == NOPCACHE_SHWLIST) { + void *ptag; + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + ptag = (void *)pamp; + paddr = (caddr_t)((addr - seg->s_base) + + ptob(svd->anon_index)); + } else { + ptag = (void *)seg; + paddr = addr; + } + (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0); } else { - seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); + ASSERT((*ppp)[npages] == PCACHE_SHWLIST || + IS_SWAPFSVP((*ppp)[npages]->p_vnode)); + len = lpgeaddr - lpgaddr; + npages = btop(len); + seg_pinactive(seg, pamp, paddr, len, + *ppp - adjustpages, rw, pflags, preclaim_callback); + } + + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + ASSERT(svd->softlockcnt >= npages); + atomic_add_long((ulong_t *)&svd->softlockcnt, -npages); + } + + if (sftlck_sbase) { + ASSERT(svd->softlockcnt_sbase > 0); + atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, -1); + } + if (sftlck_send) { + ASSERT(svd->softlockcnt_send > 0); + atomic_add_long((ulong_t *)&svd->softlockcnt_send, -1); } /* @@ -8711,77 +8857,97 @@ * raw async i/o is still in progress or where a thread * exits on data fault in a multithreaded application. */ - if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { - /* - * Even if we grab segvn WRITER's lock or segp_slock - * here, there might be another thread which could've - * successfully performed lookup/insert just before - * we acquired the lock here. So, grabbing either - * lock here is of not much use. Until we devise - * a strategy at upper layers to solve the - * synchronization issues completely, we expect - * applications to handle this appropriately. - */ - segvn_purge(seg); + if (AS_ISUNMAPWAIT(seg->s_as)) { + if (svd->softlockcnt == 0) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } else if (pamp == NULL) { + /* + * softlockcnt is not 0 and this is a + * MAP_PRIVATE segment. Try to purge its + * pcache entries to reduce softlockcnt. + * If it drops to 0 segvn_reclaim() + * will wake up a thread waiting on + * unmapwait flag. + * + * We don't purge MAP_SHARED segments with non + * 0 softlockcnt since IO is still in progress + * for such segments. + */ + ASSERT(svd->type == MAP_PRIVATE); + segvn_purge(seg); + } } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, "segvn_pagelock: unlock seg %p addr %p", seg, addr); return (0); - } else if (type == L_PAGERECLAIM) { - VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); - SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); - (void) segvn_reclaim(seg, addr, len, *ppp, rw); - SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, - "segvn_pagelock: reclaim seg %p addr %p", seg, addr); - return (0); - } - - if (seg->s_szc != 0) { - VM_STAT_ADD(segvnvmstats.pagelock[2]); - addr = lpgaddr; - len = lpgeaddr - lpgaddr; - npages = (len >> PAGESHIFT); - } - - /* - * for now we only support pagelock to anon memory. We've to check - * protections for vnode objects and call into the vnode driver. - * That's too much for a fast path. Let the fault entry point handle it. - */ - if (svd->vp != NULL) { - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, - "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); - *ppp = NULL; - return (ENOTSUP); - } - - /* - * if anonmap is not yet created, let the fault entry point populate it - * with anon ptrs. 
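/*
 * Worked user-space example of the P2ALIGN/P2ROUNDUP arithmetic that
 * segvn_pagelock() uses above to widen small requests to a common
 * combining boundary so that overlapping IOs share one pcache entry.
 * The boundary and names here are hypothetical; clamping to the segment
 * bounds corresponds to the sftlck_sbase/sftlck_send cases in the real
 * code.
 */
#include <stddef.h>
#include <stdint.h>

#define	P2ALIGN(x, a)	((x) & ~((uintptr_t)(a) - 1))
#define	P2ROUNDUP(x, a)	(-(-(uintptr_t)(x) & -(uintptr_t)(a)))

static void
combine_range(uintptr_t addr, size_t len, uintptr_t seg_base, size_t seg_size,
    uintptr_t comb_align, uintptr_t *lpgaddr, uintptr_t *lpgeaddr)
{
	*lpgaddr = P2ALIGN(addr, comb_align);
	if (*lpgaddr < seg_base)
		*lpgaddr = seg_base;			/* sftlck_sbase case */

	*lpgeaddr = P2ROUNDUP(addr + len, comb_align);
	if (*lpgeaddr == 0 || *lpgeaddr > seg_base + seg_size)
		*lpgeaddr = seg_base + seg_size;	/* sftlck_send case */
}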
- */ - if ((amp = svd->amp) == NULL) { - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, - "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); - *ppp = NULL; - return (EFAULT); - } - - SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); - - /* - * we acquire segp_slock to prevent duplicate entries - * in seg_pcache - */ - mutex_enter(&svd->segp_slock); + } + + /* The L_PAGELOCK case ... */ + + VM_STAT_ADD(segvnvmstats.pagelock[1]); + + /* + * For MAP_SHARED segments we have to check protections before + * seg_plookup() since pcache entries may be shared by many segments + * with potentially different page protections. + */ + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + if (svd->pageprot == 0) { + if ((svd->prot & protchk) == 0) { + error = EACCES; + goto out; + } + } else { + /* + * check page protections + */ + caddr_t ea; + + if (seg->s_szc) { + a = lpgaddr; + ea = lpgeaddr; + } else { + a = addr; + ea = addr + len; + } + for (; a < ea; a += pgsz) { + struct vpage *vp; + + ASSERT(seg->s_szc == 0 || + sameprot(seg, a, pgsz)); + vp = &svd->vpage[seg_page(seg, a)]; + if ((VPP_PROT(vp) & protchk) == 0) { + error = EACCES; + goto out; + } + } + } + } /* * try to find pages in segment page cache */ - pplist = seg_plookup(seg, addr, len, rw); + pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags); if (pplist != NULL) { - mutex_exit(&svd->segp_slock); + if (pamp != NULL) { + npages = btop((uintptr_t)(lpgeaddr - lpgaddr)); + ASSERT(svd->type == MAP_SHARED); + atomic_add_long((ulong_t *)&svd->softlockcnt, + npages); + } + if (sftlck_sbase) { + atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1); + } + if (sftlck_send) { + atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1); + } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); *ppp = pplist + adjustpages; TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, @@ -8789,145 +8955,211 @@ return (0); } - if (rw == S_READ) { - protchk = PROT_READ; - } else { - protchk = PROT_WRITE; - } - - if (svd->pageprot == 0) { - if ((svd->prot & protchk) == 0) { - mutex_exit(&svd->segp_slock); - error = EFAULT; - goto out; - } - } else { - /* - * check page protections - */ - for (a = addr; a < addr + len; a += PAGESIZE) { - struct vpage *vp; - - vp = &svd->vpage[seg_page(seg, a)]; - if ((VPP_PROT(vp) & protchk) == 0) { - mutex_exit(&svd->segp_slock); - error = EFAULT; + /* + * For MAP_SHARED segments we already verified above that segment + * protections allow this pagelock operation. + */ + if (pamp == NULL) { + ASSERT(svd->type == MAP_PRIVATE); + if (svd->pageprot == 0) { + if ((svd->prot & protchk) == 0) { + error = EACCES; goto out; } - } - } - - /* - * Avoid per page overhead of segvn_slock_anonpages() for small - * pages. For large pages segvn_slock_anonpages() only does real - * work once per large page. The tradeoff is that we may decrement - * availrmem more than once for the same page but this is ok - * for small pages. 
- */ - if (seg->s_szc == 0) { - mutex_enter(&freemem_lock); - if (availrmem < tune.t_minarmem + npages) { - mutex_exit(&freemem_lock); - mutex_exit(&svd->segp_slock); - error = ENOMEM; - goto out; - } - availrmem -= npages; - mutex_exit(&freemem_lock); - } - - pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); + if (svd->prot & PROT_WRITE) { + wlen = lpgeaddr - lpgaddr; + } else { + wlen = 0; + ASSERT(rw == S_READ); + } + } else { + int wcont = 1; + /* + * check page protections + */ + for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) { + struct vpage *vp; + + ASSERT(seg->s_szc == 0 || + sameprot(seg, a, pgsz)); + vp = &svd->vpage[seg_page(seg, a)]; + if ((VPP_PROT(vp) & protchk) == 0) { + error = EACCES; + goto out; + } + if (wcont && (VPP_PROT(vp) & PROT_WRITE)) { + wlen += pgsz; + } else { + wcont = 0; + ASSERT(rw == S_READ); + } + } + } + ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr); + ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr); + } + + /* + * Only build large page adjusted shadow list if we expect to insert + * it into pcache. For large enough pages it's a big overhead to + * create a shadow list of the entire large page. But this overhead + * should be amortized over repeated pcache hits on subsequent reuse + * of this shadow list (IO into any range within this shadow list will + * find it in pcache since we large page align the request for pcache + * lookups). pcache performance is improved with bigger shadow lists + * as it reduces the time to pcache the entire big segment and reduces + * pcache chain length. + */ + if (seg_pinsert_check(seg, pamp, paddr, + lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) { + addr = lpgaddr; + len = lpgeaddr - lpgaddr; + use_pcache = 1; + } else { + use_pcache = 0; + /* + * Since this entry will not be inserted into the pcache, we + * will not do any adjustments to the starting address or + * size of the memory to be locked. + */ + adjustpages = 0; + } + npages = btop(len); + + pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP); pl = pplist; *ppp = pplist + adjustpages; + /* + * If use_pcache is 0 this shadow list is not large page adjusted. + * Record this info in the last entry of shadow array so that + * L_PAGEUNLOCK can determine if it should large page adjust the + * address range to find the real range that was locked. + */ + pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST; page = seg_page(seg, addr); anon_index = svd->anon_index + page; + anlock = 0; ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + ASSERT(amp->a_szc >= seg->s_szc); + anpgcnt = page_get_pagecnt(amp->a_szc); for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { struct anon *ap; struct vnode *vp; u_offset_t off; - anon_sync_obj_t cookie; - - anon_array_enter(amp, anon_index, &cookie); + + /* + * Lock and unlock anon array only once per large page. + * anon_array_enter() locks the root anon slot according to + * a_szc which can't change while anon map is locked. We lock + * anon the first time through this loop and each time we + * reach anon index that corresponds to a root of a large + * page. + */ + if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) { + ASSERT(anlock == 0); + anon_array_enter(amp, anon_index, &cookie); + anlock = 1; + } ap = anon_get_ptr(amp->ahp, anon_index); - if (ap == NULL) { + + /* + * We must never use seg_pcache for COW pages + * because we might end up with original page still + * lying in seg_pcache even after private page is + * created. 
This leads to data corruption as + * aio_write refers to the page still in cache + * while all other accesses refer to the private + * page. + */ + if (ap == NULL || ap->an_refcnt != 1) { + struct vpage *vpage; + + if (seg->s_szc) { + error = EFAULT; + break; + } + if (svd->vpage != NULL) { + vpage = &svd->vpage[seg_page(seg, a)]; + } else { + vpage = NULL; + } + ASSERT(anlock); anon_array_exit(&cookie); - break; - } else { - /* - * We must never use seg_pcache for COW pages - * because we might end up with original page still - * lying in seg_pcache even after private page is - * created. This leads to data corruption as - * aio_write refers to the page still in cache - * while all other accesses refer to the private - * page. - */ - if (ap->an_refcnt != 1) { - anon_array_exit(&cookie); + anlock = 0; + pp = NULL; + error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0, + vpage, &pp, 0, F_INVAL, rw, 1); + if (error) { + error = fc_decode(error); + break; + } + anon_array_enter(amp, anon_index, &cookie); + anlock = 1; + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL || ap->an_refcnt != 1) { + error = EFAULT; break; } } swap_xlate(ap, &vp, &off); - anon_array_exit(&cookie); - pp = page_lookup_nowait(vp, off, SE_SHARED); if (pp == NULL) { + error = EFAULT; break; } - if (seg->s_szc != 0 || pp->p_szc != 0) { - if (!segvn_slock_anonpages(pp, a == addr)) { - page_unlock(pp); - break; - } - } else { - szc0_npages++; + if (ap->an_pvp != NULL) { + anon_swap_free(ap, pp); + } + /* + * Unlock anon if this is the last slot in a large page. + */ + if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) { + ASSERT(anlock); + anon_array_exit(&cookie); + anlock = 0; } *pplist++ = pp; } + if (anlock) { /* Ensure the lock is dropped */ + anon_array_exit(&cookie); + } ANON_LOCK_EXIT(&->a_rwlock); - ASSERT(npages >= szc0_npages); - if (a >= addr + len) { - mutex_enter(&freemem_lock); - if (seg->s_szc == 0 && npages != szc0_npages) { - ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); - availrmem += (npages - szc0_npages); - } - svd->softlockcnt += npages; - segvn_pages_locked += npages; - mutex_exit(&freemem_lock); - (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, - segvn_reclaim); - mutex_exit(&svd->segp_slock); + atomic_add_long((ulong_t *)&svd->softlockcnt, npages); + if (pamp != NULL) { + ASSERT(svd->type == MAP_SHARED); + atomic_add_long((ulong_t *)&pamp->a_softlockcnt, + npages); + wlen = len; + } + if (sftlck_sbase) { + atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1); + } + if (sftlck_send) { + atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1); + } + if (use_pcache) { + (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl, + rw, pflags, preclaim_callback); + } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, "segvn_pagelock: cache fill seg %p addr %p", seg, addr); return (0); } - mutex_exit(&svd->segp_slock); - if (seg->s_szc == 0) { - mutex_enter(&freemem_lock); - availrmem += npages; - mutex_exit(&freemem_lock); - } - error = EFAULT; pplist = pl; np = ((uintptr_t)(a - addr)) >> PAGESHIFT; while (np > (uint_t)0) { ASSERT(PAGE_LOCKED(*pplist)); - if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { - segvn_sunlock_anonpages(*pplist, pplist == pl); - } page_unlock(*pplist); np--; pplist++; } - kmem_free(pl, sizeof (page_t *) * npages); + kmem_free(pl, sizeof (page_t *) * (npages + 1)); out: SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); *ppp = NULL; @@ -8942,34 +9174,55 @@ static void segvn_purge(struct seg *seg) { - seg_ppurge(seg); + struct 
segvn_data *svd = (struct segvn_data *)seg->s_data; + + /* + * pcache is only used by pure anon segments. + */ + if (svd->amp == NULL || svd->vp != NULL) { + return; + } + + /* + * For MAP_SHARED segments non 0 segment's softlockcnt means + * active IO is still in progress via this segment. So we only + * purge MAP_SHARED segments when their softlockcnt is 0. + */ + if (svd->type == MAP_PRIVATE) { + if (svd->softlockcnt) { + seg_ppurge(seg, NULL, 0); + } + } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) { + seg_ppurge(seg, svd->amp, 0); + } } +/* + * If async argument is not 0 we are called from pcache async thread and don't + * hold AS lock. + */ + +/*ARGSUSED*/ static int -segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, - enum seg_rw rw) +segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw, int async) { + struct seg *seg = (struct seg *)ptag; struct segvn_data *svd = (struct segvn_data *)seg->s_data; pgcnt_t np, npages; struct page **pl; - pgcnt_t szc0_npages = 0; - -#ifdef lint - addr = addr; -#endif - - npages = np = (len >> PAGESHIFT); + + npages = np = btop(len); ASSERT(npages); - pl = pplist; - if (seg->s_szc != 0) { - size_t pgsz = page_get_pagesize(seg->s_szc); - if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { - panic("segvn_reclaim: unaligned addr or len"); - /*NOTREACHED*/ - } - } ASSERT(svd->vp == NULL && svd->amp != NULL); + ASSERT(svd->softlockcnt >= npages); + ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + pl = pplist; + + ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); + ASSERT(!async || pl[np] == PCACHE_SHWLIST); while (np > (uint_t)0) { if (rw == S_WRITE) { @@ -8977,27 +9230,41 @@ } else { hat_setref(*pplist); } - if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { - segvn_sunlock_anonpages(*pplist, pplist == pl); - } else { - szc0_npages++; - } page_unlock(*pplist); np--; pplist++; } - kmem_free(pl, sizeof (page_t *) * npages); - - mutex_enter(&freemem_lock); - segvn_pages_locked -= npages; - svd->softlockcnt -= npages; - if (szc0_npages != 0) { - availrmem += szc0_npages; - } - mutex_exit(&freemem_lock); - if (svd->softlockcnt <= 0) { - if (AS_ISUNMAPWAIT(seg->s_as)) { + + kmem_free(pl, sizeof (page_t *) * (npages + 1)); + + /* + * If we are pcache async thread we don't hold AS lock. This means if + * softlockcnt drops to 0 after the decrement below address space may + * get freed. We can't allow it since after softlock derement to 0 we + * still need to access as structure for possible wakeup of unmap + * waiters. To prevent the disappearance of as we take this segment + * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to + * make sure this routine completes before segment is freed. + * + * The second complication we have to deal with in async case is a + * possibility of missed wake up of unmap wait thread. When we don't + * hold as lock here we may take a_contents lock before unmap wait + * thread that was first to see softlockcnt was still not 0. As a + * result we'll fail to wake up an unmap wait thread. To avoid this + * race we set nounmapwait flag in as structure if we drop softlockcnt + * to 0 when we were called by pcache async thread. unmapwait thread + * will not block if this flag is set. 
+ */ + if (async) { + mutex_enter(&svd->segfree_syncmtx); + } + + if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) { + if (async || AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); + if (async) { + AS_SETNOUNMAPWAIT(seg->s_as); + } if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); @@ -9005,8 +9272,59 @@ mutex_exit(&seg->s_as->a_contents); } } + + if (async) { + mutex_exit(&svd->segfree_syncmtx); + } return (0); } + +/*ARGSUSED*/ +static int +shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw, int async) +{ + amp_t *amp = (amp_t *)ptag; + pgcnt_t np, npages; + struct page **pl; + + npages = np = btop(len); + ASSERT(npages); + ASSERT(amp->a_softlockcnt >= npages); + + pl = pplist; + + ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); + ASSERT(!async || pl[np] == PCACHE_SHWLIST); + + while (np > (uint_t)0) { + if (rw == S_WRITE) { + hat_setrefmod(*pplist); + } else { + hat_setref(*pplist); + } + page_unlock(*pplist); + np--; + pplist++; + } + + kmem_free(pl, sizeof (page_t *) * (npages + 1)); + + /* + * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt + * drops to 0. anon map can't be freed until a_softlockcnt drops to 0 + * and anonmap_purge() acquires a_purgemtx. + */ + mutex_enter(&->a_purgemtx); + if (!atomic_add_long_nv((ulong_t *)&->a_softlockcnt, -npages) && + amp->a_purgewait) { + amp->a_purgewait = 0; + cv_broadcast(&->a_purgecv); + } + mutex_exit(&->a_purgemtx); + return (0); +} + /* * get a memory ID for an addr in a given segment *
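A minimal user-space sketch of the reclaim-side handshake used by
segvn_reclaim()/shamp_reclaim() above: the lock count is dropped
atomically, and only the thread whose decrement reaches zero takes the
mutex and broadcasts to any thread waiting for all locked pages to be
released.  This illustrates the pattern under pthreads with hypothetical
names; it is not the kernel locking itself.

#include <pthread.h>
#include <stdatomic.h>

static atomic_long softlockcnt;
static pthread_mutex_t contents_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unmap_cv = PTHREAD_COND_INITIALIZER;
static int unmap_waiting;

/* Called once per released shadow list; npages pages were just unlocked. */
static void
reclaim_release(long npages)
{
	if (atomic_fetch_sub(&softlockcnt, npages) == npages) {
		pthread_mutex_lock(&contents_mtx);
		if (unmap_waiting) {
			unmap_waiting = 0;
			pthread_cond_broadcast(&unmap_cv);
		}
		pthread_mutex_unlock(&contents_mtx);
	}
}

/* A caller that wants to unmap blocks here until the count drains to 0. */
static void
wait_for_unlock(void)
{
	pthread_mutex_lock(&contents_mtx);
	while (atomic_load(&softlockcnt) != 0) {
		unmap_waiting = 1;
		pthread_cond_wait(&unmap_cv, &contents_mtx);
	}
	pthread_mutex_unlock(&contents_mtx);
}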
--- a/usr/src/uts/common/vm/seg_vn.h Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/seg_vn.h Thu May 22 22:23:49 2008 -0700 @@ -86,7 +86,7 @@ */ typedef struct segvn_data { krwlock_t lock; /* protect segvn_data and vpage array */ - kmutex_t segp_slock; /* serialize insertions into seg_pcache */ + kmutex_t segfree_syncmtx; /* barrier lock for segvn_free() */ uchar_t pageprot; /* true if per page protections present */ uchar_t prot; /* current segment prot if pageprot == 0 */ uchar_t maxprot; /* maximum segment protections */ @@ -101,7 +101,7 @@ uchar_t advice; /* madvise flags for segment */ uchar_t pageadvice; /* true if per page advice set */ ushort_t flags; /* flags - from sys/mman.h */ - ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */ + spgcnt_t softlockcnt; /* # of pages SOFTLOCKED in seg */ lgrp_mem_policy_info_t policy_info; /* memory allocation policy */ hat_region_cookie_t rcookie; /* region for hat calls */ lgrp_mem_policy_info_t tr_policy_info; /* memory allocation for TR */ @@ -110,6 +110,8 @@ struct segvn_data *svn_trprev; /* textrepl list prev link */ int tr_state; /* TR (text replication) state */ uchar_t pageswap; /* true if per page swap accounting is set */ + spgcnt_t softlockcnt_sbase; /* # of softlocks for seg start addr */ + spgcnt_t softlockcnt_send; /* # of softlocks for seg end addr */ } segvn_data_t; #ifdef _KERNEL
--- a/usr/src/uts/common/vm/vm_anon.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_anon.c Thu May 22 22:23:49 2008 -0700 @@ -106,6 +106,7 @@ #include <sys/sysmacros.h> #include <sys/bitmap.h> #include <sys/vmsystm.h> +#include <sys/tuneable.h> #include <sys/debug.h> #include <sys/fs/swapnode.h> #include <sys/tnf_probe.h> @@ -156,7 +157,6 @@ } anonvmstats; #endif /* VM_STATS */ - /*ARGSUSED*/ static int anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) @@ -164,6 +164,9 @@ struct anon_map *amp = buf; rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); + cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); + mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -174,6 +177,9 @@ struct anon_map *amp = buf; rw_destroy(&->a_rwlock); + cv_destroy(&->a_purgecv); + mutex_destroy(&->a_pmtx); + mutex_destroy(&->a_purgemtx); } kmutex_t anonhash_lock[AH_LOCK_SIZE]; @@ -870,6 +876,7 @@ mutex_enter(&anoninfo_lock); ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + /* * If some of this reservation belonged to swapfs * give it back to availrmem. @@ -944,6 +951,48 @@ } /* + * Called for pages locked in memory via softlock/pagelock/mlock to make sure + * such pages don't consume any physical swap resources needed for swapping + * unlocked pages. + */ +void +anon_swap_free(struct anon *ap, page_t *pp) +{ + kmutex_t *ahm; + + ASSERT(ap != NULL); + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(pp->p_vnode != NULL); + ASSERT(IS_SWAPFSVP(pp->p_vnode)); + ASSERT(ap->an_refcnt != 0); + ASSERT(pp->p_vnode == ap->an_vp); + ASSERT(pp->p_offset == ap->an_off); + + if (ap->an_pvp == NULL) + return; + + page_io_lock(pp); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + + ASSERT(ap->an_refcnt != 0); + ASSERT(pp->p_vnode == ap->an_vp); + ASSERT(pp->p_offset == ap->an_off); + + if (ap->an_pvp != NULL) { + swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + mutex_exit(ahm); + hat_setmod(pp); + } else { + mutex_exit(ahm); + } + page_io_unlock(pp); +} + +/* * Decrement the reference count of an anon page. * If reference count goes to zero, free it and * its associated page (if any). @@ -3154,7 +3203,7 @@ ulong_t sidx_aligned; ulong_t eidx_aligned; - ASSERT(RW_WRITE_HELD(&->a_rwlock)); + ASSERT(ANON_WRITE_HELD(&->a_rwlock)); ASSERT(amp->refcnt <= 1); ASSERT(amp->a_szc > 0); ASSERT(eidx <= ahp->size); @@ -3205,6 +3254,53 @@ } /* + * This routine should be called with amp's writer lock when there're no other + * users of amp. All pcache entries of this amp must have been already + * inactivated. We must not drop a_rwlock here to prevent new users from + * attaching to this amp. + */ +void +anonmap_purge(struct anon_map *amp) +{ + ASSERT(ANON_WRITE_HELD(&->a_rwlock)); + ASSERT(amp->refcnt <= 1); + + if (amp->a_softlockcnt != 0) { + seg_ppurge(NULL, amp, 0); + } + + /* + * Since all pcache entries were already inactive before this routine + * was called seg_ppurge() couldn't return while there're still + * entries that can be found via the list anchored at a_phead. So we + * can assert this list is empty now. a_softlockcnt may be still non 0 + * if asynchronous thread that manages pcache already removed pcache + * entries but hasn't unlocked the pages yet. If a_softlockcnt is non + * 0 we just wait on a_purgecv for shamp_reclaim() to finish. 
Even if + * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map + * before shamp_reclaim() is done with it. a_purgemtx also taken by + * shamp_reclaim() while a_softlockcnt was still not 0 acts as a + * barrier that prevents anonmap_purge() to complete while + * shamp_reclaim() may still be referencing this amp. + */ + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); + + mutex_enter(&->a_purgemtx); + while (amp->a_softlockcnt != 0) { + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); + amp->a_purgewait = 1; + cv_wait(&->a_purgecv, &->a_purgemtx); + } + mutex_exit(&->a_purgemtx); + + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); + ASSERT(amp->a_softlockcnt == 0); +} + +/* * Allocate and initialize an anon_map structure for seg * associating the given swap reservation with the new anon_map. */ @@ -3232,14 +3328,22 @@ amp->locality = 0; amp->a_szc = 0; amp->a_sp = NULL; + amp->a_softlockcnt = 0; + amp->a_purgewait = 0; + amp->a_phead.p_lnext = &->a_phead; + amp->a_phead.p_lprev = &->a_phead; + return (amp); } void anonmap_free(struct anon_map *amp) { - ASSERT(amp->ahp); + ASSERT(amp->ahp != NULL); ASSERT(amp->refcnt == 0); + ASSERT(amp->a_softlockcnt == 0); + ASSERT(amp->a_phead.p_lnext == &->a_phead); + ASSERT(amp->a_phead.p_lprev == &->a_phead); lgrp_shm_policy_fini(amp, NULL); anon_release(amp->ahp, btopr(amp->size));
--- a/usr/src/uts/common/vm/vm_as.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_as.c Thu May 22 22:23:49 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -715,12 +715,13 @@ int err; next = AS_SEGNEXT(as, seg); +retry: err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); if (err == EAGAIN) { mutex_enter(&as->a_contents); if (as->a_callbacks) { AS_LOCK_EXIT(as, &as->a_lock); - } else { + } else if (!AS_ISNOUNMAPWAIT(as)) { /* * Memory is currently locked. Wait for a * cv_signal that it has been unlocked, then @@ -732,6 +733,20 @@ AS_LOCK_EXIT(as, &as->a_lock); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto top; @@ -1193,6 +1208,7 @@ ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; +retry: error = SEGOP_SETPROT(seg, raddr, ssize, prot); if (error == IE_NOMEM) { @@ -1254,13 +1270,27 @@ seg->s_base, seg->s_size))) { AS_LOCK_EXIT(as, &as->a_lock); as_execute_callback(as, cb, AS_SETPROT_EVENT); - } else { + } else if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as, &as->a_lock); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto setprot_top; @@ -1385,6 +1415,7 @@ */ seg_next = AS_SEGNEXT(as, seg); +retry: err = SEGOP_UNMAP(seg, raddr, ssize); if (err == EAGAIN) { /* @@ -1419,25 +1450,37 @@ * either there were no callbacks for this event * or they were already in progress. */ - as_setwatch(as); mutex_enter(&as->a_contents); if (as->a_callbacks && (cb = as_find_callback(as, AS_UNMAP_EVENT, seg->s_base, seg->s_size))) { AS_LOCK_EXIT(as, &as->a_lock); as_execute_callback(as, cb, AS_UNMAP_EVENT); - } else { + } else if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as, &as->a_lock); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. 
+ */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto top; } else if (err == IE_RETRY) { - as_setwatch(as); AS_LOCK_EXIT(as, &as->a_lock); goto top; } else if (err) { @@ -2539,6 +2582,167 @@ } /* + * Pagelock pages from a range that spans more than 1 segment. Obtain shadow + * lists from each segment and copy them to one contiguous shadow list (plist) + * as expected by the caller. Save pointers to per segment shadow lists at + * the tail of plist so that they can be used during as_pageunlock(). + */ +static int +as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, + caddr_t addr, size_t size, enum seg_rw rw) +{ + caddr_t sv_addr = addr; + size_t sv_size = size; + struct seg *sv_seg = seg; + ulong_t segcnt = 1; + ulong_t cnt; + size_t ssize; + pgcnt_t npages = btop(size); + page_t **plist; + page_t **pl; + int error; + caddr_t eaddr; + faultcode_t fault_err = 0; + pgcnt_t pl_off; + extern struct seg_ops segspt_shmops; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + ASSERT(seg != NULL); + ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); + ASSERT(addr + size > seg->s_base + seg->s_size); + ASSERT(IS_P2ALIGNED(size, PAGESIZE)); + ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); + + /* + * Count the number of segments covered by the range we are about to + * lock. The segment count is used to size the shadow list we return + * back to the caller. + */ + for (; size != 0; size -= ssize, addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || addr != seg->s_base) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EFAULT); + } + /* + * Do a quick check if subsequent segments + * will most likely support pagelock. + */ + if (seg->s_ops == &segvn_ops) { + vnode_t *vp; + + if (SEGOP_GETVP(seg, addr, &vp) != 0 || + vp != NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + goto slow; + } + } else if (seg->s_ops != &segspt_shmops) { + AS_LOCK_EXIT(as, &as->a_lock); + goto slow; + } + segcnt++; + } + if (addr + size > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = size; + } + } + ASSERT(segcnt > 1); + + plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); + + addr = sv_addr; + size = sv_size; + seg = sv_seg; + + for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL && addr == seg->s_base); + cnt++; + ASSERT(cnt < segcnt); + } + if (addr + size > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = size; + } + pl = &plist[npages + cnt]; + error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, + L_PAGELOCK, rw); + if (error) { + break; + } + ASSERT(plist[npages + cnt] != NULL); + ASSERT(pl_off + btop(ssize) <= npages); + bcopy(plist[npages + cnt], &plist[pl_off], + btop(ssize) * sizeof (page_t *)); + pl_off += btop(ssize); + } + + if (size == 0) { + AS_LOCK_EXIT(as, &as->a_lock); + ASSERT(cnt == segcnt - 1); + *ppp = plist; + return (0); + } + + /* + * one of pagelock calls failed. The error type is in error variable. + * Unlock what we've locked so far and retry with F_SOFTLOCK if error + * type is either EFAULT or ENOTSUP. Otherwise just return the error + * back to the caller. 
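/*
 * Sketch of the shadow-list layout built by as_pagelock_segs() above
 * (user-space, hypothetical types): one allocation holds the npages page
 * pointers the caller sees, followed by one slot per covered segment
 * that remembers where that segment's own shadow list starts, so the
 * matching unlock can hand each piece back to the right segment driver.
 *
 *   plist[0 .. npages-1]               flat list returned to the caller
 *   plist[npages .. npages+segcnt-1]   per-segment shadow list pointers
 */
#include <stdlib.h>
#include <string.h>

typedef struct page page_t;

static page_t **
plist_alloc(size_t npages, size_t segcnt)
{
	return (calloc(npages + segcnt, sizeof (page_t *)));
}

/* Record segment cnt's shadow list and copy its pages into the flat part. */
static void
plist_fill(page_t **plist, size_t npages, size_t cnt, size_t pl_off,
    page_t **seg_pl, size_t seg_npages)
{
	plist[npages + cnt] = (page_t *)seg_pl;
	memcpy(&plist[pl_off], seg_pl, seg_npages * sizeof (page_t *));
}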
+ */ + + eaddr = addr; + seg = sv_seg; + + for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL && addr == seg->s_base); + cnt++; + ASSERT(cnt < segcnt); + } + if (eaddr > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = eaddr - addr; + } + pl = &plist[npages + cnt]; + ASSERT(*pl != NULL); + (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, + L_PAGEUNLOCK, rw); + } + + AS_LOCK_EXIT(as, &as->a_lock); + + kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); + + if (error != ENOTSUP && error != EFAULT) { + return (error); + } + +slow: + /* + * If we are here because pagelock failed due to the need to cow fault + * in the pages we want to lock F_SOFTLOCK will do this job and in + * next as_pagelock() call for this address range pagelock will + * hopefully succeed. + */ + fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); + if (fault_err != 0) { + return (fc_decode(fault_err)); + } + *ppp = NULL; + + return (0); +} + +/* * lock pages in a given address space. Return shadow list. If * the list is NULL, the MMU mapping is also locked. */ @@ -2547,12 +2751,10 @@ size_t size, enum seg_rw rw) { size_t rsize; - caddr_t base; caddr_t raddr; faultcode_t fault_err; struct seg *seg; - int res; - int prefaulted = 0; + int err; TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, "as_pagelock_start: addr %p size %ld", addr, size); @@ -2560,17 +2762,25 @@ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; -top: + /* * if the request crosses two segments let * as_fault handle it. */ AS_LOCK_ENTER(as, &as->a_lock, RW_READER); - seg = as_findseg(as, addr, 0); - if ((seg == NULL) || ((base = seg->s_base) > addr) || - (addr + size) > base + seg->s_size) { + + seg = as_segat(as, raddr); + if (seg == NULL) { AS_LOCK_EXIT(as, &as->a_lock); - goto slow; + return (EFAULT); + } + ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); + if (raddr + rsize > seg->s_base + seg->s_size) { + return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); + } + if (raddr + rsize <= raddr) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EFAULT); } TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, @@ -2579,46 +2789,22 @@ /* * try to lock pages and pass back shadow list */ - res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); + err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); + AS_LOCK_EXIT(as, &as->a_lock); - if (res == 0) { - return (0); - } else if (res == ENOTSUP || prefaulted) { - /* - * (1) segment driver doesn't support PAGELOCK fastpath, or - * (2) we've already tried fast path unsuccessfully after - * faulting in the addr range below; system might be - * thrashing or there may not be enough availrmem. - */ - goto slow; + + if (err == 0 || (err != ENOTSUP && err != EFAULT)) { + return (err); } - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START, - "as_fault_start: addr %p size %ld", addr, size); - /* - * we might get here because of some COW fault or non - * existing page. Let as_fault deal with it. Just load - * the page, don't lock the MMU mapping. 
- */ - fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw); - if (fault_err != 0) { - return (fc_decode(fault_err)); - } - - prefaulted = 1; - - /* - * try fast path again; since we've dropped a_lock, - * we need to try the dance from the start to see if - * the addr range is still valid. - */ - goto top; -slow: - /* - * load the page and lock the MMU mapping. + * Use F_SOFTLOCK to lock the pages because pagelock failed either due + * to no pagelock support for this segment or pages need to be cow + * faulted in. If fault is needed F_SOFTLOCK will do this job for + * this as_pagelock() call and in the next as_pagelock() call for the + * same address range pagelock call will hopefull succeed. */ fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); if (fault_err != 0) { @@ -2631,6 +2817,52 @@ } /* + * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow + * lists from the end of plist and call pageunlock interface for each segment. + * Drop as lock and free plist. + */ +static void +as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, + struct page **plist, enum seg_rw rw) +{ + ulong_t cnt; + caddr_t eaddr = addr + size; + pgcnt_t npages = btop(size); + size_t ssize; + page_t **pl; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + ASSERT(seg != NULL); + ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); + ASSERT(addr + size > seg->s_base + seg->s_size); + ASSERT(IS_P2ALIGNED(size, PAGESIZE)); + ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); + ASSERT(plist != NULL); + + for (cnt = 0; addr < eaddr; addr += ssize) { + if (addr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL && addr == seg->s_base); + cnt++; + } + if (eaddr > seg->s_base + seg->s_size) { + ssize = seg->s_base + seg->s_size - addr; + } else { + ssize = eaddr - addr; + } + pl = &plist[npages + cnt]; + ASSERT(*pl != NULL); + (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, + L_PAGEUNLOCK, rw); + } + ASSERT(cnt > 0); + AS_LOCK_EXIT(as, &as->a_lock); + + cnt++; + kmem_free(plist, (npages + cnt) * sizeof (page_t *)); +} + +/* * unlock pages in a given address range */ void @@ -2652,44 +2884,29 @@ (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); return; } - raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); - rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - - (size_t)raddr; - AS_LOCK_ENTER(as, &as->a_lock, RW_READER); - seg = as_findseg(as, addr, 0); - ASSERT(seg); - TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, - "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); - SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); - AS_LOCK_EXIT(as, &as->a_lock); - TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); -} - -/* - * reclaim cached pages in a given address range - */ -void -as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, - size_t size, enum seg_rw rw) -{ - struct seg *seg; - size_t rsize; - caddr_t raddr; - - ASSERT(AS_READ_HELD(as, &as->a_lock)); - ASSERT(pp != NULL); raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; - seg = as_findseg(as, addr, 0); - ASSERT(seg); - SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, raddr); + ASSERT(seg != NULL); + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, + "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); + + ASSERT(raddr >= seg->s_base && raddr < 
seg->s_base + seg->s_size); + if (raddr + rsize <= seg->s_base + seg->s_size) { + SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); + } else { + as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); + return; + } + AS_LOCK_EXIT(as, &as->a_lock); + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); } -#define MAXPAGEFLIP 4 -#define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE - int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, boolean_t wait) @@ -2735,6 +2952,7 @@ ssize = rsize; } +retry: error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); if (error == IE_NOMEM) { @@ -2778,13 +2996,29 @@ * as_unmap, as_setprot or as_free would do. */ mutex_enter(&as->a_contents); - if (AS_ISUNMAPWAIT(as) == 0) { - cv_broadcast(&as->a_cv); - } - AS_SETUNMAPWAIT(as); - AS_LOCK_EXIT(as, &as->a_lock); - while (AS_ISUNMAPWAIT(as)) { - cv_wait(&as->a_cv, &as->a_contents); + if (!AS_ISNOUNMAPWAIT(as)) { + if (AS_ISUNMAPWAIT(as) == 0) { + cv_broadcast(&as->a_cv); + } + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) { + cv_wait(&as->a_cv, &as->a_contents); + } + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this + * case clean nounmapwait flag and retry since + * softlockcnt in this segment may be already + * 0. We don't drop as writer lock so our + * number of retries without sleeping should + * be very small. See segvn_reclaim() for + * more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); + goto retry; } mutex_exit(&as->a_contents); goto setpgsz_top; @@ -2809,6 +3043,8 @@ size_t ssize; int error; + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + seg = as_segat(as, raddr); if (seg == NULL) { panic("as_iset3_default_lpsize: no seg"); @@ -2864,6 +3100,8 @@ int error; int retry; + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (;;) { error = as_iset3_default_lpsize(as, addr, size, szc, &retry); if (error == EINVAL && retry) { @@ -3150,16 +3388,30 @@ error = EINVAL; } else if (error == EAGAIN) { mutex_enter(&as->a_contents); - if (AS_ISUNMAPWAIT(as) == 0) { - cv_broadcast(&as->a_cv); + if (!AS_ISNOUNMAPWAIT(as)) { + if (AS_ISUNMAPWAIT(as) == 0) { + cv_broadcast(&as->a_cv); + } + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) { + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + } else { + /* + * We may have raced with + * segvn_reclaim()/segspt_reclaim(). In this case + * clean nounmapwait flag and retry since softlockcnt + * in this segment may be already 0. We don't drop as + * writer lock so our number of retries without + * sleeping should be very small. See segvn_reclaim() + * for more comments. + */ + AS_CLRNOUNMAPWAIT(as); + mutex_exit(&as->a_contents); } - AS_SETUNMAPWAIT(as); - AS_LOCK_EXIT(as, &as->a_lock); - while (AS_ISUNMAPWAIT(as)) { - cv_wait(&as->a_cv, &as->a_contents); - } - mutex_exit(&as->a_contents); - AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); goto again; }
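The EAGAIN handling that this change threads through as_free(),
as_setprot(), as_unmap() and as_setpagesize() follows one pattern, shown
here as a stand-alone sketch with hypothetical names: if the segment
reports EAGAIN while the address space has the no-unmapwait flag set, the
caller clears the flag and retries instead of sleeping, because a racing
reclaim may already have dropped the softlock count to zero.

#include <errno.h>
#include <stdbool.h>

struct as_state {
	bool	nounmapwait;	/* set by async reclaim, see segvn_reclaim() */
};

static int
seg_op(void)
{
	return (0);		/* stand-in for SEGOP_UNMAP() and friends */
}

static void
wait_for_unmapwait(void)
{
	/* stand-in for the cv_wait() loop on a_cv */
}

static int
retry_seg_op(struct as_state *as)
{
	int err;

	for (;;) {
		err = seg_op();
		if (err != EAGAIN)
			return (err);
		if (as->nounmapwait) {
			/* reclaim raced with us; clear the flag and retry */
			as->nounmapwait = false;
			continue;
		}
		wait_for_unmapwait();
	}
}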
--- a/usr/src/uts/common/vm/vm_page.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_page.c Thu May 22 22:23:49 2008 -0700 @@ -106,9 +106,6 @@ * These new counters will track the pages locked through segvn and * by explicit user locking. * - * segvn_pages_locked : This keeps track on a global basis how many pages - * are currently locked because of I/O. - * * pages_locked : How many pages are locked because of user specified * locking through mlock or plock. * @@ -117,10 +114,9 @@ * * All these globals are protected by the same lock which protects availrmem. */ -pgcnt_t segvn_pages_locked; -pgcnt_t pages_locked; -pgcnt_t pages_useclaim; -pgcnt_t pages_claimed; +pgcnt_t pages_locked = 0; +pgcnt_t pages_useclaim = 0; +pgcnt_t pages_claimed = 0; /* @@ -5878,7 +5874,6 @@ deficit = tune.t_minarmem + npages + epages - availrmem; mutex_exit(&freemem_lock); page_needfree(deficit); - seg_preap(); kmem_reap(); delay(hz); page_needfree(-(spgcnt_t)deficit); @@ -6285,7 +6280,7 @@ static kmutex_t pc_thread_mutex; static clock_t pc_thread_shortwait; static clock_t pc_thread_longwait; -static int pc_thread_ism_retry; +static int pc_thread_retry; struct page_capture_callback pc_cb[PC_NUM_CALLBACKS]; @@ -6782,17 +6777,13 @@ ASSERT(pp != NULL); - /* only physmem currently has restrictions */ - if (!(flags & CAPTURE_PHYSMEM)) { - return (0); - } - #if defined(__sparc) if (pp->p_vnode == &prom_ppages) { return (EPERM); } - if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE)) { + if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) && + (flags & CAPTURE_PHYSMEM)) { return (ENOENT); } @@ -6805,6 +6796,11 @@ } #endif /* __sparc */ + /* only physmem currently has the restrictions checked below */ + if (!(flags & CAPTURE_PHYSMEM)) { + return (0); + } + if (availrmem < swapfs_minfree) { /* * We won't try to capture this page as we are @@ -7187,7 +7183,7 @@ pc_thread_shortwait = 23 * hz; pc_thread_longwait = 1201 * hz; - pc_thread_ism_retry = 3; + pc_thread_retry = 3; mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL); cv_init(&pc_cv, NULL, CV_DEFAULT, NULL); pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0, @@ -7358,7 +7354,6 @@ static void page_capture_handle_outstanding(void) { - extern size_t spt_used; int ntry; if (!page_retire_pend_count()) { @@ -7380,34 +7375,23 @@ * we reap prior to attempting to capture. */ kmem_reap(); - /* - * When ISM is in use, we need to disable and - * purge the seg_pcache, and initiate aio - * cleanup in order to release page locks and - * subsquently retire pages in need of - * retirement. - */ - if (spt_used) { - /* disable and purge seg_pcache */ - (void) seg_p_disable(); - for (ntry = 0; ntry < pc_thread_ism_retry; ntry++) { - if (!page_retire_pend_count()) - break; - if (do_aio_cleanup()) { - /* - * allow the apps cleanup threads - * to run - */ - delay(pc_thread_shortwait); - } - page_capture_async(); + + /* disable and purge seg_pcache */ + (void) seg_p_disable(); + for (ntry = 0; ntry < pc_thread_retry; ntry++) { + if (!page_retire_pend_count()) + break; + if (do_aio_cleanup()) { + /* + * allow the apps cleanup threads + * to run + */ + delay(pc_thread_shortwait); } - /* reenable seg_pcache */ - seg_p_enable(); - } else { - seg_preap(); page_capture_async(); } + /* reenable seg_pcache */ + seg_p_enable(); } }
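After this change the page capture thread always disables and purges the
segment pcache before retrying, instead of special-casing ISM.  The
resulting control flow reduces to the loop sketched below in user-space C;
the helpers are hypothetical stand-ins for page_retire_pend_count(),
do_aio_cleanup(), page_capture_async() and seg_p_disable()/seg_p_enable().

#include <stdbool.h>

static int  retire_pending(void)  { return (0); }
static bool aio_cleanup(void)     { return (false); }
static void capture_async(void)   { }
static void pcache_disable(void)  { }
static void pcache_enable(void)   { }
static void short_delay(void)     { }

static void
handle_outstanding_retire(int max_retries)
{
	int ntry;

	pcache_disable();	/* drop cached shadow lists that pin pages */
	for (ntry = 0; ntry < max_retries; ntry++) {
		if (retire_pending() == 0)
			break;
		if (aio_cleanup())
			short_delay();	/* give app cleanup threads a chance */
		capture_async();
	}
	pcache_enable();
}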
--- a/usr/src/uts/common/vm/vm_seg.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/common/vm/vm_seg.c Thu May 22 22:23:49 2008 -0700 @@ -48,8 +48,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> +#include <sys/sysmacros.h> #include <sys/vmsystm.h> +#include <sys/tuneable.h> #include <sys/debug.h> +#include <sys/fs/swapnode.h> #include <sys/cmn_err.h> #include <sys/callb.h> #include <sys/mem_config.h> @@ -61,6 +64,8 @@ #include <vm/seg_kmem.h> #include <vm/seg_spt.h> #include <vm/seg_vn.h> +#include <vm/anon.h> + /* * kstats for segment advise */ @@ -72,472 +77,1188 @@ kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); -/* #define PDEBUG */ -#if defined(PDEBUG) || defined(lint) || defined(__lint) -int pdebug = 0; -#else -#define pdebug 0 -#endif /* PDEBUG */ - -#define PPRINTF if (pdebug) printf -#define PPRINT(x) PPRINTF(x) -#define PPRINT1(x, a) PPRINTF(x, a) -#define PPRINT2(x, a, b) PPRINTF(x, a, b) -#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c) -#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d) -#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e) - -#define P_HASHMASK (p_hashsize - 1) -#define P_BASESHIFT 6 - /* * entry in the segment page cache */ struct seg_pcache { - struct seg_pcache *p_hnext; /* list for hashed blocks */ - struct seg_pcache *p_hprev; - int p_active; /* active count */ - int p_ref; /* ref bit */ - size_t p_len; /* segment length */ - caddr_t p_addr; /* base address */ - struct seg *p_seg; /* segment */ - struct page **p_pp; /* pp shadow list */ - enum seg_rw p_rw; /* rw */ - uint_t p_flags; /* bit flags */ - int (*p_callback)(struct seg *, caddr_t, size_t, - struct page **, enum seg_rw); + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + pcache_link_t p_plink; /* per segment/amp list */ + void *p_htag0; /* segment/amp pointer */ + caddr_t p_addr; /* base address/anon_idx */ + size_t p_len; /* total bytes */ + size_t p_wlen; /* writtable bytes at p_addr */ + struct page **p_pp; /* pp shadow list */ + seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */ + clock_t p_lbolt; /* lbolt from last use */ + struct seg_phash *p_hashp; /* our pcache hash bucket */ + uint_t p_active; /* active count */ + uchar_t p_write; /* true if S_WRITE */ + uchar_t p_ref; /* reference byte */ + ushort_t p_flags; /* bit flags */ }; struct seg_phash { - struct seg_pcache *p_hnext; /* list for hashed blocks */ - struct seg_pcache *p_hprev; - int p_qlen; /* Q length */ - kmutex_t p_hmutex; /* protects hash bucket */ + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + kmutex_t p_hmutex; /* protects hash bucket */ + pcache_link_t p_halink[2]; /* active bucket linkages */ +}; + +struct seg_phash_wired { + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + kmutex_t p_hmutex; /* protects hash bucket */ }; -static int seg_preap_time = 20; /* reclaim every 20 secs */ -static int seg_pmaxqlen = 5; /* max Q length in hash list */ -static int seg_ppcount = 5; /* max # of purges per reclaim interval */ -static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */ -static pgcnt_t seg_pwindow; /* max # of pages that can be cached */ -static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */ -static pgcnt_t seg_plocked_window; /* # pages from window */ -int seg_preapahead; +/* + * A parameter to control a maximum number of bytes 
that can be + * purged from pcache at a time. + */ +#define P_MAX_APURGE_BYTES (1024 * 1024 * 1024) + +/* + * log2(fraction of pcache to reclaim at a time). + */ +#define P_SHRINK_SHFT (5) + +/* + * The following variables can be tuned via /etc/system. + */ + +int segpcache_enabled = 1; /* if 1, shadow lists are cached */ +pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */ +ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */ +ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */ +int segpcache_reap_sec = 1; /* reap check rate in secs */ +clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */ +int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */ +clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */ +int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */ +pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */ -static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */ +static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */ +static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */ +static kcondvar_t seg_pasync_cv; + +#pragma align 64(pctrl1) +#pragma align 64(pctrl2) +#pragma align 64(pctrl3) -static int seg_pupdate_active = 1; /* background reclaim thread */ -static clock_t seg_preap_interval; /* reap interval in ticks */ +/* + * Keep frequently used variables together in one cache line. + */ +static struct p_ctrl1 { + uint_t p_disabled; /* if not 0, caching temporarily off */ + pgcnt_t p_maxwin; /* max # of pages that can be cached */ + size_t p_hashwin_sz; /* # of non wired buckets */ + struct seg_phash *p_htabwin; /* hash table for non wired entries */ + size_t p_hashwired_sz; /* # of wired buckets */ + struct seg_phash_wired *p_htabwired; /* hash table for wired entries */ + kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */ +#ifdef _LP64 + ulong_t pad[1]; +#endif /* _LP64 */ +} pctrl1; + +static struct p_ctrl2 { + kmutex_t p_mem_mtx; /* protects window counter and p_halinks */ + pgcnt_t p_locked_win; /* # pages from window */ + pgcnt_t p_locked; /* # of pages cached by pagelock */ + uchar_t p_ahcur; /* current active links for insert/delete */ + uchar_t p_athr_on; /* async reclaim thread is running. 
*/ + pcache_link_t p_ahhead[2]; /* active buckets linkages */ +} pctrl2; -static kmutex_t seg_pcache; /* protects the whole pagelock cache */ -static kmutex_t seg_pmem; /* protects window counter */ -static ksema_t seg_pasync_sem; /* sema for reclaim thread */ -static struct seg_phash *p_hashtab; -static int p_hashsize = 0; +static struct p_ctrl3 { + clock_t p_pcp_maxage; /* max pcp age in ticks */ + ulong_t p_athr_empty_ahb; /* athread walk stats */ + ulong_t p_athr_full_ahb; /* athread walk stats */ + pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */ + int p_shrink_shft; /* reap shift factor */ +#ifdef _LP64 + ulong_t pad[3]; +#endif /* _LP64 */ +} pctrl3; -#define p_hash(seg) \ - (P_HASHMASK & \ - ((uintptr_t)(seg) >> P_BASESHIFT)) +#define seg_pdisabled pctrl1.p_disabled +#define seg_pmaxwindow pctrl1.p_maxwin +#define seg_phashsize_win pctrl1.p_hashwin_sz +#define seg_phashtab_win pctrl1.p_htabwin +#define seg_phashsize_wired pctrl1.p_hashwired_sz +#define seg_phashtab_wired pctrl1.p_htabwired +#define seg_pkmcache pctrl1.p_kmcache +#define seg_pmem_mtx pctrl2.p_mem_mtx +#define seg_plocked_window pctrl2.p_locked_win +#define seg_plocked pctrl2.p_locked +#define seg_pahcur pctrl2.p_ahcur +#define seg_pathr_on pctrl2.p_athr_on +#define seg_pahhead pctrl2.p_ahhead +#define seg_pmax_pcpage pctrl3.p_pcp_maxage +#define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb +#define seg_pathr_full_ahb pctrl3.p_athr_full_ahb +#define seg_pshrink_shift pctrl3.p_shrink_shft +#define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages + +#define P_HASHWIN_MASK (seg_phashsize_win - 1) +#define P_HASHWIRED_MASK (seg_phashsize_wired - 1) +#define P_BASESHIFT (6) + +kthread_t *seg_pasync_thr; + +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; -#define p_match(pcp, seg, addr, len, rw) \ - (((pcp)->p_seg == (seg) && \ - (pcp)->p_addr == (addr) && \ - (pcp)->p_rw == (rw) && \ - (pcp)->p_len == (len)) ? 1 : 0) +#define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED) +#define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags) + +#define LBOLT_DELTA(t) ((ulong_t)(lbolt - (t))) + +#define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt) + +/* + * htag0 argument can be a seg or amp pointer. + */ +#define P_HASHBP(seg, htag0, addr, flags) \ + (IS_PFLAGS_WIRED((flags)) ? \ + ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \ + ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \ + (&seg_phashtab_win[P_HASHWIN_MASK & \ + (((uintptr_t)(htag0) >> 3) ^ \ + ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \ + (flags >> 16) : page_get_shift((seg)->s_szc))))])) -#define p_match_pp(pcp, seg, addr, len, pp, rw) \ - (((pcp)->p_seg == (seg) && \ - (pcp)->p_addr == (addr) && \ - (pcp)->p_pp == (pp) && \ - (pcp)->p_rw == (rw) && \ - (pcp)->p_len == (len)) ? 1 : 0) +/* + * htag0 argument can be a seg or amp pointer. + */ +#define P_MATCH(pcp, htag0, addr, len) \ + ((pcp)->p_htag0 == (htag0) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_len >= (len)) +#define P_MATCH_PP(pcp, htag0, addr, len, pp) \ + ((pcp)->p_pp == (pp) && \ + (pcp)->p_htag0 == (htag0) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_len >= (len)) + +#define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \ + offsetof(struct seg_pcache, p_plink))) + +#define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \ + offsetof(struct seg_phash, p_halink[l]))) /* - * lookup an address range in pagelock cache. Return shadow list - * and bump up active count. 
+ * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from + * active hash bucket lists. We maintain active bucket lists to reduce the + * overhead of finding active buckets during asynchronous purging since there + * can be 10s of millions of buckets on a large system but only a small subset + * of them in actual use. + * + * There're 2 active bucket lists. Current active list (as per seg_pahcur) is + * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete + * buckets. The other list is used by asynchronous purge thread. This allows + * the purge thread to walk its active list without holding seg_pmem_mtx for a + * long time. When asynchronous thread is done with its list it switches to + * current active list and makes the list it just finished processing as + * current active list. + * + * seg_padd_abuck() only adds the bucket to current list if the bucket is not + * yet on any list. seg_premove_abuck() may remove the bucket from either + * list. If the bucket is on current list it will be always removed. Otherwise + * the bucket is only removed if asynchronous purge thread is not currently + * running or seg_premove_abuck() is called by asynchronous purge thread + * itself. A given bucket can only be on one of active lists at a time. These + * routines should be called with per bucket lock held. The routines use + * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after + * the first entry is added to the bucket chain and seg_premove_abuck() must + * be called after the last pcp entry is deleted from its chain. Per bucket + * lock should be held by the callers. This avoids a potential race condition + * when seg_premove_abuck() removes a bucket after pcp entries are added to + * its list after the caller checked that the bucket has no entries. (this + * race would cause a loss of an active bucket from the active lists). + * + * Both lists are circular doubly linked lists anchored at seg_pahhead heads. + * New entries are added to the end of the list since LRU is used as the + * purging policy. + */ +static void +seg_padd_abuck(struct seg_phash *hp) +{ + int lix; + + ASSERT(MUTEX_HELD(&hp->p_hmutex)); + ASSERT((struct seg_phash *)hp->p_hnext != hp); + ASSERT((struct seg_phash *)hp->p_hprev != hp); + ASSERT(hp->p_hnext == hp->p_hprev); + ASSERT(!IS_PCP_WIRED(hp->p_hnext)); + ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp); + ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp); + ASSERT(hp >= seg_phashtab_win && + hp < &seg_phashtab_win[seg_phashsize_win]); + + /* + * This bucket can already be on one of active lists + * since seg_premove_abuck() may have failed to remove it + * before. + */ + mutex_enter(&seg_pmem_mtx); + lix = seg_pahcur; + ASSERT(lix >= 0 && lix <= 1); + if (hp->p_halink[lix].p_lnext != NULL) { + ASSERT(hp->p_halink[lix].p_lprev != NULL); + ASSERT(hp->p_halink[!lix].p_lnext == NULL); + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + mutex_exit(&seg_pmem_mtx); + return; + } + ASSERT(hp->p_halink[lix].p_lprev == NULL); + + /* + * If this bucket is still on list !lix async thread can't yet remove + * it since we hold here per bucket lock. In this case just return + * since async thread will eventually find and process this bucket. + */ + if (hp->p_halink[!lix].p_lnext != NULL) { + ASSERT(hp->p_halink[!lix].p_lprev != NULL); + mutex_exit(&seg_pmem_mtx); + return; + } + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + /* + * This bucket is not on any active bucket list yet. 
+ * Add the bucket to the tail of current active list. + */ + hp->p_halink[lix].p_lnext = &seg_pahhead[lix]; + hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev; + seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix]; + seg_pahhead[lix].p_lprev = &hp->p_halink[lix]; + mutex_exit(&seg_pmem_mtx); +} + +static void +seg_premove_abuck(struct seg_phash *hp, int athr) +{ + int lix; + + ASSERT(MUTEX_HELD(&hp->p_hmutex)); + ASSERT((struct seg_phash *)hp->p_hnext == hp); + ASSERT((struct seg_phash *)hp->p_hprev == hp); + ASSERT(hp >= seg_phashtab_win && + hp < &seg_phashtab_win[seg_phashsize_win]); + + if (athr) { + ASSERT(seg_pathr_on); + ASSERT(seg_pahcur <= 1); + /* + * We are called by asynchronous thread that found this bucket + * on not currently active (i.e. !seg_pahcur) list. Remove it + * from there. Per bucket lock we are holding makes sure + * seg_pinsert() can't sneak in and add pcp entries to this + * bucket right before we remove the bucket from its list. + */ + lix = !seg_pahcur; + ASSERT(hp->p_halink[lix].p_lnext != NULL); + ASSERT(hp->p_halink[lix].p_lprev != NULL); + ASSERT(hp->p_halink[!lix].p_lnext == NULL); + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; + hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; + hp->p_halink[lix].p_lnext = NULL; + hp->p_halink[lix].p_lprev = NULL; + return; + } + + mutex_enter(&seg_pmem_mtx); + lix = seg_pahcur; + ASSERT(lix >= 0 && lix <= 1); + + /* + * If the bucket is on currently active list just remove it from + * there. + */ + if (hp->p_halink[lix].p_lnext != NULL) { + ASSERT(hp->p_halink[lix].p_lprev != NULL); + ASSERT(hp->p_halink[!lix].p_lnext == NULL); + ASSERT(hp->p_halink[!lix].p_lprev == NULL); + hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; + hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; + hp->p_halink[lix].p_lnext = NULL; + hp->p_halink[lix].p_lprev = NULL; + mutex_exit(&seg_pmem_mtx); + return; + } + ASSERT(hp->p_halink[lix].p_lprev == NULL); + + /* + * If asynchronous thread is not running we can remove the bucket from + * not currently active list. The bucket must be on this list since we + * already checked that it's not on the other list and the bucket from + * which we just deleted the last pcp entry must be still on one of the + * active bucket lists. + */ + lix = !lix; + ASSERT(hp->p_halink[lix].p_lnext != NULL); + ASSERT(hp->p_halink[lix].p_lprev != NULL); + + if (!seg_pathr_on) { + hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; + hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; + hp->p_halink[lix].p_lnext = NULL; + hp->p_halink[lix].p_lprev = NULL; + } + mutex_exit(&seg_pmem_mtx); +} + +/* + * Check if bucket pointed by hp already has a pcp entry that matches request + * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise. + * Also delete matching entries that cover smaller address range but start + * at the same address as addr argument. Return the list of deleted entries if + * any. This is an internal helper function called from seg_pinsert() only + * for non wired shadow lists. The caller already holds a per seg/amp list + * lock. 
+ */ +static struct seg_pcache * +seg_plookup_checkdup(struct seg_phash *hp, void *htag0, + caddr_t addr, size_t len, int *found) +{ + struct seg_pcache *pcp; + struct seg_pcache *delcallb_list = NULL; + + ASSERT(MUTEX_HELD(&hp->p_hmutex)); + + *found = 0; + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + ASSERT(pcp->p_hashp == hp); + if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) { + ASSERT(!IS_PCP_WIRED(pcp)); + if (pcp->p_len < len) { + pcache_link_t *plinkp; + if (pcp->p_active) { + continue; + } + plinkp = &pcp->p_plink; + plinkp->p_lprev->p_lnext = plinkp->p_lnext; + plinkp->p_lnext->p_lprev = plinkp->p_lprev; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } else { + *found = 1; + break; + } + } + } + return (delcallb_list); +} + +/* + * lookup an address range in pagelock cache. Return shadow list and bump up + * active count. If amp is not NULL use amp as a lookup tag otherwise use seg + * as a lookup tag. */ struct page ** -seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, + enum seg_rw rw, uint_t flags) { struct seg_pcache *pcp; struct seg_phash *hp; + void *htag0; + + ASSERT(seg != NULL); + ASSERT(rw == S_READ || rw == S_WRITE); /* * Skip pagelock cache, while DR is in progress or * seg_pcache is off. */ - if (seg_pdisable || seg_plazy == 0) { + if (seg_pdisabled) { return (NULL); } + ASSERT(seg_phashsize_win != 0); - hp = &p_hashtab[p_hash(seg)]; + htag0 = (amp == NULL ? (void *)seg : (void *)amp); + hp = P_HASHBP(seg, htag0, addr, flags); mutex_enter(&hp->p_hmutex); for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { - if (p_match(pcp, seg, addr, len, rw)) { + ASSERT(pcp->p_hashp == hp); + if (P_MATCH(pcp, htag0, addr, len)) { + ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); + /* + * If this request wants to write pages + * but write permissions starting from + * addr don't cover the entire length len + * return lookup failure back to the caller. + * It will check protections and fail this + * pagelock operation with EACCESS error. + */ + if (rw == S_WRITE && pcp->p_wlen < len) { + break; + } + if (pcp->p_active == UINT_MAX) { + break; + } pcp->p_active++; + if (rw == S_WRITE && !pcp->p_write) { + pcp->p_write = 1; + } mutex_exit(&hp->p_hmutex); - - PPRINT5("seg_plookup hit: seg %p, addr %p, " - "len %lx, count %d, pplist %p \n", - (void *)seg, (void *)addr, len, pcp->p_active, - (void *)pcp->p_pp); - return (pcp->p_pp); } } mutex_exit(&hp->p_hmutex); - - PPRINT("seg_plookup miss:\n"); - return (NULL); } /* - * mark address range inactive. If the cache is off or the address - * range is not in the cache we call the segment driver to reclaim - * the pages. Otherwise just decrement active count and set ref bit. + * mark address range inactive. If the cache is off or the address range is + * not in the cache or another shadow list that covers bigger range is found + * we call the segment driver to reclaim the pages. Otherwise just decrement + * active count and set ref bit. If amp is not NULL use amp as a lookup tag + * otherwise use seg as a lookup tag. 
*/ void -seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp, - enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t, - struct page **, enum seg_rw)) +seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, + size_t len, struct page **pp, enum seg_rw rw, uint_t flags, + seg_preclaim_cbfunc_t callback) { struct seg_pcache *pcp; struct seg_phash *hp; + kmutex_t *pmtx = NULL; + pcache_link_t *pheadp; + void *htag0; + pgcnt_t npages = 0; + int keep = 0; - if (seg_plazy == 0) { - (void) (*callback)(seg, addr, len, pp, rw); - return; + ASSERT(seg != NULL); + ASSERT(rw == S_READ || rw == S_WRITE); + + htag0 = (amp == NULL ? (void *)seg : (void *)amp); + + /* + * Skip lookup if pcache is not configured. + */ + if (seg_phashsize_win == 0) { + goto out; } - hp = &p_hashtab[p_hash(seg)]; + + /* + * Grab per seg/amp lock before hash lock if we are going to remove + * inactive entry from pcache. + */ + if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) { + if (amp == NULL) { + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + } + mutex_enter(pmtx); + } + + hp = P_HASHBP(seg, htag0, addr, flags); mutex_enter(&hp->p_hmutex); +again: for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; pcp = pcp->p_hnext) { - if (p_match_pp(pcp, seg, addr, len, pp, rw)) { + ASSERT(pcp->p_hashp == hp); + if (P_MATCH_PP(pcp, htag0, addr, len, pp)) { + ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_active); + if (keep) { + /* + * Don't remove this pcp entry + * if we didn't find duplicate + * shadow lists on second search. + * Somebody removed those duplicates + * since we dropped hash lock after first + * search. + */ + ASSERT(pmtx != NULL); + ASSERT(!IS_PFLAGS_WIRED(flags)); + mutex_exit(pmtx); + pmtx = NULL; + } pcp->p_active--; - ASSERT(pcp->p_active >= 0); - if (pcp->p_active == 0 && seg_pdisable) { - int npages; + if (pcp->p_active == 0 && (pmtx != NULL || + (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) { + + /* + * This entry is no longer active. Remove it + * now either because pcaching is temporarily + * disabled or there're other pcp entries that + * can match this pagelock request (i.e. this + * entry is a duplicate). + */ ASSERT(callback == pcp->p_callback); - /* free the entry */ - hp->p_qlen--; + if (pmtx != NULL) { + pcache_link_t *plinkp = &pcp->p_plink; + ASSERT(!IS_PCP_WIRED(pcp)); + ASSERT(pheadp->p_lnext != pheadp); + ASSERT(pheadp->p_lprev != pheadp); + plinkp->p_lprev->p_lnext = + plinkp->p_lnext; + plinkp->p_lnext->p_lprev = + plinkp->p_lprev; + } pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; + if (!IS_PCP_WIRED(pcp) && + hp->p_hnext == (struct seg_pcache *)hp) { + /* + * We removed the last entry from this + * bucket. Now remove the bucket from + * its active list. + */ + seg_premove_abuck(hp, 0); + } mutex_exit(&hp->p_hmutex); - npages = pcp->p_len >> PAGESHIFT; - mutex_enter(&seg_pmem); - seg_plocked -= npages; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - seg_plocked_window -= npages; + if (pmtx != NULL) { + mutex_exit(pmtx); + } + len = pcp->p_len; + npages = btop(len); + if (rw != S_WRITE && pcp->p_write) { + rw = S_WRITE; + } + kmem_cache_free(seg_pkmcache, pcp); + goto out; + } else { + /* + * We found a matching pcp entry but will not + * free it right away even if it's no longer + * active.
+ */ + if (!pcp->p_active && !IS_PCP_WIRED(pcp)) { + /* + * Set the reference bit and mark the + * time of last access to this pcp + * so that asynchronous thread doesn't + * free it immediately since + * it may be reactivated very soon. + */ + pcp->p_lbolt = lbolt; + pcp->p_ref = 1; + } + mutex_exit(&hp->p_hmutex); + if (pmtx != NULL) { + mutex_exit(pmtx); } - mutex_exit(&seg_pmem); - kmem_free(pcp, sizeof (struct seg_pcache)); - goto out; + return; + } + } else if (!IS_PFLAGS_WIRED(flags) && + P_MATCH(pcp, htag0, addr, len)) { + /* + * This is a duplicate pcp entry. This situation may + * happen if a bigger shadow list that covers our + * range was added while our entry was still active. + * Now we can free our pcp entry if it becomes + * inactive. + */ + if (!pcp->p_active) { + /* + * Mark this entry as referenced just in case + * we'll free our own pcp entry soon. + */ + pcp->p_lbolt = lbolt; + pcp->p_ref = 1; + } + if (pmtx != NULL) { + /* + * we are already holding pmtx and found a + * duplicate. Don't keep our own pcp entry. + */ + keep = 0; + continue; } - pcp->p_ref = 1; - mutex_exit(&hp->p_hmutex); - return; + /* + * We have to use mutex_tryenter to attempt to lock + * seg/amp list lock since we already hold hash lock + * and seg/amp list lock is above hash lock in lock + * order. If mutex_tryenter fails drop hash lock and + * retake both locks in correct order and research + * this hash chain. + */ + ASSERT(keep == 0); + if (amp == NULL) { + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + } + if (!mutex_tryenter(pmtx)) { + mutex_exit(&hp->p_hmutex); + mutex_enter(pmtx); + mutex_enter(&hp->p_hmutex); + /* + * If we don't find bigger shadow list on + * second search (it may happen since we + * dropped bucket lock) keep the entry that + * matches our own shadow list. + */ + keep = 1; + goto again; + } } } mutex_exit(&hp->p_hmutex); + if (pmtx != NULL) { + mutex_exit(pmtx); + } out: - (void) (*callback)(seg, addr, len, pp, rw); + (*callback)(htag0, addr, len, pp, rw, 0); + if (npages) { + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); + seg_plocked -= npages; + if (!IS_PFLAGS_WIRED(flags)) { + ASSERT(seg_plocked_window >= npages); + seg_plocked_window -= npages; + } + mutex_exit(&seg_pmem_mtx); + } + } +#ifdef DEBUG +static uint32_t p_insert_chk_mtbf = 0; +#endif + /* * The seg_pinsert_check() is used by segment drivers to predict whether * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. */ - +/*ARGSUSED*/ int -seg_pinsert_check(struct seg *seg, size_t len, uint_t flags) +seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr, + size_t len, uint_t flags) { - struct seg_phash *hp; + ASSERT(seg != NULL); - if (seg_plazy == 0) { - return (SEGP_FAIL); - } - if (seg_pdisable != 0) { +#ifdef DEBUG + if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) { return (SEGP_FAIL); } - ASSERT((len & PAGEOFFSET) == 0); - hp = &p_hashtab[p_hash(seg)]; - if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { +#endif + + if (seg_pdisabled) { return (SEGP_FAIL); } - /* - * If the SEGP_FORCE_WIRED flag is set, - * we skip the check for seg_pwindow.
- */ - if ((flags & SEGP_FORCE_WIRED) == 0) { - pgcnt_t npages; + ASSERT(seg_phashsize_win != 0); + + if (IS_PFLAGS_WIRED(flags)) { + return (SEGP_SUCCESS); + } - npages = len >> PAGESHIFT; - if ((seg_plocked_window + npages) > seg_pwindow) { - return (SEGP_FAIL); - } + if (seg_plocked_window + btop(len) > seg_pmaxwindow) { + return (SEGP_FAIL); } + + if (freemem < desfree) { + return (SEGP_FAIL); + } + return (SEGP_SUCCESS); } +#ifdef DEBUG +static uint32_t p_insert_mtbf = 0; +#endif /* - * insert address range with shadow list into pagelock cache. If - * the cache is off or caching is temporarily disabled or the allowed - * 'window' is exceeded - return SEGP_FAIL. Otherwise return - * SEGP_SUCCESS. + * Insert address range with shadow list into pagelock cache if there's no + * shadow list already cached for this address range. If the cache is off or + * caching is temporarily disabled or the allowed 'window' is exceeded return + * SEGP_FAIL. Otherwise return SEGP_SUCCESS. + * + * For non wired shadow lists (segvn case) include address in the hashing + * function to avoid linking all the entries from the same segment or amp on + * the same bucket. amp is used instead of seg if amp is not NULL. Non wired + * pcache entries are also linked on a per segment/amp list so that all + * entries can be found quickly during seg/amp purge without walking the + * entire pcache hash table. For wired shadow lists (segspt case) we + * don't use address hashing and per segment linking because the caller + * currently inserts only one entry per segment that covers the entire + * segment. If we used per segment linking even for segspt it would complicate + * seg_ppurge_wiredpp() locking. + * + * Both hash bucket and per seg/amp locks need to be held before adding a non + * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken + * first. + * + * This function will also remove from pcache old inactive shadow lists that + * overlap with this request but cover smaller range for the same start + * address. */ int -seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp, - enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t, - size_t, struct page **, enum seg_rw)) +seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, + size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags, + seg_preclaim_cbfunc_t callback) { struct seg_pcache *pcp; struct seg_phash *hp; pgcnt_t npages; + pcache_link_t *pheadp; + kmutex_t *pmtx; + struct seg_pcache *delcallb_list = NULL; - if (seg_plazy == 0) { + ASSERT(seg != NULL); + ASSERT(rw == S_READ || rw == S_WRITE); + ASSERT(rw == S_READ || wlen == len); + ASSERT(rw == S_WRITE || wlen <= len); + ASSERT(amp == NULL || wlen == len); + +#ifdef DEBUG + if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) { return (SEGP_FAIL); } - if (seg_pdisable != 0) { - return (SEGP_FAIL); - } - ASSERT((len & PAGEOFFSET) == 0); - hp = &p_hashtab[p_hash(seg)]; - if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { +#endif + + if (seg_pdisabled) { return (SEGP_FAIL); } - npages = len >> PAGESHIFT; - mutex_enter(&seg_pmem); - /* - * If the SEGP_FORCE_WIRED flag is set, - * we skip the check for seg_pwindow. 
- */ - if ((flags & SEGP_FORCE_WIRED) == 0) { - seg_plocked_window += npages; - if (seg_plocked_window > seg_pwindow) { - seg_plocked_window -= npages; - mutex_exit(&seg_pmem); + ASSERT(seg_phashsize_win != 0); + + ASSERT((len & PAGEOFFSET) == 0); + npages = btop(len); + mutex_enter(&seg_pmem_mtx); + if (!IS_PFLAGS_WIRED(flags)) { + if (seg_plocked_window + npages > seg_pmaxwindow) { + mutex_exit(&seg_pmem_mtx); return (SEGP_FAIL); } + seg_plocked_window += npages; } seg_plocked += npages; - mutex_exit(&seg_pmem); + mutex_exit(&seg_pmem_mtx); - pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP); - pcp->p_seg = seg; + pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP); + /* + * If amp is not NULL set htag0 to amp otherwise set it to seg. + */ + if (amp == NULL) { + pcp->p_htag0 = (void *)seg; + pcp->p_flags = flags & 0xffff; + } else { + pcp->p_htag0 = (void *)amp; + pcp->p_flags = (flags & 0xffff) | SEGP_AMP; + } pcp->p_addr = addr; pcp->p_len = len; + pcp->p_wlen = wlen; pcp->p_pp = pp; - pcp->p_rw = rw; + pcp->p_write = (rw == S_WRITE); pcp->p_callback = callback; pcp->p_active = 1; - pcp->p_flags = flags; - PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n", - (void *)seg, (void *)addr, len, (void *)pp); - - hp = &p_hashtab[p_hash(seg)]; - mutex_enter(&hp->p_hmutex); - hp->p_qlen++; + hp = P_HASHBP(seg, pcp->p_htag0, addr, flags); + if (!IS_PFLAGS_WIRED(flags)) { + int found; + void *htag0; + if (amp == NULL) { + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + htag0 = (void *)seg; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + htag0 = (void *)amp; + } + mutex_enter(pmtx); + mutex_enter(&hp->p_hmutex); + delcallb_list = seg_plookup_checkdup(hp, htag0, addr, + len, &found); + if (found) { + mutex_exit(&hp->p_hmutex); + mutex_exit(pmtx); + mutex_enter(&seg_pmem_mtx); + seg_plocked -= npages; + seg_plocked_window -= npages; + mutex_exit(&seg_pmem_mtx); + kmem_cache_free(seg_pkmcache, pcp); + goto out; + } + pcp->p_plink.p_lnext = pheadp->p_lnext; + pcp->p_plink.p_lprev = pheadp; + pheadp->p_lnext->p_lprev = &pcp->p_plink; + pheadp->p_lnext = &pcp->p_plink; + } else { + mutex_enter(&hp->p_hmutex); + } + pcp->p_hashp = hp; pcp->p_hnext = hp->p_hnext; pcp->p_hprev = (struct seg_pcache *)hp; hp->p_hnext->p_hprev = pcp; hp->p_hnext = pcp; + if (!IS_PFLAGS_WIRED(flags) && + hp->p_hprev == pcp) { + seg_padd_abuck(hp); + } mutex_exit(&hp->p_hmutex); + if (!IS_PFLAGS_WIRED(flags)) { + mutex_exit(pmtx); + } + +out: + npages = 0; + while (delcallb_list != NULL) { + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active); + (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0); + npages += btop(pcp->p_len); + kmem_cache_free(seg_pkmcache, pcp); + } + if (npages) { + ASSERT(!IS_PFLAGS_WIRED(flags)); + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); + ASSERT(seg_plocked_window >= npages); + seg_plocked -= npages; + seg_plocked_window -= npages; + mutex_exit(&seg_pmem_mtx); + } + return (SEGP_SUCCESS); } /* - * purge all entries from the pagelock cache if not active - * and not recently used. Drop all locks and call through - * the address space into the segment driver to reclaim - * the pages. This makes sure we get the address space - * and segment driver locking right. + * purge entries from the pagelock cache if not active + * and not recently used.
*/ static void -seg_ppurge_all(int force) +seg_ppurge_async(int force) { struct seg_pcache *delcallb_list = NULL; struct seg_pcache *pcp; struct seg_phash *hp; - int purge_count = 0; pgcnt_t npages = 0; pgcnt_t npages_window = 0; + pgcnt_t npgs_to_purge; + pgcnt_t npgs_purged = 0; + int hlinks = 0; + int hlix; + pcache_link_t *hlinkp; + pcache_link_t *hlnextp = NULL; + int lowmem; + int trim; + + ASSERT(seg_phashsize_win != 0); /* - * if the cache if off or empty, return + * if the cache is off or empty, return */ - if (seg_plazy == 0 || seg_plocked == 0) { + if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { return; } - for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { + + if (!force) { + lowmem = 0; + trim = 0; + if (freemem < lotsfree + needfree) { + spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); + if (fmem <= 5 * (desfree >> 2)) { + lowmem = 1; + } else if (fmem <= 7 * (lotsfree >> 3)) { + if (seg_plocked_window >= + (availrmem_initial >> 1)) { + lowmem = 1; + } + } else if (fmem < lotsfree) { + if (seg_plocked_window >= + 3 * (availrmem_initial >> 2)) { + lowmem = 1; + } + } + } + if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { + trim = 1; + } + if (!lowmem && !trim) { + return; + } + npgs_to_purge = seg_plocked_window >> + seg_pshrink_shift; + if (lowmem) { + npgs_to_purge = MIN(npgs_to_purge, + MAX(seg_pmaxapurge_npages, desfree)); + } else { + npgs_to_purge = MIN(npgs_to_purge, + seg_pmaxapurge_npages); + } + if (npgs_to_purge == 0) { + return; + } + } else { + struct seg_phash_wired *hpw; + + ASSERT(seg_phashsize_wired != 0); + + for (hpw = seg_phashtab_wired; + hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { + + if (hpw->p_hnext == (struct seg_pcache *)hpw) { + continue; + } + + mutex_enter(&hpw->p_hmutex); + + for (pcp = hpw->p_hnext; + pcp != (struct seg_pcache *)hpw; + pcp = pcp->p_hnext) { + + ASSERT(IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_hashp == + (struct seg_phash *)hpw); + + if (pcp->p_active) { + continue; + } + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } + mutex_exit(&hpw->p_hmutex); + } + } + + mutex_enter(&seg_pmem_mtx); + if (seg_pathr_on) { + mutex_exit(&seg_pmem_mtx); + goto runcb; + } + seg_pathr_on = 1; + mutex_exit(&seg_pmem_mtx); + ASSERT(seg_pahcur <= 1); + hlix = !seg_pahcur; + +again: + for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; + hlinkp = hlnextp) { + + hlnextp = hlinkp->p_lnext; + ASSERT(hlnextp != NULL); + + hp = hlink2phash(hlinkp, hlix); + if (hp->p_hnext == (struct seg_pcache *)hp) { + seg_pathr_empty_ahb++; + continue; + } + seg_pathr_full_ahb++; mutex_enter(&hp->p_hmutex); - pcp = hp->p_hnext; + + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + pcache_link_t *pheadp; + pcache_link_t *plinkp; + void *htag0; + kmutex_t *pmtx; + + ASSERT(!IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_hashp == hp); + + if (pcp->p_active) { + continue; + } + if (!force && pcp->p_ref && + PCP_AGE(pcp) < seg_pmax_pcpage) { + pcp->p_ref = 0; + continue; + } + plinkp = &pcp->p_plink; + htag0 = pcp->p_htag0; + if (pcp->p_flags & SEGP_AMP) { + pheadp = &((amp_t *)htag0)->a_phead; + pmtx = &((amp_t *)htag0)->a_pmtx; + } else { + pheadp = &((seg_t *)htag0)->s_phead; + pmtx = &((seg_t *)htag0)->s_pmtx; + } + if (!mutex_tryenter(pmtx)) { + continue; + } + ASSERT(pheadp->p_lnext != pheadp); + ASSERT(pheadp->p_lprev != pheadp); + plinkp->p_lprev->p_lnext = + plinkp->p_lnext; + 
plinkp->p_lnext->p_lprev = + plinkp->p_lprev; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + mutex_exit(pmtx); + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + npgs_purged += btop(pcp->p_len); + } + if (hp->p_hnext == (struct seg_pcache *)hp) { + seg_premove_abuck(hp, 1); + } + mutex_exit(&hp->p_hmutex); + if (npgs_purged >= seg_plocked_window) { + break; + } + if (!force) { + if (npgs_purged >= npgs_to_purge) { + break; + } + if (!trim && !(seg_pathr_full_ahb & 15)) { + ASSERT(lowmem); + if (freemem >= lotsfree + needfree) { + break; + } + } + } + } + + if (hlinkp == &seg_pahhead[hlix]) { + /* + * We processed the entire hlix active bucket list + * but didn't find enough pages to reclaim. + * Switch the lists and walk the other list + * if we haven't done it yet. + */ + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_pathr_on); + ASSERT(seg_pahcur == !hlix); + seg_pahcur = hlix; + mutex_exit(&seg_pmem_mtx); + if (++hlinks < 2) { + hlix = !hlix; + goto again; + } + } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && + seg_pahhead[hlix].p_lnext != hlinkp) { + ASSERT(hlinkp != NULL); + ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); + ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); + ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); /* - * While 'force' is set, seg_pasync_thread is not - * throttled. This is to speedup flushing of seg_pcache - * in preparation for DR. - * - * In normal case, when 'force' is not set, we throttle - * seg_pasync_thread so that we don't spend all the time - * time in purging the cache. + * Reinsert the header to point to hlinkp + * so that we start from hlinkp bucket next time around. */ - while ((pcp != (struct seg_pcache *)hp) && - (force || (purge_count <= seg_ppcount))) { + seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; + seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; + seg_pahhead[hlix].p_lnext = hlinkp; + seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; + hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; + hlinkp->p_lprev = &seg_pahhead[hlix]; + } + + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_pathr_on); + seg_pathr_on = 0; + mutex_exit(&seg_pmem_mtx); +runcb: + /* + * Run the delayed callback list. segments/amps can't go away until + * callback is executed since they must have non 0 softlockcnt. That's + * why we don't need to hold as/seg/amp locks to execute the callback. + */ + while (delcallb_list != NULL) { + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + ASSERT(!pcp->p_active); + (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1); + npages += btop(pcp->p_len); + if (!IS_PCP_WIRED(pcp)) { + npages_window += btop(pcp->p_len); + } + kmem_cache_free(seg_pkmcache, pcp); + } + if (npages) { + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); + ASSERT(seg_plocked_window >= npages_window); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem_mtx); + } +} + +/* + * Remove cached pages for segment(s) entries from hashtable. The segments + * are identified by pp array. This is useful for multiple seg's cached on + * behalf of dummy segment (ISM/DISM) with common pp array. 
+ */ +void +seg_ppurge_wiredpp(struct page **pp) +{ + struct seg_pcache *pcp; + struct seg_phash_wired *hp; + pgcnt_t npages = 0; + struct seg_pcache *delcallb_list = NULL; + + /* + * if the cache is empty, return + */ + if (seg_plocked == 0) { + return; + } + ASSERT(seg_phashsize_wired != 0); + + for (hp = seg_phashtab_wired; + hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { + if (hp->p_hnext == (struct seg_pcache *)hp) { + continue; + } + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + ASSERT(pcp->p_hashp == (struct seg_phash *)hp); + ASSERT(IS_PCP_WIRED(pcp)); /* - * purge entries which are not active and - * have not been used recently and - * have the SEGP_ASYNC_FLUSH flag. - * - * In the 'force' case, we ignore the - * SEGP_ASYNC_FLUSH flag. + * purge entries which are not active */ - if (!(pcp->p_flags & SEGP_ASYNC_FLUSH)) - pcp->p_ref = 1; - if (force) - pcp->p_ref = 0; - if (!pcp->p_ref && !pcp->p_active) { - struct as *as = pcp->p_seg->s_as; - - /* - * try to get the readers lock on the address - * space before taking out the cache element. - * This ensures as_pagereclaim() can actually - * call through the address space and free - * the pages. If we don't get the lock, just - * skip this entry. The pages will be reclaimed - * by the segment driver at unmap time. - */ - if (AS_LOCK_TRYENTER(as, &as->a_lock, - RW_READER)) { - hp->p_qlen--; - pcp->p_hprev->p_hnext = pcp->p_hnext; - pcp->p_hnext->p_hprev = pcp->p_hprev; - pcp->p_hprev = delcallb_list; - delcallb_list = pcp; - purge_count++; - } - } else { - pcp->p_ref = 0; + if (!pcp->p_active && pcp->p_pp == pp) { + ASSERT(pcp->p_htag0 != NULL); + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; } pcp = pcp->p_hnext; } mutex_exit(&hp->p_hmutex); - if (!force && purge_count > seg_ppcount) - break; + /* + * segments can't go away until callback is executed since + * they must have non 0 softlockcnt. That's why we don't + * need to hold as/seg locks to execute the callback. + */ + while (delcallb_list != NULL) { + int done; + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + ASSERT(!pcp->p_active); + done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, + pcp->p_len, pcp->p_pp, + pcp->p_write ? S_WRITE : S_READ, 1); + npages += btop(pcp->p_len); + ASSERT(IS_PCP_WIRED(pcp)); + kmem_cache_free(seg_pkmcache, pcp); + if (done) { + ASSERT(delcallb_list == NULL); + goto out; + } + } } - /* - * run the delayed callback list. We don't want to hold the - * cache lock during a call through the address space. - */ - while (delcallb_list != NULL) { - struct as *as; - - pcp = delcallb_list; - delcallb_list = pcp->p_hprev; - as = pcp->p_seg->s_as; - - PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, " - "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr, - pcp->p_len, (void *)pcp->p_pp); - - as_pagereclaim(as, pcp->p_pp, pcp->p_addr, - pcp->p_len, pcp->p_rw); - AS_LOCK_EXIT(as, &as->a_lock); - npages += pcp->p_len >> PAGESHIFT; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - npages_window += pcp->p_len >> PAGESHIFT; - } - kmem_free(pcp, sizeof (struct seg_pcache)); - } - mutex_enter(&seg_pmem); +out: + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); seg_plocked -= npages; - seg_plocked_window -= npages_window; - mutex_exit(&seg_pmem); -} - -/* - * Remove cached pages for segment(s) entries from hashtable. - * The segments are identified by a given clients callback - * function. 
- * This is useful for multiple seg's cached on behalf of - * dummy segment (ISM/DISM) with common callback function. - * The clients callback function may return status indicating - * that the last seg's entry has been purged. In such a case - * the seg_ppurge_seg() stops searching hashtable and exits. - * Otherwise all hashtable entries are scanned. - */ -void -seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t, - struct page **, enum seg_rw)) -{ - struct seg_pcache *pcp, *npcp; - struct seg_phash *hp; - pgcnt_t npages = 0; - pgcnt_t npages_window = 0; - int done = 0; - - /* - * if the cache if off or empty, return - */ - if (seg_plazy == 0 || seg_plocked == 0) { - return; - } - mutex_enter(&seg_pcache); - seg_pdisable++; - mutex_exit(&seg_pcache); - - for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { - - mutex_enter(&hp->p_hmutex); - pcp = hp->p_hnext; - while (pcp != (struct seg_pcache *)hp) { - - /* - * purge entries which are not active - */ - npcp = pcp->p_hnext; - if (!pcp->p_active && pcp->p_callback == callback) { - hp->p_qlen--; - pcp->p_hprev->p_hnext = pcp->p_hnext; - pcp->p_hnext->p_hprev = pcp->p_hprev; - - if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr, - pcp->p_len, pcp->p_pp, pcp->p_rw)) { - done = 1; - } - - npages += pcp->p_len >> PAGESHIFT; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - npages_window += - pcp->p_len >> PAGESHIFT; - } - kmem_free(pcp, sizeof (struct seg_pcache)); - } - pcp = npcp; - if (done) - break; - } - mutex_exit(&hp->p_hmutex); - if (done) - break; - } - - mutex_enter(&seg_pcache); - seg_pdisable--; - mutex_exit(&seg_pcache); - - mutex_enter(&seg_pmem); - seg_plocked -= npages; - seg_plocked_window -= npages_window; - mutex_exit(&seg_pmem); + mutex_exit(&seg_pmem_mtx); } /* @@ -546,55 +1267,99 @@ * reclaim the caller needs to hold the right locks. */ void -seg_ppurge(struct seg *seg) +seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags) { struct seg_pcache *delcallb_list = NULL; struct seg_pcache *pcp; struct seg_phash *hp; pgcnt_t npages = 0; - pgcnt_t npages_window = 0; + void *htag0; - if (seg_plazy == 0) { + if (seg_plocked == 0) { return; } - hp = &p_hashtab[p_hash(seg)]; - mutex_enter(&hp->p_hmutex); - pcp = hp->p_hnext; - while (pcp != (struct seg_pcache *)hp) { - if (pcp->p_seg == seg) { + ASSERT(seg_phashsize_win != 0); + + /* + * If amp is not NULL use amp as a lookup tag otherwise use seg + * as a lookup tag. + */ + htag0 = (amp == NULL ? 
(void *)seg : (void *)amp); + ASSERT(htag0 != NULL); + if (IS_PFLAGS_WIRED(flags)) { + hp = P_HASHBP(seg, htag0, 0, flags); + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + ASSERT(pcp->p_hashp == hp); + ASSERT(IS_PCP_WIRED(pcp)); + if (pcp->p_htag0 == htag0) { + if (pcp->p_active) { + break; + } + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } + pcp = pcp->p_hnext; + } + mutex_exit(&hp->p_hmutex); + } else { + pcache_link_t *plinkp; + pcache_link_t *pheadp; + kmutex_t *pmtx; + + if (amp == NULL) { + ASSERT(seg != NULL); + pheadp = &seg->s_phead; + pmtx = &seg->s_pmtx; + } else { + pheadp = &amp->a_phead; + pmtx = &amp->a_pmtx; + } + mutex_enter(pmtx); + while ((plinkp = pheadp->p_lnext) != pheadp) { + pcp = plink2pcache(plinkp); + ASSERT(!IS_PCP_WIRED(pcp)); + ASSERT(pcp->p_htag0 == htag0); + hp = pcp->p_hashp; + mutex_enter(&hp->p_hmutex); if (pcp->p_active) { + mutex_exit(&hp->p_hmutex); break; } - hp->p_qlen--; + ASSERT(plinkp->p_lprev == pheadp); + pheadp->p_lnext = plinkp->p_lnext; + plinkp->p_lnext->p_lprev = pheadp; pcp->p_hprev->p_hnext = pcp->p_hnext; pcp->p_hnext->p_hprev = pcp->p_hprev; pcp->p_hprev = delcallb_list; delcallb_list = pcp; + if (hp->p_hnext == (struct seg_pcache *)hp) { + seg_premove_abuck(hp, 0); + } + mutex_exit(&hp->p_hmutex); } - pcp = pcp->p_hnext; + mutex_exit(pmtx); } - mutex_exit(&hp->p_hmutex); while (delcallb_list != NULL) { pcp = delcallb_list; delcallb_list = pcp->p_hprev; - - PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, " - "pplist %p\n", (void *)seg, (void *)pcp->p_addr, - pcp->p_len, (void *)pcp->p_pp); - - ASSERT(seg == pcp->p_seg); - (void) (*pcp->p_callback)(seg, pcp->p_addr, - pcp->p_len, pcp->p_pp, pcp->p_rw); - npages += pcp->p_len >> PAGESHIFT; - if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { - npages_window += pcp->p_len >> PAGESHIFT; - } - kmem_free(pcp, sizeof (struct seg_pcache)); + ASSERT(!pcp->p_active); + (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, + pcp->p_pp, pcp->p_write ?
S_WRITE : S_READ, 0); + npages += btop(pcp->p_len); + kmem_cache_free(seg_pkmcache, pcp); } - mutex_enter(&seg_pmem); + mutex_enter(&seg_pmem_mtx); + ASSERT(seg_plocked >= npages); seg_plocked -= npages; - seg_plocked_window -= npages_window; - mutex_exit(&seg_pmem); + if (!IS_PFLAGS_WIRED(flags)) { + ASSERT(seg_plocked_window >= npages); + seg_plocked_window -= npages; + } + mutex_exit(&seg_pmem_mtx); } static void seg_pinit_mem_config(void); @@ -606,58 +1371,125 @@ seg_pinit(void) { struct seg_phash *hp; - int i; - uint_t physmegs; + ulong_t i; + pgcnt_t physmegs; + + seg_plocked = 0; + seg_plocked_window = 0; + + if (segpcache_enabled == 0) { + seg_phashsize_win = 0; + seg_phashsize_wired = 0; + seg_pdisabled = 1; + return; + } - sema_init(&seg_pasync_sem, 0, NULL, SEMA_DEFAULT, NULL); + seg_pdisabled = 0; + seg_pkmcache = kmem_cache_create("seg_pcache", + sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (segpcache_pcp_maxage_ticks <= 0) { + segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; + } + seg_pmax_pcpage = segpcache_pcp_maxage_ticks; + seg_pathr_empty_ahb = 0; + seg_pathr_full_ahb = 0; + seg_pshrink_shift = segpcache_shrink_shift; + seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); - mutex_enter(&seg_pcache); - if (p_hashtab == NULL) { - physmegs = physmem >> (20 - PAGESHIFT); + mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); + + physmegs = physmem >> (20 - PAGESHIFT); - /* If p_hashsize was not set in /etc/system ... */ - if (p_hashsize == 0) { - /* - * Choose p_hashsize based on physmem. - */ - if (physmegs < 64) { - p_hashsize = 64; - } else if (physmegs < 1024) { - p_hashsize = 1024; - } else if (physmegs < 10 * 1024) { - p_hashsize = 8192; - } else if (physmegs < 20 * 1024) { - p_hashsize = 2 * 8192; - seg_pmaxqlen = 16; - } else { - p_hashsize = 128 * 1024; - seg_pmaxqlen = 128; - } - } + /* + * If segpcache_hashsize_win was not set in /etc/system or it has + * absurd value set it to a default. + */ + if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { + /* + * Create one bucket per 32K (or at least per 8 pages) of + * available memory. 
+ */ + pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); + segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); + } + if (!ISP2(segpcache_hashsize_win)) { + ulong_t rndfac = ~(1UL << + (highbit(segpcache_hashsize_win) - 1)); + rndfac &= segpcache_hashsize_win; + segpcache_hashsize_win += rndfac; + segpcache_hashsize_win = 1 << + (highbit(segpcache_hashsize_win) - 1); + } + seg_phashsize_win = segpcache_hashsize_win; + seg_phashtab_win = kmem_zalloc( + seg_phashsize_win * sizeof (struct seg_phash), + KM_SLEEP); + for (i = 0; i < seg_phashsize_win; i++) { + hp = &seg_phashtab_win[i]; + hp->p_hnext = (struct seg_pcache *)hp; + hp->p_hprev = (struct seg_pcache *)hp; + mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); + } - p_hashtab = kmem_zalloc(p_hashsize * sizeof (struct seg_phash), - KM_SLEEP); - for (i = 0; i < p_hashsize; i++) { - hp = (struct seg_phash *)&p_hashtab[i]; - hp->p_hnext = (struct seg_pcache *)hp; - hp->p_hprev = (struct seg_pcache *)hp; - mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); - } - if (seg_pwindow == 0) { - if (physmegs < 24) { - /* don't use cache */ - seg_plazy = 0; - } else if (physmegs < 64) { - seg_pwindow = physmem >> 5; /* 3% of memory */ - } else if (physmegs < 10 * 1024) { - seg_pwindow = physmem >> 3; /* 12% of memory */ - } else { - seg_pwindow = physmem >> 1; - } + seg_pahcur = 0; + seg_pathr_on = 0; + seg_pahhead[0].p_lnext = &seg_pahhead[0]; + seg_pahhead[0].p_lprev = &seg_pahhead[0]; + seg_pahhead[1].p_lnext = &seg_pahhead[1]; + seg_pahhead[1].p_lprev = &seg_pahhead[1]; + + /* + * If segpcache_hashsize_wired was not set in /etc/system or it has + * absurd value set it to a default. + */ + if (segpcache_hashsize_wired == 0 || + segpcache_hashsize_wired > physmem / 4) { + /* + * Choose segpcache_hashsize_wired based on physmem. + * Create a bucket per 128K bytes upto 256K buckets. + */ + if (physmegs < 20 * 1024) { + segpcache_hashsize_wired = MAX(1024, physmegs << 3); + } else { + segpcache_hashsize_wired = 256 * 1024; } } - mutex_exit(&seg_pcache); + if (!ISP2(segpcache_hashsize_wired)) { + segpcache_hashsize_wired = 1 << + highbit(segpcache_hashsize_wired); + } + seg_phashsize_wired = segpcache_hashsize_wired; + seg_phashtab_wired = kmem_zalloc( + seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); + for (i = 0; i < seg_phashsize_wired; i++) { + hp = (struct seg_phash *)&seg_phashtab_wired[i]; + hp->p_hnext = (struct seg_pcache *)hp; + hp->p_hprev = (struct seg_pcache *)hp; + mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); + } + if (segpcache_maxwindow == 0) { + if (physmegs < 64) { + /* 3% of memory */ + segpcache_maxwindow = availrmem >> 5; + } else if (physmegs < 512) { + /* 12% of memory */ + segpcache_maxwindow = availrmem >> 3; + } else if (physmegs < 1024) { + /* 25% of memory */ + segpcache_maxwindow = availrmem >> 2; + } else if (physmegs < 2048) { + /* 50% of memory */ + segpcache_maxwindow = availrmem >> 1; + } else { + /* no limit */ + segpcache_maxwindow = (pgcnt_t)-1; + } + } + seg_pmaxwindow = segpcache_maxwindow; seg_pinit_mem_config(); } @@ -668,16 +1500,24 @@ seg_preap(void) { /* - * if the cache if off or empty, return + * if the cache is off or empty, return */ - if (seg_plocked == 0 || seg_plazy == 0) { + if (seg_plocked_window == 0) { return; } - sema_v(&seg_pasync_sem); + ASSERT(seg_phashsize_win != 0); + + /* + * If somebody is already purging pcache + * just return. 
+ */ + if (seg_pdisabled) { + return; + } + + cv_signal(&seg_pasync_cv); } -static void seg_pupdate(void *); - /* * run as a backgroud thread and reclaim pagelock * pages which have not been used recently @@ -686,42 +1526,30 @@ seg_pasync_thread(void) { callb_cpr_t cpr_info; - kmutex_t pasync_lock; /* just for CPR stuff */ - mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL); - - CALLB_CPR_INIT(&cpr_info, &pasync_lock, callb_generic_cpr, - "seg_pasync"); - - if (seg_preap_interval == 0) { - seg_preap_interval = seg_preap_time * hz; - } else { - seg_preap_interval *= hz; - } - if (seg_plazy && seg_pupdate_active) { - (void) timeout(seg_pupdate, NULL, seg_preap_interval); + if (seg_phashsize_win == 0) { + thread_exit(); + /*NOTREACHED*/ } - for (;;) { - mutex_enter(&pasync_lock); - CALLB_CPR_SAFE_BEGIN(&cpr_info); - mutex_exit(&pasync_lock); - sema_p(&seg_pasync_sem); - mutex_enter(&pasync_lock); - CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock); - mutex_exit(&pasync_lock); + seg_pasync_thr = curthread; + + CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, + callb_generic_cpr, "seg_pasync"); + + if (segpcache_reap_ticks <= 0) { + segpcache_reap_ticks = segpcache_reap_sec * hz; + } - seg_ppurge_all(0); - } -} - -static void -seg_pupdate(void *dummy) -{ - sema_v(&seg_pasync_sem); - - if (seg_plazy && seg_pupdate_active) { - (void) timeout(seg_pupdate, dummy, seg_preap_interval); + mutex_enter(&seg_pasync_mtx); + for (;;) { + CALLB_CPR_SAFE_BEGIN(&cpr_info); + (void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx, + lbolt + segpcache_reap_ticks); + CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); + if (seg_pdisabled == 0) { + seg_ppurge_async(0); + } } } @@ -735,8 +1563,8 @@ { kstat_t *ksp; - seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 0, - NULL, NULL, NULL, NULL, NULL, 0); + seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), + 0, NULL, NULL, NULL, NULL, NULL, 0); ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, segadvstat_ndata, KSTAT_FLAG_VIRTUAL); @@ -776,6 +1604,9 @@ new->s_data = NULL; new->s_szc = 0; new->s_flags = 0; + mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL); + new->s_phead.p_lnext = &new->s_phead; + new->s_phead.p_lprev = &new->s_phead; if (seg_attach(as, segbase, segsize, new) < 0) { kmem_cache_free(seg_cache, new); return ((struct seg *)NULL); @@ -857,6 +1688,9 @@ if (seg->s_data != NULL) SEGOP_FREE(seg); + mutex_destroy(&seg->s_pmtx); + ASSERT(seg->s_phead.p_lnext == &seg->s_phead); + ASSERT(seg->s_phead.p_lprev == &seg->s_phead); kmem_cache_free(seg_cache, seg); } @@ -872,10 +1706,10 @@ void seg_p_enable(void) { - mutex_enter(&seg_pcache); - ASSERT(seg_pdisable != 0); - seg_pdisable--; - mutex_exit(&seg_pcache); + mutex_enter(&seg_pcache_mtx); + ASSERT(seg_pdisabled != 0); + seg_pdisabled--; + mutex_exit(&seg_pcache_mtx); } /* @@ -890,18 +1724,19 @@ pgcnt_t old_plocked; int stall_count = 0; - mutex_enter(&seg_pcache); - seg_pdisable++; - ASSERT(seg_pdisable != 0); - mutex_exit(&seg_pcache); + mutex_enter(&seg_pcache_mtx); + seg_pdisabled++; + ASSERT(seg_pdisabled != 0); + mutex_exit(&seg_pcache_mtx); /* * Attempt to empty the cache. Terminate if seg_plocked does not * diminish with SEGP_STALL_THRESHOLD consecutive attempts. */ while (seg_plocked != 0) { + ASSERT(seg_phashsize_win != 0); old_plocked = seg_plocked; - seg_ppurge_all(1); + seg_ppurge_async(1); if (seg_plocked == old_plocked) { if (stall_count++ > SEGP_STALL_THRESHOLD) { return (SEGP_FAIL); @@ -918,7 +1753,7 @@ * Attempt to purge seg_pcache. 
May need to return before this has * completed to allow other pre_del callbacks to unlock pages. This is * ok because: - * 1) The seg_pdisable flag has been set so at least we won't + * 1) The seg_pdisabled flag has been set so at least we won't * cache anymore locks and the locks we couldn't purge * will not be held if they do get released by a subsequent * pre-delete callback. @@ -934,6 +1769,9 @@ void *arg, pgcnt_t delta_pages) { + if (seg_phashsize_win == 0) { + return (0); + } if (seg_p_disable() != SEGP_SUCCESS) cmn_err(CE_NOTE, "!Pre-delete couldn't purge"" pagelock cache - continuing"); @@ -947,6 +1785,9 @@ pgcnt_t delta_pages, int cancelled) { + if (seg_phashsize_win == 0) { + return; + } seg_p_enable(); } @@ -971,9 +1812,6 @@ ASSERT(ret == 0); } -extern struct seg_ops segvn_ops; -extern struct seg_ops segspt_shmops; - /* * Verify that segment is not a shared anonymous segment which reserves * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
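Taken together, the vm_seg.c changes replace per-segment hashing and seg_ppurge_seg() with htag0 (seg or amp) tagged entries, separate wired and non-wired hash tables, and per seg/amp entry lists. A rough sketch of how a segment driver's pagelock and pageunlock paths are expected to call the reworked interfaces; locking, error handling and shadow-list construction are omitted, and my_reclaim()/my_pagelock()/my_pageunlock() are placeholder names, not functions from this changeset:

#include <sys/types.h>
#include <sys/errno.h>
#include <vm/seg.h>
#include <vm/anon.h>
#include <vm/page.h>

/* reclaim callback run by pcache when a cached shadow list is torn down */
static int
my_reclaim(void *htag0, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int async)
{
	/* unlock and release the pages named by the shadow list here */
	return (0);
}

static int
my_pagelock(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    struct page ***ppp, enum seg_rw rw)
{
	struct page **pplist;

	/* fast path: reuse a cached shadow list that covers [addr, addr + len) */
	if ((pplist = seg_plookup(seg, amp, addr, len, rw, 0)) != NULL) {
		*ppp = pplist;
		return (0);
	}

	if (seg_pinsert_check(seg, amp, addr, len, 0) == SEGP_FAIL)
		return (ENOTSUP);	/* caller falls back to its slow path */

	/* ... softlock the pages and build pplist here (driver specific) ... */
	pplist = NULL;			/* placeholder for the real shadow list */

	(void) seg_pinsert(seg, amp, addr, len, len, pplist, rw, 0, my_reclaim);
	*ppp = pplist;
	return (0);
}

static void
my_pageunlock(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    struct page **pplist, enum seg_rw rw)
{
	/* drops the active count; runs my_reclaim() if the entry is stale */
	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, my_reclaim);
}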
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c Thu May 22 22:08:42 2008 -0700 +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c Thu May 22 22:23:49 2008 -0700 @@ -7379,28 +7379,23 @@ return (PP_GENERIC_ATTR(pp)); } - if ((clearflag == (HAT_SYNC_STOPON_REF | HAT_SYNC_DONTZERO)) && - PP_ISREF(pp)) { - return (PP_GENERIC_ATTR(pp)); - } - - if ((clearflag == (HAT_SYNC_STOPON_MOD | HAT_SYNC_DONTZERO)) && - PP_ISMOD(pp)) { - return (PP_GENERIC_ATTR(pp)); - } - - if ((clearflag & HAT_SYNC_STOPON_SHARED) != 0 && - (pp->p_share > po_share) && - !(clearflag & HAT_SYNC_ZERORM)) { - hat_page_setattr(pp, P_REF); - return (PP_GENERIC_ATTR(pp)); - } - - if ((clearflag & HAT_SYNC_STOPON_SHARED) && - !(clearflag & HAT_SYNC_ZERORM)) { - stop_on_sh = 1; - shcnt = 0; - } + if ((clearflag & HAT_SYNC_ZERORM) == 0) { + if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { + return (PP_GENERIC_ATTR(pp)); + } + if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { + return (PP_GENERIC_ATTR(pp)); + } + if (clearflag & HAT_SYNC_STOPON_SHARED) { + if (pp->p_share > po_share) { + hat_page_setattr(pp, P_REF); + return (PP_GENERIC_ATTR(pp)); + } + stop_on_sh = 1; + shcnt = 0; + } + } + clearflag &= ~HAT_SYNC_STOPON_SHARED; pml = sfmmu_mlist_enter(pp); index = PP_MAPINDEX(pp);
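The hat_pagesync() hunk above folds the HAT_SYNC_STOPON_* shortcuts into one block that only runs when HAT_SYNC_ZERORM is clear, since a caller that wants the ref/mod bits cleared must always walk the mappings. A stand-alone restatement of that early-return decision, using simplified flag and page types rather than the real HAT definitions:

#include <stdbool.h>

#define SYNC_ZERORM		0x01	/* clear ref/mod while syncing */
#define SYNC_STOPON_REF		0x02
#define SYNC_STOPON_MOD		0x04
#define SYNC_STOPON_SHARED	0x08

struct pg {
	bool ref;		/* models PP_ISREF() */
	bool mod;		/* models PP_ISMOD() */
	unsigned long share;	/* models pp->p_share */
};

/* true if the sync may return the page attributes without walking mappings */
static bool
can_stop_early(unsigned int flags, const struct pg *pp, unsigned long po_share)
{
	if (flags & SYNC_ZERORM)
		return (false);	/* must visit every mapping to clear the bits */
	if ((flags & SYNC_STOPON_REF) && pp->ref)
		return (true);
	if ((flags & SYNC_STOPON_MOD) && pp->mod)
		return (true);
	if ((flags & SYNC_STOPON_SHARED) && pp->share > po_share)
		return (true);	/* the caller also marks the page referenced */
	return (false);
}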