illumos-gate changeset 9894:42b0c48b08a4
6708183 poor scalability of mdb memstat with increasing CPU count
author      Pavel Tatashin <Pavel.Tatashin@Sun.COM>
date        Wed, 17 Jun 2009 15:32:10 -0700
parents     589b92d8d72b
children    adcf72c91c4f
files       usr/src/cmd/mdb/common/modules/genunix/genunix.c
            usr/src/cmd/mdb/common/modules/genunix/memory.c
            usr/src/cmd/mdb/common/modules/genunix/memory.h
            usr/src/uts/common/io/mem.c
            usr/src/uts/common/sys/vnode.h
            usr/src/uts/common/vm/hat.h
            usr/src/uts/i86pc/vm/hat_i86.c
            usr/src/uts/sun4u/vm/mach_kpm.c
            usr/src/uts/sun4v/vm/mach_kpm.c
diffstat    9 files changed, 456 insertions(+), 86 deletions(-)
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c  Wed Jun 17 15:32:10 2009 -0700
@@ -4833,6 +4833,8 @@
         /* from memory.c */
         { "page", "walk all pages, or those from the specified vnode",
                 page_walk_init, page_walk_step, page_walk_fini },
+        { "allpages", "walk all pages, including free pages",
+                allpages_walk_init, allpages_walk_step, allpages_walk_fini },
         { "memlist", "walk specified memlist",
                 NULL, memlist_walk_step, NULL },
         { "swapinfo", "walk swapinfo structures",
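
Once registered, the walker can be driven from mdb like any other global walk.
A possible session against a live kernel (the pipeline is illustrative; the
fields printed will vary):

        # mdb -k
        > ::walk allpages | ::print page_t p_vnode p_offset

Because allpages is layered over the memseg walker, it visits every page_t in
physical-memory order, including free pages, which is what lets ::memstat
below make a single pass over the system.
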
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.c  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c  Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -214,9 +214,222 @@
         mdb_free(wsp->walk_data, sizeof (page_walk_data_t));
 }
 
+/*
+ * allpages walks all pages in the system in order they appear in
+ * the memseg structure
+ */
+
+#define PAGE_BUFFER     128
+
+int
+allpages_walk_init(mdb_walk_state_t *wsp)
+{
+        if (wsp->walk_addr != 0) {
+                mdb_warn("allpages only supports global walks.\n");
+                return (WALK_ERR);
+        }
+
+        if (mdb_layered_walk("memseg", wsp) == -1) {
+                mdb_warn("couldn't walk 'memseg'");
+                return (WALK_ERR);
+        }
+
+        wsp->walk_data = mdb_alloc(sizeof (page_t) * PAGE_BUFFER, UM_SLEEP);
+
+        return (WALK_NEXT);
+}
+
+int
+allpages_walk_step(mdb_walk_state_t *wsp)
+{
+        const struct memseg *msp = wsp->walk_layer;
+        page_t *buf = wsp->walk_data;
+        size_t pg_read, i;
+        size_t pg_num = msp->pages_end - msp->pages_base;
+        const page_t *pg_addr = msp->pages;
+
+        while (pg_num > 0) {
+                pg_read = MIN(pg_num, PAGE_BUFFER);
+
+                if (mdb_vread(buf, pg_read * sizeof (page_t),
+                    (uintptr_t)pg_addr) == -1) {
+                        mdb_warn("can't read page_t's at %#lx", pg_addr);
+                        return (WALK_ERR);
+                }
+                for (i = 0; i < pg_read; i++) {
+                        int ret = wsp->walk_callback((uintptr_t)&pg_addr[i],
+                            &buf[i], wsp->walk_cbdata);
+
+                        if (ret != WALK_NEXT)
+                                return (ret);
+                }
+                pg_num -= pg_read;
+                pg_addr += pg_read;
+        }
+
+        return (WALK_NEXT);
+}
+
+void
+allpages_walk_fini(mdb_walk_state_t *wsp)
+{
+        mdb_free(wsp->walk_data, sizeof (page_t) * PAGE_BUFFER);
+}
+
+/*
+ * Hash table + LRU queue.
+ * This table is used to cache recently read vnodes for the memstat
+ * command, to reduce the number of mdb_vread calls. This greatly
+ * speeds the memstat command on on live, large CPU count systems.
+ */
+
+#define VN_SMALL        401
+#define VN_LARGE        10007
+#define VN_HTABLE_KEY(p, hp)    ((p) % ((hp)->vn_htable_buckets))
+
+struct vn_htable_list {
+        uint_t vn_flag;                         /* v_flag from vnode */
+        uintptr_t vn_ptr;                       /* pointer to vnode */
+        struct vn_htable_list *vn_q_next;       /* queue next pointer */
+        struct vn_htable_list *vn_q_prev;       /* queue prev pointer */
+        struct vn_htable_list *vn_h_next;       /* hash table pointer */
+};
+
+/*
+ * vn_q_first        -> points to to head of queue: the vnode that was most
+ *                      recently used
+ * vn_q_last         -> points to the oldest used vnode, and is freed once a new
+ *                      vnode is read.
+ * vn_htable         -> hash table
+ * vn_htable_buf     -> contains htable objects
+ * vn_htable_size    -> total number of items in the hash table
+ * vn_htable_buckets -> number of buckets in the hash table
+ */
+typedef struct vn_htable {
+        struct vn_htable_list  *vn_q_first;
+        struct vn_htable_list  *vn_q_last;
+        struct vn_htable_list **vn_htable;
+        struct vn_htable_list  *vn_htable_buf;
+        int vn_htable_size;
+        int vn_htable_buckets;
+} vn_htable_t;
+
+
+/* allocate memory, initilize hash table and LRU queue */
+static void
+vn_htable_init(vn_htable_t *hp, size_t vn_size)
+{
+        int i;
+        int htable_size = MAX(vn_size, VN_LARGE);
+
+        if ((hp->vn_htable_buf = mdb_zalloc(sizeof (struct vn_htable_list)
+            * htable_size, UM_NOSLEEP|UM_GC)) == NULL) {
+                htable_size = VN_SMALL;
+                hp->vn_htable_buf = mdb_zalloc(sizeof (struct vn_htable_list)
+                    * htable_size, UM_SLEEP|UM_GC);
+        }
+
+        hp->vn_htable = mdb_zalloc(sizeof (struct vn_htable_list *)
+            * htable_size, UM_SLEEP|UM_GC);
+
+        hp->vn_q_first = &hp->vn_htable_buf[0];
+        hp->vn_q_last = &hp->vn_htable_buf[htable_size - 1];
+        hp->vn_q_first->vn_q_next = &hp->vn_htable_buf[1];
+        hp->vn_q_last->vn_q_prev = &hp->vn_htable_buf[htable_size - 2];
+
+        for (i = 1; i < (htable_size-1); i++) {
+                hp->vn_htable_buf[i].vn_q_next = &hp->vn_htable_buf[i + 1];
+                hp->vn_htable_buf[i].vn_q_prev = &hp->vn_htable_buf[i - 1];
+        }
+
+        hp->vn_htable_size = htable_size;
+        hp->vn_htable_buckets = htable_size;
+}
+
+
+/*
+ * Find the vnode whose address is ptr, and return its v_flag in vp->v_flag.
+ * The function tries to find needed information in the following order:
+ *
+ * 1. check if ptr is the first in queue
+ * 2. check if ptr is in hash table (if so move it to the top of queue)
+ * 3. do mdb_vread, remove last queue item from queue and hash table.
+ *    Insert new information to freed object, and put this object in to the
+ *    top of the queue.
+ */
+static int
+vn_get(vn_htable_t *hp, struct vnode *vp, uintptr_t ptr)
+{
+        int hkey;
+        struct vn_htable_list *hent, **htmp, *q_next, *q_prev;
+        struct vn_htable_list *q_first = hp->vn_q_first;
+
+        /* 1. vnode ptr is the first in queue, just get v_flag and return */
+        if (q_first->vn_ptr == ptr) {
+                vp->v_flag = q_first->vn_flag;
+
+                return (0);
+        }
+
+        /* 2. search the hash table for this ptr */
+        hkey = VN_HTABLE_KEY(ptr, hp);
+        hent = hp->vn_htable[hkey];
+        while (hent && (hent->vn_ptr != ptr))
+                hent = hent->vn_h_next;
+
+        /* 3. if hent is NULL, we did not find in hash table, do mdb_vread */
+        if (hent == NULL) {
+                struct vnode vn;
+
+                if (mdb_vread(&vn, sizeof (vnode_t), ptr) == -1) {
+                        mdb_warn("unable to read vnode_t at %#lx", ptr);
+                        return (-1);
+                }
+
+                /* we will insert read data into the last element in queue */
+                hent = hp->vn_q_last;
+
+                /* remove last hp->vn_q_last object from hash table */
+                if (hent->vn_ptr) {
+                        htmp = &hp->vn_htable[VN_HTABLE_KEY(hent->vn_ptr, hp)];
+                        while (*htmp != hent)
+                                htmp = &(*htmp)->vn_h_next;
+                        *htmp = hent->vn_h_next;
+                }
+
+                /* insert data into new free object */
+                hent->vn_ptr = ptr;
+                hent->vn_flag = vn.v_flag;
+
+                /* insert new object into hash table */
+                hent->vn_h_next = hp->vn_htable[hkey];
+                hp->vn_htable[hkey] = hent;
+        }
+
+        /* Remove from queue. hent is not first, vn_q_prev is not NULL */
+        q_next = hent->vn_q_next;
+        q_prev = hent->vn_q_prev;
+        if (q_next == NULL)
+                hp->vn_q_last = q_prev;
+        else
+                q_next->vn_q_prev = q_prev;
+        q_prev->vn_q_next = q_next;
+
+        /* Add to the front of queue */
+        hent->vn_q_prev = NULL;
+        hent->vn_q_next = q_first;
+        q_first->vn_q_prev = hent;
+        hp->vn_q_first = hent;
+
+        /* Set v_flag in vnode pointer from hent */
+        vp->v_flag = hent->vn_flag;
+
+        return (0);
+}
+
 /* Summary statistics of pages */
 
 typedef struct memstat {
         struct vnode    *ms_kvp;        /* Cached address of kernel vnode */
+        struct vnode    *ms_unused_vp;  /* Unused pages vnode pointer */
         struct vnode    *ms_zvp;        /* Cached address of zio vnode */
         uint64_t ms_kmem;               /* Pages of kernel memory */
         uint64_t ms_zfs_data;           /* Pages of zfs data */
@@ -225,6 +438,8 @@
         uint64_t ms_exec;               /* Pages of exec/library memory */
         uint64_t ms_cachelist;          /* Pages on the cachelist (free) */
         uint64_t ms_total;              /* Pages on page hash */
+        vn_htable_t *ms_vn_htable;      /* Pointer to hash table */
+        struct vnode ms_vn;             /* vnode buffer */
 } memstat_t;
 
 #define MS_PP_ISKAS(pp, stats)                          \
@@ -234,36 +449,28 @@
         (((stats)->ms_zvp != NULL) && ((pp)->p_vnode == (stats)->ms_zvp))
 
 /*
- * Summarize pages by type; called from page walker.
+ * Summarize pages by type and update stat information
  */
 
 /* ARGSUSED */
 static int
 memstat_callback(page_t *page, page_t *pp, memstat_t *stats)
 {
-        struct vnode vn, *vp;
-        uintptr_t ptr;
+        struct vnode *vp = &stats->ms_vn;
 
-        /* read page's vnode pointer */
-        if ((ptr = (uintptr_t)(pp->p_vnode)) != NULL) {
-                if (mdb_vread(&vn, sizeof (vnode_t), ptr) == -1) {
-                        mdb_warn("unable to read vnode_t at %#lx",
-                            ptr);
-                        return (WALK_ERR);
-                }
-                vp = &vn;
-        } else
-                vp = NULL;
-
-        if (PP_ISFREE(pp))
-                stats->ms_cachelist++;
-        else if (vp && IS_SWAPFSVP(vp))
-                stats->ms_anon++;
+        if (pp->p_vnode == NULL || pp->p_vnode == stats->ms_unused_vp)
+                return (WALK_NEXT);
+        else if (MS_PP_ISKAS(pp, stats))
+                stats->ms_kmem++;
         else if (MS_PP_ISZFS_DATA(pp, stats))
                 stats->ms_zfs_data++;
-        else if (MS_PP_ISKAS(pp, stats))
-                stats->ms_kmem++;
-        else if (vp && (((vp)->v_flag & VVMEXEC)) != 0)
+        else if (PP_ISFREE(pp))
+                stats->ms_cachelist++;
+        else if (vn_get(stats->ms_vn_htable, vp, (uintptr_t)pp->p_vnode))
+                return (WALK_ERR);
+        else if (IS_SWAPFSVP(vp))
+                stats->ms_anon++;
+        else if ((vp->v_flag & VVMEXEC) != 0)
                 stats->ms_exec++;
         else
                 stats->ms_vnode++;
@@ -281,19 +488,33 @@
         pgcnt_t total_pages, physmem;
         ulong_t freemem;
         memstat_t stats;
-        memstat_t unused_stats;
         GElf_Sym sym;
+        vn_htable_t ht;
+        uintptr_t vn_size = 0;
 #if defined(__i386) || defined(__amd64)
         bln_stats_t bln_stats;
         ssize_t bln_size;
 #endif
 
         bzero(&stats, sizeof (memstat_t));
-        bzero(&unused_stats, sizeof (memstat_t));
 
-        if (argc != 0 || (flags & DCMD_ADDRSPEC))
+        /*
+         * -s size, is an internal option. It specifies the size of vn_htable.
+         * Hash table size is set in the following order:
+         * If user has specified the size that is larger than VN_LARGE: try it,
+         * but if malloc failed default to VN_SMALL. Otherwise try VN_LARGE, if
+         * failed to allocate default to VN_SMALL.
+         * For a better efficiency of hash table it is highly recommended to
+         * set size to a prime number.
+         */
+        if ((flags & DCMD_ADDRSPEC) || mdb_getopts(argc, argv,
+            's', MDB_OPT_UINTPTR, &vn_size, NULL) != argc)
                 return (DCMD_USAGE);
 
+        /* Initialize vnode hash list and queue */
+        vn_htable_init(&ht, vn_size);
+        stats.ms_vn_htable = &ht;
+
         /* Grab base page size */
         if (mdb_readvar(&pagesize, "_pagesize") == -1) {
                 mdb_warn("unable to read _pagesize");
@@ -332,37 +553,26 @@
                 stats.ms_zvp = (struct vnode *)(uintptr_t)sym.st_value;
         }
 
-        /* Walk page structures, summarizing usage */
-        if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback,
-            &stats) == -1) {
-                mdb_warn("can't walk pages");
-                return (DCMD_ERR);
-        }
-
-        /* read unused pages vnode */
+        /*
+         * If physmem != total_pages, then the administrator has limited the
+         * number of pages available in the system. Excluded pages are
+         * associated with the unused pages vnode. Read this vnode so the
+         * pages can be excluded in the page accounting.
+         */
         if (mdb_lookup_by_obj(MDB_OBJ_EXEC, "unused_pages_vp",
             (GElf_Sym *)&sym) == -1) {
                 mdb_warn("unable to read unused_pages_vp");
                 return (DCMD_ERR);
         }
-
-        unused_stats.ms_kvp = (struct vnode *)(uintptr_t)sym.st_value;
+        stats.ms_unused_vp = (struct vnode *)(uintptr_t)sym.st_value;
 
-        /* Find unused pages */
-        if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback,
-            &unused_stats) == -1) {
-                mdb_warn("can't walk pages");
+        /* walk all pages, collect statistics */
+        if (mdb_walk("allpages", (mdb_walk_cb_t)memstat_callback,
+            &stats) == -1) {
+                mdb_warn("can't walk memseg");
                 return (DCMD_ERR);
         }
 
-        /*
-         * If physmem != total_pages, then the administrator has limited the
-         * number of pages available in the system. In order to account for
-         * this, we reduce the amount normally attributed to the page cache.
-         */
-        stats.ms_vnode -= unused_stats.ms_kmem;
-        stats.ms_total -= unused_stats.ms_kmem;
-
 #define MS_PCT_TOTAL(x) ((ulong_t)((((5 * total_pages) + ((x) * 1000ull))) / \
                 ((physmem) * 10)))
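
With the vnode cache in place, ::memstat reads each distinct vnode once
rather than once per page. The -s option is internal, as the comment above
notes, but can be used to size the hash table on machines with very large
page counts. A hypothetical session (20011 is just an example prime, per
the recommendation above):

        # mdb -k
        > ::memstat
        > ::memstat -s 20011

If the UM_NOSLEEP allocation for the requested table fails, vn_htable_init
falls back to VN_SMALL (401) buckets rather than failing the dcmd.
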
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.h  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/memory.h  Wed Jun 17 15:32:10 2009 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright (c) 2000-2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef _MEMORY_H
 #define _MEMORY_H
 
-#pragma ident   "%Z%%M% %I%     %E% SMI"
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -37,6 +34,9 @@
 int page_walk_step(mdb_walk_state_t *);
 void page_walk_fini(mdb_walk_state_t *);
 int page(uintptr_t, uint_t, int, const mdb_arg_t *);
+int allpages_walk_init(mdb_walk_state_t *);
+int allpages_walk_step(mdb_walk_state_t *);
+void allpages_walk_fini(mdb_walk_state_t *);
 int memstat(uintptr_t, uint_t, int, const mdb_arg_t *);
 
 int swap_walk_init(mdb_walk_state_t *);
--- a/usr/src/uts/common/io/mem.c  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/common/io/mem.c  Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -234,18 +234,34 @@
 #pragma weak mach_sync_icache_pa
 
 static int
-mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
+mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
+    page_t *pp)
 {
         int error = 0;
+        int devload = 0;
+        int is_memory = pf_is_memory(pfn);
         size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
             (size_t)uio->uio_iov->iov_len);
+        caddr_t va = NULL;
 
         mutex_enter(&mm_lock);
-        hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
-            (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
-            HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
 
-        if (!pf_is_memory(pfn)) {
+        if (is_memory && kpm_enable) {
+                if (pp)
+                        va = hat_kpm_mapin(pp, NULL);
+                else
+                        va = hat_kpm_mapin_pfn(pfn);
+        }
+
+        if (va == NULL) {
+                hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
+                    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
+                    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
+                va = mm_map;
+                devload = 1;
+        }
+
+        if (!is_memory) {
                 if (allowio) {
                         size_t c = uio->uio_iov->iov_len;
 
@@ -256,7 +272,7 @@
                 } else
                         error = EIO;
         } else {
-                error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
+                error = uiomove(va + pageoff, nbytes, rw, uio);
 
                 /*
                  * In case this has changed executable code,
@@ -267,7 +283,13 @@
                 }
         }
 
-        hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
+        if (devload)
+                hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
+        else if (pp)
+                hat_kpm_mapout(pp, NULL, va);
+        else
+                hat_kpm_mapout_pfn(pfn);
+
         mutex_exit(&mm_lock);
         return (error);
 }
@@ -330,13 +352,13 @@
 
                 v = BTOP((u_offset_t)uio->uio_loffset);
                 error = mmio(uio, rw, v,
-                    uio->uio_loffset & PAGEOFFSET, 0);
+                    uio->uio_loffset & PAGEOFFSET, 0, NULL);
                 break;
 
         case M_KMEM:
         case M_ALLKMEM:
                 {
-                page_t **ppp;
+                page_t **ppp = NULL;
                 caddr_t vaddr = (caddr_t)uio->uio_offset;
                 int try_lock = NEED_LOCK_KVADDR(vaddr);
                 int locked = 0;
@@ -369,7 +391,8 @@
                 }
 
                 error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
-                    minor == M_ALLKMEM || mm_kmem_io_access);
+                    minor == M_ALLKMEM || mm_kmem_io_access,
+                    (locked && ppp) ? *ppp : NULL);
                 if (locked)
                         as_pageunlock(&kas, ppp, vaddr, PAGESIZE, S_WRITE);
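
The reworked mmio() prefers a kpm mapping (hat_kpm_mapin() when a locked
page_t was passed in, hat_kpm_mapin_pfn() otherwise) and only falls back to
the transient hat_devload()/hat_unload() mapping of mm_map, the path that
costs a TLB shootdown per page, for non-memory pfns or when kpm is disabled.
Any privileged read of /dev/mem or /dev/kmem exercises this code; a purely
illustrative example (the offset is arbitrary, and reading raw physical
memory should be done with care):

        # dd if=/dev/mem bs=4096 iseek=256 count=1 | od -x | head
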
--- a/usr/src/uts/common/sys/vnode.h  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/common/sys/vnode.h  Wed Jun 17 15:32:10 2009 -0700
@@ -326,6 +326,12 @@
         (pvn_vmodsort_supported != 0 && ((vp)->v_flag & VMODSORT) != 0)
 
 #define VISSWAPFS       0x20000 /* vnode is being used for swapfs */
+
+/*
+ * The mdb memstat command assumes that IS_SWAPFSVP only uses the
+ * vnode's v_flag field. If this changes, cache the additional
+ * fields in mdb; see vn_get in mdb/common/modules/genunix/memory.c
+ */
 #define IS_SWAPFSVP(vp) (((vp)->v_flag & VISSWAPFS) != 0)
 
 #define V_SYSATTR       0x40000 /* vnode is a GFS system attribute */
--- a/usr/src/uts/common/vm/hat.h  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/common/vm/hat.h  Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -248,6 +248,8 @@
  */
 caddr_t hat_kpm_mapin(struct page *, struct kpme *);
 void    hat_kpm_mapout(struct page *, struct kpme *, caddr_t);
+caddr_t hat_kpm_mapin_pfn(pfn_t);
+void    hat_kpm_mapout_pfn(pfn_t);
 caddr_t hat_kpm_page2va(struct page *, int);
 struct page     *hat_kpm_vaddr2page(caddr_t);
 int     hat_kpm_fault(struct hat *, caddr_t);
--- a/usr/src/uts/i86pc/vm/hat_i86.c  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/i86pc/vm/hat_i86.c  Wed Jun 17 15:32:10 2009 -0700
@@ -138,7 +138,7 @@
 /*
  * AMD shanghai processors provide better management of 1gb ptes in its tlb.
- * By default, 1g page support will be disabled for pre-shanghai AMD
+ * By default, 1g page suppport will be disabled for pre-shanghai AMD
  * processors that don't have optimal tlb support for the 1g page size.
  * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal
  * processors.
@@ -1299,7 +1299,7 @@
         int             rv = 0;
 
         /*
-         * Is this a consistent (ie. need mapping list lock) mapping?
+         * Is this a consistant (ie. need mapping list lock) mapping?
          */
         is_consist = (pp != NULL && (flags & HAT_LOAD_NOCONSIST) == 0);
 
@@ -1991,15 +1991,22 @@
 
 /*
  * Service a delayed TLB flush if coming out of being idle.
- * It will be called from cpu idle notification with interrupt disabled.
  */
 void
 tlb_service(void)
 {
+        ulong_t flags = getflags();
         ulong_t tlb_info;
         ulong_t found;
 
         /*
+         * Be sure interrupts are off while doing this so that
+         * higher level interrupts correctly wait for flushes to finish.
+         */
+        if (flags & PS_IE)
+                flags = intr_clear();
+
+        /*
          * We only have to do something if coming out of being idle.
          */
         tlb_info = CPU->cpu_m.mcpu_tlb_info;
@@ -2017,6 +2024,12 @@
                 if (tlb_info & TLB_INVAL_ALL)
                         flush_all_tlb_entries();
         }
+
+        /*
+         * Restore interrupt enable control bit.
+         */
+        if (flags & PS_IE)
+                sti();
 }
 #endif /* !__xpv */
 
@@ -3165,7 +3178,7 @@
 
 /*
  * Called when all mappings to a page should have write permission removed.
- * Mostly stolen from hat_pagesync()
+ * Mostly stolem from hat_pagesync()
  */
 static void
 hati_page_clrwrt(struct page *pp)
@@ -3298,8 +3311,8 @@
 
 /*
  * If flag is specified, returns 0 if attribute is disabled
- * and non zero if enabled. If flag specifes multiple attributes
- * then returns 0 if ALL attributes are disabled. This is an advisory
+ * and non zero if enabled. If flag specifes multiple attributs
+ * then returns 0 if ALL atriibutes are disabled. This is an advisory
  * call.
  */
 uint_t
@@ -4227,6 +4240,38 @@
 }
 
 /*
+ * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
+ * memory addresses that are not described by a page_t. It can
+ * also be used for normal pages that are not locked, but beware
+ * this is dangerous - no locking is performed, so the identity of
+ * the page could change. hat_kpm_mapin_pfn is not supported when
+ * vac_colors > 1, because the chosen va depends on the page identity,
+ * which could change.
+ * The caller must only pass pfn's for valid physical addresses; violation
+ * of this rule will cause panic.
+ */
+caddr_t
+hat_kpm_mapin_pfn(pfn_t pfn)
+{
+        caddr_t paddr, vaddr;
+
+        if (kpm_enable == 0)
+                return ((caddr_t)NULL);
+
+        paddr = (caddr_t)ptob(pfn);
+        vaddr = (uintptr_t)kpm_vbase + paddr;
+
+        return ((caddr_t)vaddr);
+}
+
+/*ARGSUSED*/
+void
+hat_kpm_mapout_pfn(pfn_t pfn)
+{
+        /* empty */
+}
+
+/*
  * Return the kpm virtual address for a specific pfn
  */
 caddr_t
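
On x86 the kpm segment maps all of physical memory linearly at kpm_vbase, so
hat_kpm_mapin_pfn() reduces to address arithmetic and hat_kpm_mapout_pfn()
has nothing to undo. A minimal user-space model of that calculation,
assuming 4K base pages and a made-up base address:

        #include <stdio.h>
        #include <stdint.h>

        #define MMU_PAGESHIFT   12      /* assumed 4K base page size */

        /* model of the x86 path: kpm va = kpm_vbase + ptob(pfn) */
        static uint64_t
        kpm_mapin_pfn(uint64_t kpm_vbase, uint64_t pfn)
        {
                return (kpm_vbase + (pfn << MMU_PAGESHIFT));
        }

        int
        main(void)
        {
                uint64_t vbase = 0xfffffd0000000000ULL; /* hypothetical */

                /* pfn 0x1234 maps at vbase + 0x1234000; mapout is a no-op */
                printf("%#llx\n",
                    (unsigned long long)kpm_mapin_pfn(vbase, 0x1234));
                return (0);
        }

The sun4u version below cannot be this simple: with a virtually indexed
cache it must also load a TSB entry, and it refuses to operate when
vac_colors > 1.
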
--- a/usr/src/uts/sun4u/vm/mach_kpm.c  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/sun4u/vm/mach_kpm.c  Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -58,6 +58,8 @@
 void    sfmmu_kpm_kpmp_exit(kpm_hlk_t *kpmp);
 void    sfmmu_kpm_page_cache(page_t *, int, int);
 
+extern uint_t vac_colors;
+
 /*
  * Kernel Physical Mapping (kpm) facility
  */
@@ -168,6 +170,46 @@
 }
 
 /*
+ * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
+ * memory addresses that are not described by a page_t. It can
+ * only be supported if vac_colors=1, because there is no page_t
+ * and corresponding kpm_page_t to track VAC conflicts. Currently,
+ * this may not be used on pfn's backed by page_t's, because the
+ * kpm state may not be consistent in hat_kpm_fault if the page is
+ * mapped using both this routine and hat_kpm_mapin. KPM should be
+ * cleaned up on sun4u/vac_colors=1 to be minimal as on sun4v.
+ * The caller must only pass pfn's for valid physical addresses; violation
+ * of this rule will cause panic.
+ */
+caddr_t
+hat_kpm_mapin_pfn(pfn_t pfn)
+{
+        caddr_t paddr, vaddr;
+        tte_t tte;
+        uint_t szc = kpm_smallpages ? TTE8K : TTE4M;
+        uint_t shift = kpm_smallpages ? MMU_PAGESHIFT : MMU_PAGESHIFT4M;
+
+        if (kpm_enable == 0 || vac_colors > 1 ||
+            page_numtomemseg_nolock(pfn) != NULL)
+                return ((caddr_t)NULL);
+
+        paddr = (caddr_t)ptob(pfn);
+        vaddr = (uintptr_t)kpm_vbase + paddr;
+
+        KPM_TTE_VCACHED(tte.ll, pfn, szc);
+        sfmmu_kpm_load_tsb(vaddr, &tte, shift);
+
+        return (vaddr);
+}
+
+/*ARGSUSED*/
+void
+hat_kpm_mapout_pfn(pfn_t pfn)
+{
+        /* empty */
+}
+
+/*
  * Return the kpm virtual address for the page at pp.
  * If checkswap is non zero and the page is backed by a
  * swap vnode the physical address is used rather than
@@ -279,17 +321,28 @@
         SFMMU_KPM_VTOP(vaddr, paddr);
         pfn = (pfn_t)btop(paddr);
 
-        mseg = page_numtomemseg_nolock(pfn);
-        if (mseg == NULL)
-                return (EFAULT);
+        if ((mseg = page_numtomemseg_nolock(pfn)) != NULL) {
+                pp = &mseg->pages[(pgcnt_t)(pfn - mseg->pages_base)];
+                ASSERT((pfn_t)pp->p_pagenum == pfn);
+        }
 
-        pp = &mseg->pages[(pgcnt_t)(pfn - mseg->pages_base)];
-        ASSERT((pfn_t)pp->p_pagenum == pfn);
+        /*
+         * hat_kpm_mapin_pfn may add a kpm translation for memory that falls
+         * outside of memsegs. Check for this case and provide the translation
+         * here.
+         */
+        if (vac_colors == 1 && mseg == NULL) {
+                tte_t tte;
+                uint_t szc = kpm_smallpages ? TTE8K : TTE4M;
+                uint_t shift = kpm_smallpages ? MMU_PAGESHIFT : MMU_PAGESHIFT4M;
 
-        if (!PAGE_LOCKED(pp))
-                return (EFAULT);
-
-        if (kpm_smallpages == 0)
+                ASSERT(address_in_memlist(phys_install, paddr, 1));
+                KPM_TTE_VCACHED(tte.ll, pfn, szc);
+                sfmmu_kpm_load_tsb(vaddr, &tte, shift);
+                error = 0;
+        } else if (mseg == NULL || !PAGE_LOCKED(pp))
+                error = EFAULT;
+        else if (kpm_smallpages == 0)
                 error = sfmmu_kpm_fault(vaddr, mseg, pp);
         else
                 error = sfmmu_kpm_fault_small(vaddr, mseg, pp);
@@ -522,7 +575,6 @@
         void            *base;
         size_t          size;
         struct memseg   *msp;
-        extern uint_t   vac_colors;
 
         for (msp = memsegs; msp; msp = msp->next) {
                 pbase = msp->pages_base;
--- a/usr/src/uts/sun4v/vm/mach_kpm.c  Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/sun4v/vm/mach_kpm.c  Wed Jun 17 15:32:10 2009 -0700
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident   "%Z%%M% %I%     %E% SMI"
-
 /*
  * Kernel Physical Mapping (segkpm) hat interface routines for sun4v.
  */
@@ -123,6 +121,38 @@
 }
 
 /*
+ * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
+ * memory addresses that are not described by a page_t. It can
+ * also be used for normal pages that are not locked, but beware
+ * this is dangerous - no locking is performed, so the identity of
+ * the page could change. hat_kpm_mapin_pfn is not supported when
+ * vac_colors > 1, because the chosen va depends on the page identity,
+ * which could change.
+ * The caller must only pass pfn's for valid physical addresses; violation
+ * of this rule will cause panic.
+ */
+caddr_t
+hat_kpm_mapin_pfn(pfn_t pfn)
+{
+        caddr_t paddr, vaddr;
+
+        if (kpm_enable == 0)
+                return ((caddr_t)NULL);
+
+        paddr = (caddr_t)ptob(pfn);
+        vaddr = (uintptr_t)kpm_vbase + paddr;
+
+        return ((caddr_t)vaddr);
+}
+
+/*ARGSUSED*/
+void
+hat_kpm_mapout_pfn(pfn_t pfn)
+{
+        /* empty */
+}
+
+/*
  * Return the kpm virtual address for the page at pp.
  */
 /*ARGSUSED*/