Mercurial > illumos > illumos-gate
changeset 3290:256464cbb73c
4894692 caching data in heap inflates crash dump
6499454 time to increase size of kmem default allocation caches
6499459 vm should stop checking kvp directly
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -218,6 +217,7 @@ /* Summary statistics of pages */ typedef struct memstat { struct vnode *ms_kvp; /* Cached address of kernel vnode */ + struct vnode *ms_zvp; /* Cached address of zio vnode */ uint64_t ms_kmem; /* Pages of kernel memory */ uint64_t ms_anon; /* Pages of anonymous memory */ uint64_t ms_vnode; /* Pages of named (vnode) memory */ @@ -226,6 +226,10 @@ uint64_t ms_total; /* Pages on page hash */ } memstat_t; +#define MS_PP_ISKAS(pp, stats) \ + (((pp)->p_vnode == (stats)->ms_kvp) || \ + (((stats)->ms_zvp != NULL) && ((pp)->p_vnode == (stats)->ms_zvp))) + /* * Summarize pages by type; called from page walker. */ @@ -252,7 +256,7 @@ stats->ms_cachelist++; else if (vp && IS_SWAPFSVP(vp)) stats->ms_anon++; - else if (pp->p_vnode == stats->ms_kvp) + else if (MS_PP_ISKAS(pp, stats)) stats->ms_kmem++; else if (vp && (((vp)->v_flag & VVMEXEC)) != 0) stats->ms_exec++; @@ -308,6 +312,17 @@ stats.ms_kvp = (struct vnode *)(uintptr_t)sym.st_value; + /* + * Read the zio vnode pointer. It may not exist on all kernels, so if + * it isn't found, it's not a fatal error. 
+ */ + if (mdb_lookup_by_obj(MDB_OBJ_EXEC, "zvp", + (GElf_Sym *)&sym) == -1) { + stats.ms_zvp = NULL; + } else { + stats.ms_zvp = (struct vnode *)(uintptr_t)sym.st_value; + } + /* Walk page structures, summarizing usage */ if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback, &stats) == -1) {
--- a/usr/src/uts/common/cpr/cpr_dump.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/cpr/cpr_dump.c Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -790,11 +789,11 @@ do { #if defined(__sparc) extern struct vnode prom_ppages; - if (pp->p_vnode == NULL || pp->p_vnode == &kvp || + if (pp->p_vnode == NULL || PP_ISKAS(pp) || pp->p_vnode == &prom_ppages || PP_ISFREE(pp) && PP_ISAGED(pp)) #else - if (pp->p_vnode == NULL || pp->p_vnode == &kvp || + if (pp->p_vnode == NULL || PP_ISKAS(pp) || PP_ISFREE(pp) && PP_ISAGED(pp)) #endif /* __sparc */ continue;
--- a/usr/src/uts/common/fs/fsflush.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/fsflush.c Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,7 +23,7 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -232,7 +231,7 @@ coal_page = NULL; } - if (pp->p_vnode == &kvp || + if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) @@ -255,7 +254,7 @@ if (PP_ISSWAP(pp) || PP_ISFREE(pp) || vp == NULL || - vp == &kvp || + PP_ISKAS(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || (vp->v_flag & VISSWAP) != 0) {
--- a/usr/src/uts/common/fs/lofs/lofs_vnops.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/lofs/lofs_vnops.c Tue Dec 19 23:13:06 2006 -0800 @@ -1048,7 +1048,7 @@ lo_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr) { vp = realvp(vp); - if (vp != NULL && vp != &kvp) + if (vp != NULL && !VN_ISKAS(vp)) VOP_DISPOSE(vp, pp, fl, dn, cr); }
--- a/usr/src/uts/common/fs/zfs/arc.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/arc.c Tue Dec 19 23:13:06 2006 -0800 @@ -230,10 +230,6 @@ }; struct arc_buf_hdr { - /* immutable */ - uint64_t b_size; - spa_t *b_spa; - /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; @@ -247,8 +243,13 @@ uint32_t b_flags; uint32_t b_datacnt; + arc_callback_t *b_acb; kcondvar_t b_cv; - arc_callback_t *b_acb; + + /* immutable */ + arc_buf_contents_t b_type; + uint64_t b_size; + spa_t *b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -746,7 +747,7 @@ } arc_buf_t * -arc_buf_alloc(spa_t *spa, int size, void *tag) +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; @@ -755,6 +756,7 @@ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; + hdr->b_type = type; hdr->b_spa = spa; hdr->b_state = arc.anon; hdr->b_arc_access = 0; @@ -839,10 +841,16 @@ if (buf->b_data) { arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); if (!recycle) { - zio_buf_free(buf->b_data, size); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf->b_data, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf->b_data, size); + } atomic_add_64(&arc.size, -size); } if (list_link_active(&buf->b_hdr->b_arc_node)) { @@ -1003,7 +1011,8 @@ * new buffer in a full arc cache. 
*/ static void * -arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle) +arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, + arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; @@ -1041,7 +1050,8 @@ arc_buf_t *buf = ab->b_buf; if (buf->b_data) { bytes_evicted += ab->b_size; - if (recycle && ab->b_size == bytes) { + if (recycle && ab->b_type == type && + ab->b_size == bytes) { stolen = buf->b_data; recycle = FALSE; } @@ -1147,7 +1157,7 @@ if (top_sz > arc.p && arc.mru->lsize > 0) { int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p); - (void) arc_evict(arc.mru, toevict, FALSE); + (void) arc_evict(arc.mru, toevict, FALSE, ARC_BUFC_UNDEF); top_sz = arc.anon->size + arc.mru->size; } @@ -1165,7 +1175,8 @@ if (arc.mfu->lsize > 0) { int64_t toevict = MIN(arc.mfu->lsize, arc_over); - (void) arc_evict(arc.mfu, toevict, FALSE); + (void) arc_evict(arc.mfu, toevict, FALSE, + ARC_BUFC_UNDEF); } tbl_over = arc.size + arc.mru_ghost->lsize + @@ -1207,9 +1218,9 @@ arc_flush(void) { while (list_head(&arc.mru->list)) - (void) arc_evict(arc.mru, -1, FALSE); + (void) arc_evict(arc.mru, -1, FALSE, ARC_BUFC_UNDEF); while (list_head(&arc.mfu->list)) - (void) arc_evict(arc.mfu, -1, FALSE); + (void) arc_evict(arc.mfu, -1, FALSE, ARC_BUFC_UNDEF); arc_evict_ghost(arc.mru_ghost, -1); arc_evict_ghost(arc.mfu_ghost, -1); @@ -1315,7 +1326,9 @@ { size_t i; kmem_cache_t *prev_cache = NULL; + kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; + extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL /* @@ -1344,6 +1357,10 @@ prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } + if (zio_data_buf_cache[i] != prev_data_cache) { + prev_data_cache = zio_data_buf_cache[i]; + kmem_cache_reap_now(zio_data_buf_cache[i]); + } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_cache); @@ -1498,8 +1515,9 @@ static void arc_get_data_buf(arc_buf_t *buf) { - arc_state_t *state = 
buf->b_hdr->b_state; - uint64_t size = buf->b_hdr->b_size; + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; arc_adapt(size, state); @@ -1508,7 +1526,12 @@ * just allocate a new buffer. */ if (!arc_evict_needed()) { - buf->b_data = zio_buf_alloc(size); + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + } atomic_add_64(&arc.size, size); goto out; } @@ -1530,8 +1553,13 @@ uint64_t mfu_space = arc.c - arc.p; state = (mfu_space > arc.mfu->size) ? arc.mru : arc.mfu; } - if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) { - buf->b_data = zio_buf_alloc(size); + if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + } atomic_add_64(&arc.size, size); atomic_add_64(&arc.recycle_miss, 1); } @@ -1916,8 +1944,8 @@ if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists; - - buf = arc_buf_alloc(spa, size, private); + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = bp->blk_birth; @@ -2177,6 +2205,7 @@ arc_buf_t **bufp; uint64_t blksz = hdr->b_size; spa_t *spa = hdr->b_spa; + arc_buf_contents_t type = hdr->b_type; ASSERT(hdr->b_datacnt > 1); /* @@ -2202,6 +2231,7 @@ nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); nhdr->b_size = blksz; nhdr->b_spa = spa; + nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc.anon; nhdr->b_arc_access = 0;
--- a/usr/src/uts/common/fs/zfs/dbuf.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Dec 19 23:13:06 2006 -0800 @@ -504,9 +504,11 @@ dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); if (bp == NULL || BP_IS_HOLE(bp)) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + ASSERT(bp == NULL || BP_IS_HOLE(bp)); dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db)); + db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; @@ -615,10 +617,12 @@ while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db)); + db->db.db_size, db, type)); db->db_state = DB_FILL; } else { ASSERT3U(db->db_state, ==, DB_CACHED); @@ -643,6 +647,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { arc_buf_t **quiescing, **syncing; + arc_buf_contents_t type; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); @@ -665,8 +670,9 @@ ASSERT(*syncing != db->db_buf); if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; + type = DBUF_GET_BUFC_TYPE(db); *quiescing = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db); + db->db_dnode->dn_objset->os_spa, size, db, type); bcopy(db->db.db_data, (*quiescing)->b_data, size); } else { dbuf_set_data(db, NULL); @@ -685,10 +691,11 @@ ASSERT3U(db->db_dirtycnt, ==, 1); if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; + type = DBUF_GET_BUFC_TYPE(db); /* we can't copy if we have already started a write */ ASSERT(*syncing != db->db_data_pending); *syncing = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db); + db->db_dnode->dn_objset->os_spa, size, db, type); bcopy(db->db.db_data, 
(*syncing)->b_data, size); } else { dbuf_set_data(db, NULL); @@ -860,6 +867,7 @@ { arc_buf_t *buf, *obuf; int osize = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -879,7 +887,7 @@ dbuf_will_dirty(db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db); + buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1588,9 +1596,10 @@ db->db_data_pending == db->db_buf) { int size = (db->db_blkid == DB_BONUS_BLKID) ? DN_MAX_BONUSLEN : db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - size, db)); + size, db, type)); bcopy(db->db_data_pending->b_data, db->db.db_data, db->db.db_size); } @@ -1766,6 +1775,7 @@ int checksum, compress; zbookmark_t zb; int blksz; + arc_buf_contents_t type; ASSERT(dmu_tx_is_syncing(tx)); @@ -1823,6 +1833,7 @@ } if (db->db_level == 0) { + type = DBUF_GET_BUFC_TYPE(db); data = &db->db_d.db_data_old[txg&TXG_MASK]; blksz = arc_buf_size(*data); @@ -1849,7 +1860,8 @@ db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) { if (refcount_count(&db->db_holds) > 1 && *data == db->db_buf) { - *data = arc_buf_alloc(os->os_spa, blksz, db); + *data = arc_buf_alloc(os->os_spa, blksz, db, + type); bcopy(db->db.db_data, (*data)->b_data, blksz); } db->db_data_pending = *data;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Dec 19 23:13:06 2006 -0800 @@ -737,7 +737,8 @@ int err; zbookmark_t zb; arc_buf_t *abuf = - arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG); + arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG, + ARC_BUFC_METADATA); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(os->os_synctx == NULL);
--- a/usr/src/uts/common/fs/zfs/spa.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/spa.c Tue Dec 19 23:13:06 2006 -0800 @@ -2133,7 +2133,7 @@ { spa_t *spa = zio->io_spa; - zio_buf_free(zio->io_data, zio->io_size); + zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2155,7 +2155,7 @@ zbookmark_t *zb) { size_t size = BP_GET_LSIZE(bp); - void *data = zio_buf_alloc(size); + void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight++;
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/arc.h Tue Dec 19 23:13:06 2006 -0800 @@ -54,6 +54,11 @@ void *b_private; }; +typedef enum arc_buf_contents { + ARC_BUFC_UNDEF, /* buffer contents undefined */ + ARC_BUFC_DATA, /* buffer contains data */ + ARC_BUFC_METADATA /* buffer contains metadata */ +} arc_buf_contents_t; /* * These are the flags we pass into calls to the arc */ @@ -62,7 +67,8 @@ #define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ #define ARC_CACHED (1 << 4) /* I/O was already in cache */ -arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag); +arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, + arc_buf_contents_t type); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Dec 19 23:13:06 2006 -0800 @@ -254,6 +254,17 @@ void dbuf_init(void); void dbuf_fini(void); +/* + * Classify the contents of a dbuf for the ARC: indirect blocks (db_level > 0) + * and objects whose dmu_ot[] entry has ot_metadata set are metadata; anything + * else is data. Expands to a plain expression (no trailing semicolon) so it + * can be used inside larger expressions and argument lists. + */ +#define DBUF_GET_BUFC_TYPE(db) \ + ((((db)->db_level > 0) || \ + (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA) + #ifdef ZFS_DEBUG /*
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Tue Dec 19 23:13:06 2006 -0800 @@ -306,6 +306,14 @@ #include <sys/dmu.h> +/* + * Classify the contents of the block a blkptr refers to: indirect blocks + * (level > 0) and object types flagged ot_metadata in dmu_ot[] are metadata; + * anything else is data. Expands to a plain expression (no trailing semicolon). + */ +#define BP_GET_BUFC_TYPE(bp) \ + (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA) /* * Routines found in spa.c */
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zio.h Tue Dec 19 23:13:06 2006 -0800 @@ -295,6 +295,8 @@ extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); +extern void *zio_data_buf_alloc(size_t size); +extern void zio_data_buf_free(void *buf, size_t size); /* * Move an I/O to the next stage of the pipeline and execute that stage.
--- a/usr/src/uts/common/fs/zfs/zio.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/zio.c Tue Dec 19 23:13:06 2006 -0800 @@ -82,11 +82,21 @@ * ========================================================================== */ kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; + +#ifdef _KERNEL +extern vmem_t *zio_alloc_arena; +#endif void zio_init(void) { size_t c; + vmem_t *data_alloc_arena = NULL; + +#ifdef _KERNEL + data_alloc_arena = zio_alloc_arena; +#endif /* * For small buffers, we want a cache for each multiple of @@ -111,10 +121,16 @@ } if (align != 0) { - char name[30]; + char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); + + (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, data_alloc_arena, + KMC_NODEBUG); + dprintf("creating cache for size %5lx align %5lx\n", size, align); } @@ -124,6 +140,10 @@ ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; + + ASSERT(zio_data_buf_cache[c] != NULL); + if (zio_data_buf_cache[c - 1] == NULL) + zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); @@ -134,6 +154,7 @@ { size_t c; kmem_cache_t *last_cache = NULL; + kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { if (zio_buf_cache[c] != last_cache) { @@ -141,6 +162,12 @@ kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; + + if (zio_data_buf_cache[c] != last_data_cache) { + last_data_cache = zio_data_buf_cache[c]; + kmem_cache_destroy(zio_data_buf_cache[c]); + } + zio_data_buf_cache[c] = NULL; } zio_inject_fini(); @@ -151,6 +178,13 @@ * Allocate and free I/O buffers * 
========================================================================== */ + +/* + * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a + * crashdump if the kernel panics, so use it judiciously. Obviously, it's + * useful to inspect ZFS metadata, but if possible, we should avoid keeping + * excess / transient data in-core during a crashdump. + */ void * zio_buf_alloc(size_t size) { @@ -161,6 +195,22 @@ return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); } +/* + * Use zio_data_buf_alloc to allocate data. The data will not appear in a + * crashdump if the kernel panics. This exists so that we will limit the amount + * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount + * of kernel heap dumped to disk when the kernel panics) + */ +void * +zio_data_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); +} + void zio_buf_free(void *buf, size_t size) { @@ -171,6 +221,15 @@ kmem_cache_free(zio_buf_cache[c], buf); } +void +zio_data_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_data_buf_cache[c], buf); +} /* * ========================================================================== * Push and pop I/O transform buffers
--- a/usr/src/uts/common/os/kmem.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/os/kmem.c Tue Dec 19 23:13:06 2006 -0800 @@ -154,9 +154,11 @@ P2ALIGN(8192 / 1, 64), 4096 * 3, 8192 * 2, + 8192 * 3, + 8192 * 4, }; -#define KMEM_MAXBUF 16384 +#define KMEM_MAXBUF 32768 static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
--- a/usr/src/uts/common/os/mem_cage.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/os/mem_cage.c Tue Dec 19 23:13:06 2006 -0800 @@ -1308,7 +1308,7 @@ * non-swapfs (i.e. anonymous memory) file system pages. */ ASSERT(rootpp->p_vnode != NULL && - rootpp->p_vnode != &kvp && + !PP_ISKAS(rootpp) && !IS_SWAPFSVP(rootpp->p_vnode)); PP_SETNORELOC(rootpp); return (1); @@ -1783,7 +1783,7 @@ continue; } - if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) || + if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) || !page_trylock(pp, SE_EXCL)) { KCAGE_STAT_INCR_SCAN(kt_cantlock); continue; @@ -1791,7 +1791,7 @@ /* P_NORELOC bit should not have gone away. */ ASSERT(PP_ISNORELOC(pp)); - if (PP_ISFREE(pp) || (pp->p_vnode == &kvp && + if (PP_ISFREE(pp) || (PP_ISKAS(pp) && pp->p_lckcnt > 0)) { page_unlock(pp); continue;
--- a/usr/src/uts/common/os/mem_config.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/os/mem_config.c Tue Dec 19 23:13:06 2006 -0800 @@ -1923,7 +1923,7 @@ * Unload the mappings and check if mod bit * is set. */ - ASSERT(pp->p_vnode != &kvp); + ASSERT(!PP_ISKAS(pp)); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); mod = hat_ismod(pp);
--- a/usr/src/uts/common/os/vm_pageout.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/os/vm_pageout.c Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -950,8 +949,7 @@ * NOTE: These optimizations assume that reads are atomic. */ top: - if ((pp->p_vnode == &kvp) || - (PP_ISFREE(pp)) || + if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) || (hat_page_getshare(pp) > po_share) || PAGE_LOCKED(pp)) { return (-1); }
--- a/usr/src/uts/common/sys/vnode.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/sys/vnode.h Tue Dec 19 23:13:06 2006 -0800 @@ -960,6 +960,11 @@ ((VP1) && (VP2) && (vn_getops(VP1) == vn_getops(VP2)) ? \ VOP_CMP(VP1, VP2) : 0)) +extern struct vnode kvp; +extern struct vnode zvp; + +#define VN_ISKAS(vp) ((vp) == &kvp || (vp) == &zvp) + #endif /* _KERNEL */ /* @@ -1001,7 +1006,7 @@ */ #define VN_DISPOSE(pp, flag, dn, cr) { \ extern struct vnode kvp; \ - if ((pp)->p_vnode != NULL && (pp)->p_vnode != &kvp) \ + if ((pp)->p_vnode != NULL && !VN_ISKAS((pp)->p_vnode)) \ VOP_DISPOSE((pp)->p_vnode, (pp), (flag), (dn), (cr)); \ else if ((flag) == B_FREE) \ page_free((pp), (dn)); \
--- a/usr/src/uts/common/vm/page.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/page.h Tue Dec 19 23:13:06 2006 -0800 @@ -877,8 +877,9 @@ #define PP_ISAGED(pp) (((pp)->p_state & P_FREE) && \ ((pp)->p_vnode == NULL)) #define PP_ISNORELOC(pp) ((pp)->p_state & P_NORELOC) -#define PP_ISKVP(pp) ((pp)->p_vnode == &kvp) -#define PP_ISNORELOCKERNEL(pp) (PP_ISNORELOC(pp) && PP_ISKVP(pp)) +#define PP_ISKAS(pp) (((pp)->p_vnode == &kvp) || \ + ((pp)->p_vnode == &zvp)) +#define PP_ISNORELOCKERNEL(pp) (PP_ISNORELOC(pp) && PP_ISKAS(pp)) #define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE) #define PP_ISSWAP(pp) ((pp)->p_state & P_SWAP) @@ -956,7 +957,7 @@ #define PP_PR_REQ(pp) (((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp)) #define PP_PR_NOSHARE(pp) \ ((((pp)->p_toxic & (PR_RETIRED | PR_FMA | PR_UE)) == PR_FMA) && \ - !PP_ISKVP(pp)) + !PP_ISKAS(pp)) /* * Flags for page_unretire_pp
--- a/usr/src/uts/common/vm/page_lock.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/page_lock.c Tue Dec 19 23:13:06 2006 -0800 @@ -142,6 +142,12 @@ extern struct vnode kvp; +/* + * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes. + * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is + * VPH_TABLE_SIZE + 1. + */ + kmutex_t vph_mutex[VPH_TABLE_SIZE + 2]; /* @@ -861,6 +867,9 @@ { if (vp == &kvp) return (&vph_mutex[VPH_TABLE_SIZE + 0]); + + if (vp == &zvp) + return (&vph_mutex[VPH_TABLE_SIZE + 1]); #ifdef DEBUG if (page_vnode_mutex_stress != 0) return (&vph_mutex[0]); @@ -913,7 +922,7 @@ ASSERT(!PP_ISFREE(pp)); ASSERT(pp->p_vnode != NULL); ASSERT(!IS_SWAPFSVP(pp->p_vnode)); - ASSERT(pp->p_vnode != &kvp); + ASSERT(!PP_ISKAS(pp)); again: if (pszc == 0) {
--- a/usr/src/uts/common/vm/page_retire.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/page_retire.c Tue Dec 19 23:13:06 2006 -0800 @@ -355,7 +355,7 @@ int whichtype = 0; \ if (pp->p_vnode) \ whichtype |= PRT_NAMED; \ - if (PP_ISKVP(pp)) \ + if (PP_ISKAS(pp)) \ whichtype |= PRT_KERNEL; \ if (PP_ISFREE(pp)) \ whichtype |= PRT_FREE; \ @@ -882,7 +882,7 @@ page_retire_thread_cb(page_t *pp) { PR_DEBUG(prd_tctop); - if (!PP_ISKVP(pp) && page_trylock(pp, SE_EXCL)) { + if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) { PR_DEBUG(prd_tclocked); page_unlock(pp); } @@ -901,7 +901,7 @@ * Don't scrub the kernel, since we might still need it, unless * we have UEs on the page, in which case we have nothing to lose. */ - if (!PP_ISKVP(pp) || PP_TOXIC(pp)) { + if (!PP_ISKAS(pp) || PP_TOXIC(pp)) { pp->p_selock = -1; /* pacify ASSERTs */ PP_CLRFREE(pp); pagescrub(pp, 0, PAGESIZE);
--- a/usr/src/uts/common/vm/seg_kmem.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/seg_kmem.c Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -103,6 +102,7 @@ char *ekernelheap; /* end of primary kernel heap */ struct seg kvseg; /* primary kernel heap segment */ struct seg kvseg_core; /* "core" kernel heap segment */ +struct seg kzioseg; /* Segment for zio mappings */ vmem_t *heap_arena; /* primary kernel heap arena */ vmem_t *heap_core_arena; /* core kernel heap arena */ char *heap_core_base; /* start of core kernel heap arena */ @@ -114,9 +114,12 @@ vmem_t *heaptext_arena; /* heaptext arena */ struct as kas; /* kernel address space */ struct vnode kvp; /* vnode for all segkmem pages */ +struct vnode zvp; /* vnode for zfs pages */ int segkmem_reloc; /* enable/disable relocatable segkmem pages */ vmem_t *static_arena; /* arena for caches to import static memory */ vmem_t *static_alloc_arena; /* arena for allocating static memory */ +vmem_t *zio_arena = NULL; /* arena for allocating zio memory */ +vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */ /* * seg_kmem driver can map part of the kernel heap with large pages. 
@@ -427,6 +430,7 @@ pgcnt_t npages; spgcnt_t pg; page_t *pp; + struct vnode *vp = seg->s_data; ASSERT(RW_READ_HELD(&seg->s_as->a_lock)); @@ -451,7 +455,7 @@ switch (type) { case F_SOFTLOCK: /* lock down already-loaded translations */ for (pg = 0; pg < npages; pg++) { - pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, + pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_SHARED); if (pp == NULL) { /* @@ -461,7 +465,7 @@ if (!hat_probe(kas.a_hat, addr)) { addr -= PAGESIZE; while (--pg >= 0) { - pp = page_find(&kvp, + pp = page_find(vp, (u_offset_t)(uintptr_t)addr); if (pp) page_unlock(pp); @@ -477,7 +481,7 @@ return (0); case F_SOFTUNLOCK: while (npages--) { - pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr); + pp = page_find(vp, (u_offset_t)(uintptr_t)addr); if (pp) page_unlock(pp); addr += PAGESIZE; @@ -645,6 +649,13 @@ segkmem_dump_range, seg->s_as); vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, segkmem_dump_range, seg->s_as); + } else if (seg == &kzioseg) { + /* + * We don't want to dump pages attached to kzioseg since they + * contain file data from ZFS. If this page's segment is + * kzioseg return instead of writing it to the dump device. 
+ */ + return; } else { segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size); } @@ -666,6 +677,7 @@ pgcnt_t npages; spgcnt_t pg; size_t nb; + struct vnode *vp = seg->s_data; ASSERT(ppp != NULL); @@ -706,7 +718,7 @@ } for (pg = 0; pg < npages; pg++) { - pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_SHARED); + pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_SHARED); if (pp == NULL) { while (--pg >= 0) page_unlock(pplist[pg]); @@ -791,11 +803,21 @@ }; int +segkmem_zio_create(struct seg *seg) +{ + ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); + seg->s_ops = &segkmem_ops; + seg->s_data = &zvp; + kas.a_size += seg->s_size; + return (0); +} + +int segkmem_create(struct seg *seg) { ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); seg->s_ops = &segkmem_ops; - seg->s_data = NULL; + seg->s_data = &kvp; kas.a_size += seg->s_size; return (0); } @@ -806,6 +828,10 @@ { struct seg kseg; int pgflags; + struct vnode *vp = arg; + + if (vp == NULL) + vp = &kvp; kseg.s_as = &kas; pgflags = PG_EXCL; @@ -819,7 +845,7 @@ if (vmflag & VM_PUSHPAGE) pgflags |= PG_PUSHPAGE; - return (page_create_va(&kvp, (u_offset_t)(uintptr_t)addr, size, + return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size, pgflags, &kseg, addr)); } @@ -897,12 +923,14 @@ return (addr); } -void * -segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) +static void * +segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp) { void *addr; segkmem_gc_list_t *gcp, **prev_gcpp; + ASSERT(vp != NULL); + if (kvseg.s_base == NULL) { #ifndef __sparc if (bootops->bsys_alloc == NULL) @@ -928,7 +956,19 @@ return (addr); } return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, - segkmem_page_create, NULL)); + segkmem_page_create, vp)); +} + +void * +segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + return (segkmem_alloc_vn(vmp, size, vmflag, &kvp)); +} + +void * +segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + return (segkmem_alloc_vn(vmp, size, vmflag, &zvp)); } /* 
@@ -937,8 +977,8 @@ * we currently don't have a special kernel segment for non-paged * kernel memory that is exported by drivers to user space. */ -void -segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +static void +segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp) { page_t *pp; caddr_t addr = inaddr; @@ -946,6 +986,7 @@ pgcnt_t npages = btopr(size); ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + ASSERT(vp != NULL); if (kvseg.s_base == NULL) { segkmem_gc_list_t *gc = inaddr; @@ -960,7 +1001,7 @@ for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { #if defined(__x86) - pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr); + pp = page_find(vp, (u_offset_t)(uintptr_t)addr); if (pp == NULL) panic("segkmem_free: page not found"); if (!page_tryupgrade(pp)) { @@ -969,11 +1010,11 @@ * it to drop the lock so we can free this page. */ page_unlock(pp); - pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, + pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL); } #else - pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); + pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL); #endif if (pp == NULL) panic("segkmem_free: page not found"); @@ -985,6 +1026,19 @@ if (vmp != NULL) vmem_free(vmp, inaddr, size); + +} + +void +segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +{ + segkmem_free_vn(vmp, inaddr, size, &kvp); +} + +void +segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size) +{ + segkmem_free_vn(vmp, inaddr, size, &zvp); } void @@ -1441,6 +1495,22 @@ return (use_large_pages); } +void +segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size) +{ + ASSERT(zio_mem_base != NULL); + ASSERT(zio_mem_size != 0); + + zio_arena = vmem_create("zio", zio_mem_base, zio_mem_size, PAGESIZE, + NULL, NULL, NULL, 0, VM_SLEEP); + + zio_alloc_arena = vmem_create("zio_buf", NULL, 0, PAGESIZE, + segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP); + + ASSERT(zio_arena != NULL); + ASSERT(zio_alloc_arena != NULL); +} + #ifdef 
__sparc
--- a/usr/src/uts/common/vm/seg_kmem.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/seg_kmem.h Tue Dec 19 23:13:06 2006 -0800 @@ -51,6 +51,7 @@ extern char *heap_lp_end; /* end of kernel large page heap arena */ extern struct seg kvseg; /* primary kernel heap segment */ extern struct seg kvseg_core; /* "core" kernel heap segment */ +extern struct seg kzioseg; /* Segment for zio mappings */ extern vmem_t *heap_lp_arena; /* kernel large page heap arena */ extern vmem_t *heap_arena; /* primary kernel heap arena */ extern vmem_t *hat_memload_arena; /* HAT translation arena */ @@ -59,9 +60,12 @@ extern vmem_t *heaptext_arena; /* kernel text arena, from heap */ extern struct as kas; /* kernel address space */ extern struct vnode kvp; /* vnode for all segkmem pages */ +extern struct vnode zvp; /* vnode for all segkmem pages for zfs */ extern int segkmem_reloc; /* enable/disable segkmem relocatable pages */ extern vmem_t *static_arena; /* arena for caches to import static memory */ extern vmem_t *static_alloc_arena; /* arena for allocating static memory */ +extern vmem_t *zio_arena; /* arena for zio caches */ +extern vmem_t *zio_alloc_arena; /* arena for allocating zio memory */ extern int segkmem_create(struct seg *); extern page_t *segkmem_page_create(void *, size_t, int, void *); @@ -77,6 +81,11 @@ extern void kernelheap_extend(void *, void *); extern void segkmem_gc(void); +extern void *segkmem_zio_alloc(vmem_t *, size_t, int); +extern int segkmem_zio_create(struct seg *); +extern void segkmem_zio_free(vmem_t *, void *, size_t); +extern void segkmem_zio_init(void *, size_t); + /* * Flags for segkmem_xalloc(). *
--- a/usr/src/uts/common/vm/seg_vn.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/seg_vn.c Tue Dec 19 23:13:06 2006 -0800 @@ -408,7 +408,7 @@ a->szc = 0; } else if (a->vp != NULL) { extern struct vnode kvp; - if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { + if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { /* * paranoid check. * hat_page_demote() is not supported @@ -5537,7 +5537,7 @@ /* paranoid check */ if (svd->vp != NULL && - (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { + (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { return (EINVAL); }
--- a/usr/src/uts/common/vm/vm_page.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/vm_page.c Tue Dec 19 23:13:06 2006 -0800 @@ -1035,7 +1035,7 @@ ASSERT(szc != 0); ASSERT(vp != NULL); ASSERT(!IS_SWAPFSVP(vp)); - ASSERT(vp != &kvp); + ASSERT(!VN_ISKAS(vp)); again: if (++loopcnt > 3) { @@ -2704,7 +2704,7 @@ if (pp->p_szc != 0) { if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || - pp->p_vnode == &kvp) { + PP_ISKAS(pp)) { panic("page_free: anon or kernel " "or no vnode large page %p", (void *)pp); } @@ -3153,7 +3153,7 @@ if (pp->p_szc != 0) { if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || - pp->p_vnode == &kvp) { + PP_ISKAS(pp)) { panic("page_destroy: anon or kernel or no vnode " "large page %p", (void *)pp); } @@ -3332,7 +3332,7 @@ vnode_t *ovp = opp->p_vnode; ASSERT(ovp != NULL); ASSERT(!IS_SWAPFSVP(ovp)); - ASSERT(ovp != &kvp); + ASSERT(!VN_ISKAS(ovp)); page_demote_vp_pages(opp); ASSERT(opp->p_szc == 0); } @@ -3399,14 +3399,14 @@ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); if (pp->p_szc != 0) { ASSERT(!IS_SWAPFSVP(vp)); - ASSERT(vp != &kvp); + ASSERT(!VN_ISKAS(vp)); page_demote_vp_pages(pp); ASSERT(pp->p_szc == 0); } mutex_enter(phm); } else if (pp->p_szc != 0) { ASSERT(!IS_SWAPFSVP(vp)); - ASSERT(vp != &kvp); + ASSERT(!VN_ISKAS(vp)); mutex_exit(phm); page_demote_vp_pages(pp); ASSERT(pp->p_szc == 0); @@ -4378,7 +4378,7 @@ * (g) Backed by a filesystem which doesn't have a * stubbed-out sync operation */ - if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && + if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) && hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && vfs_can_sync(vp->v_vfsp)) { nppbusy++; @@ -4457,10 +4457,10 @@ * with the kernel vnode or prom allocated kernel mem. 
*/ #if defined(__sparc) - if ((vp = pp->p_vnode) == NULL || vp == &kvp || + if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp) || vp == &prom_ppages) #else /* x86 doesn't have prom or prom_ppage */ - if ((vp = pp->p_vnode) == NULL || vp == &kvp) + if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp)) #endif /* __sparc */ continue; @@ -4747,7 +4747,7 @@ } if (pp->p_szc != pszc) { ASSERT(pp->p_szc < pszc); - ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp && + ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) && !IS_SWAPFSVP(pp->p_vnode)); tpp = pp + 1; for (i = 1; i < npgs; i++, tpp++) { @@ -4879,7 +4879,7 @@ * seg kmem pages require that the target and replacement * page be the same pagesize. */ - flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0; + flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0; repl = page_get_replacement_page(targ, lgrp, flags); if (repl == NULL) { if (grouplock != 0) { @@ -4900,7 +4900,7 @@ /* * Let hat_page_relocate() complete the relocation if it's kernel page */ - if (targ->p_vnode == &kvp) { + if (VN_ISKAS(targ->p_vnode)) { *replacement = repl; if (hat_page_relocate(target, replacement, nrelocp) != 0) { if (grouplock != 0) { @@ -5244,7 +5244,7 @@ return (1); } - if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { + if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) { VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); page_demote_vp_pages(pp); ASSERT(pp->p_szc == 0); @@ -5269,7 +5269,7 @@ * We can't demote kernel pages since we can't hat_unload() * the mappings. 
*/ - if (rootpp->p_vnode == &kvp) + if (VN_ISKAS(rootpp->p_vnode)) return (0); /* @@ -5393,7 +5393,7 @@ ASSERT(!PP_ISFREE(pp)); ASSERT(pp->p_vnode != NULL); ASSERT(!IS_SWAPFSVP(pp->p_vnode)); - ASSERT(pp->p_vnode != &kvp); + ASSERT(!PP_ISKAS(pp)); VM_STAT_ADD(pagecnt.pc_demote_pages[0]); @@ -6850,7 +6850,7 @@ ret = EAGAIN; goto cleanup; } - if (PP_ISKVP(pp)) { + if (PP_ISKAS(pp)) { ret = EAGAIN; goto cleanup; } @@ -6932,7 +6932,7 @@ return (EPERM); } #else - if (PP_ISKVP(pp)) { + if (PP_ISKAS(pp)) { return (EPERM); } #endif /* __sparc */ @@ -7344,7 +7344,7 @@ bp = page_capture_hash[i].lists[j].next; while (bp != &page_capture_hash[i].lists[j]) { pp = bp->pp; - if (!PP_ISKVP(pp) && PP_TOXIC(pp)) { + if (!PP_ISKAS(pp) && PP_TOXIC(pp)) { pp->p_selock = -1; /* pacify ASSERTs */ PP_CLRFREE(pp); pagescrub(pp, 0, PAGESIZE);
--- a/usr/src/uts/common/vm/vm_pagelist.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/common/vm/vm_pagelist.c Tue Dec 19 23:13:06 2006 -0800 @@ -3909,7 +3909,7 @@ * pages, since we cannot properly handle demotion of kernel * pages. */ - if (like_pp->p_vnode == &kvp) + if (PP_ISKAS(like_pp)) pgrflags |= PGR_SAMESZC; /* LINTED */
--- a/usr/src/uts/i86pc/os/startup.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/i86pc/os/startup.c Tue Dec 19 23:13:06 2006 -0800 @@ -209,6 +209,19 @@ char kern_bootargs[OBP_MAXPATHLEN]; /* + * ZFS zio segment. This allows us to exclude large portions of ZFS data that + * gets cached in kmem caches on the heap. If this is set to zero, we allocate + * zio buffers from their own segment, otherwise they are allocated from the + * heap. The optimization of allocating zio buffers from their own segment is + * only valid on 64-bit kernels. + */ +#if defined(__amd64) +int segzio_fromheap = 0; +#else +int segzio_fromheap = 1; +#endif + +/* * new memory fragmentations are possible in startup() due to BOP_ALLOCs. this * depends on number of BOP_ALLOC calls made and requested size, memory size * combination and whether boot.bin memory needs to be freed. @@ -239,11 +252,13 @@ #endif caddr_t segkp_base; /* Base address of segkp */ +caddr_t segzio_base; /* Base address of segzio */ #if defined(__amd64) pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */ #else pgcnt_t segkpsize = 0; #endif +pgcnt_t segziosize = 0; /* size of zio segment in pages */ struct memseg *memseg_base; struct vnode unused_pages_vp; @@ -362,6 +377,8 @@ * 0xFFFFFXXX.XXX00000 |-----------------------|- segkmap_start (floating) * | device mappings | * 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating) + * | segzio | + * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating) * | segkp | * --- |-----------------------|- segkp_base * | segkpm | @@ -1566,6 +1583,29 @@ PRM_DEBUG(final_kernelheap); } + if (!segzio_fromheap) { + size_t size; + + /* size is in bytes, segziosize is in pages */ + if (segziosize == 0) { + size = mmu_ptob(physmem * 2); + } else { + size = mmu_ptob(segziosize); + } + + if (size < SEGZIOMINSIZE) { + size = SEGZIOMINSIZE; + } else if (size > mmu_ptob(physmem * 4)) { + size = mmu_ptob(physmem * 4); + } + segziosize = 
mmu_btop(ROUND_UP_LPAGE(size)); + segzio_base = final_kernelheap; + PRM_DEBUG(segziosize); + PRM_DEBUG(segzio_base); + final_kernelheap = segzio_base + mmu_ptob(segziosize); + PRM_DEBUG(final_kernelheap); + } + /* * put the range of VA for device mappings next */ @@ -2377,6 +2417,16 @@ #if defined(__amd64) (void) seg_attach(&kas, (caddr_t)core_base, core_size, &kvseg_core); (void) segkmem_create(&kvseg_core); + + /* segzio optimization is only valid for 64-bit kernels */ + if (!segzio_fromheap) { + (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize), + &kzioseg); + (void) segkmem_zio_create(&kzioseg); + + /* create zio area covering new segment */ + segkmem_zio_init(segzio_base, mmu_ptob(segziosize)); + } #endif (void) seg_attach(&kas, (caddr_t)SEGDEBUGBASE, (size_t)SEGDEBUGSIZE,
--- a/usr/src/uts/i86pc/sys/machparam.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/i86pc/sys/machparam.h Tue Dec 19 23:13:06 2006 -0800 @@ -167,6 +167,11 @@ #define SEGKPMINSIZE (200L * 1024 * 1024L) /* 200M */ /* + * minimum size for segzio + */ +#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ + +/* * Boot (or, more precisely, vmx) maps most pages twice - once in the * bottom 2GB of memory and once in the bottom 2GB of the topmost 4GB. * When boot is unmapped this range is available to the kernel, but until
--- a/usr/src/uts/i86pc/vm/vm_dep.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/i86pc/vm/vm_dep.h Tue Dec 19 23:13:06 2006 -0800 @@ -406,7 +406,7 @@ */ #define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) { \ - if (restricted_kmemalloc && (vp) == &kvp && \ + if (restricted_kmemalloc && VN_ISKAS(vp) && \ (caddr_t)(vaddr) >= kernelheap && \ (caddr_t)(vaddr) < ekernelheap) { \ ASSERT(physmax4g); \
--- a/usr/src/uts/i86pc/vm/vm_machdep.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/i86pc/vm/vm_machdep.c Tue Dec 19 23:13:06 2006 -0800 @@ -1920,8 +1920,8 @@ * with kernel vnode 'kvp'. */ /* XX64 - to debug why this happens! */ - ASSERT(vp != &kvp); - if (vp == &kvp) + ASSERT(!VN_ISKAS(vp)); + if (VN_ISKAS(vp)) cmn_err(CE_NOTE, "page_create: page not expected " "in hash list for kernel vnode - pp 0x%p",
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c Tue Dec 19 23:13:06 2006 -0800 @@ -3672,14 +3672,21 @@ * Somebody is holding SE_EXCL lock. Might * even be hat_page_relocate(). Drop all * our locks, lookup the page in &kvp, and - * retry. If it doesn't exist in &kvp, then - * we must be dealing with a kernel mapped + * retry. If it doesn't exist in &kvp and &zvp, + * then we must be dealing with a kernel mapped * page which doesn't actually belong to * segkmem so we punt. */ sfmmu_mlist_exit(pml); SFMMU_HASH_UNLOCK(hmebp); pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); + + /* check zvp before giving up */ + if (pp == NULL) + pp = page_lookup(&zvp, (u_offset_t)saddr, + SE_SHARED); + + /* Okay, we didn't find it, give up */ if (pp == NULL) { kmem_cache_free(pa_hment_cache, pahmep); *rpfn = pfn; @@ -3710,7 +3717,7 @@ goto rehash; } - if (vp != &kvp) { + if (!VN_ISKAS(vp)) { /* * This is not a segkmem page but another page which * has been kernel mapped. It had better have at least @@ -3841,14 +3848,19 @@ * Somebody is holding SE_EXCL lock. Might * even be hat_page_relocate(). Drop all * our locks, lookup the page in &kvp, and - * retry. If it doesn't exist in &kvp, then - * we must be dealing with a kernel mapped + * retry. If it doesn't exist in &kvp and &zvp, + * then we must be dealing with a kernel mapped * page which doesn't actually belong to * segkmem so we punt. */ sfmmu_mlist_exit(pml); SFMMU_HASH_UNLOCK(hmebp); pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); + /* check zvp before giving up */ + if (pp == NULL) + pp = page_lookup(&zvp, (u_offset_t)saddr, + SE_SHARED); + if (pp == NULL) { ASSERT(cookie == NULL); return; @@ -3875,7 +3887,7 @@ goto rehash; } - if (vp != &kvp) { + if (!VN_ISKAS(vp)) { /* * This is not a segkmem page but another page which * has been kernel mapped. 
@@ -6522,7 +6534,7 @@ ASSERT(pp != NULL); ASSERT(sfmmu_mlist_held(pp)); - ASSERT(pp->p_vnode != &kvp); + ASSERT(!PP_ISKAS(pp)); CPUSET_ZERO(cpuset);
--- a/usr/src/uts/sparc/v9/vm/seg_nf.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/sparc/v9/vm/seg_nf.c Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -122,8 +121,8 @@ * vnode and page for the page of zeros we use for the nf mappings. */ static kmutex_t segnf_lock; -static struct vnode zvp; -static struct page **zpp; +static struct vnode nfvp; +static struct page **nfpp; #define addr_to_vcolor(addr) \ (shm_alignment) ? \ @@ -195,7 +194,7 @@ * Need a page per virtual color or just 1 if no vac. 
*/ mutex_enter(&segnf_lock); - if (zpp == NULL) { + if (nfpp == NULL) { struct seg kseg; vacpgs = 1; @@ -203,16 +202,16 @@ vacpgs = shm_alignment >> PAGESHIFT; } - zpp = kmem_alloc(sizeof (*zpp) * vacpgs, KM_SLEEP); + nfpp = kmem_alloc(sizeof (*nfpp) * vacpgs, KM_SLEEP); kseg.s_as = &kas; for (i = 0; i < vacpgs; i++, off += PAGESIZE, vaddr += PAGESIZE) { - zpp[i] = page_create_va(&zvp, off, PAGESIZE, + nfpp[i] = page_create_va(&nfvp, off, PAGESIZE, PG_WAIT | PG_NORELOC, &kseg, vaddr); - page_io_unlock(zpp[i]); - page_downgrade(zpp[i]); - pagezero(zpp[i], 0, PAGESIZE); + page_io_unlock(nfpp[i]); + page_downgrade(nfpp[i]); + pagezero(nfpp[i], 0, PAGESIZE); } } mutex_exit(&segnf_lock); @@ -234,7 +233,7 @@ color = addr_to_vcolor(seg->s_base); if (as != &kas) prot |= PROT_USER; - hat_memload(as->a_hat, seg->s_base, zpp[color], + hat_memload(as->a_hat, seg->s_base, nfpp[color], prot | HAT_NOFAULT, HAT_LOAD); /* @@ -456,7 +455,7 @@ { ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); - *vpp = &zvp; + *vpp = &nfvp; return (0); }
--- a/usr/src/uts/sun4/os/startup.c Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/sun4/os/startup.c Tue Dec 19 23:13:06 2006 -0800 @@ -192,6 +192,10 @@ struct seg *segkmap = &kmapseg; /* Kernel generic mapping segment */ struct seg *segkpm = &kpmseg; /* 64bit kernel physical mapping segment */ +int segzio_fromheap = 0; /* nonzero: zio allocations occur from heap */ +caddr_t segzio_base; /* Base address of segzio */ +pgcnt_t segziosize = 0; /* size of zio segment in pages */ + /* * debugger pages (if allocated) */ @@ -373,6 +377,8 @@ * 0xFFFFFFFC.00000000 -|-----------------------|- * : : * : : + * -|-----------------------|- + * | segzio | (base and size vary) * 0xFFFFFE00.00000000 -|-----------------------|- * | | Ultrasparc I/II support * | segkpm segment | up to 2TB of physical @@ -2058,6 +2064,47 @@ mach_kpm_init(); } + if (!segzio_fromheap) { + size_t size; + + /* size is in bytes, segziosize is in pages */ + if (segziosize == 0) { + size = mmu_ptob(physmem * 2); + } else { + size = mmu_ptob(segziosize); + } + + if (size < SEGZIOMINSIZE) { + size = SEGZIOMINSIZE; + } else if (size > mmu_ptob(physmem * 4)) { + size = mmu_ptob(physmem * 4); + } + segziosize = mmu_btop(roundup(size, MMU_PAGESIZE)); + /* put the base of the ZIO segment after the kpm segment */ + segzio_base = kpm_vbase + (kpm_size * vac_colors); + PRM_DEBUG(segziosize); + PRM_DEBUG(segzio_base); + + /* + * On some platforms, kvm_init is called after the kpm + * sizes have been determined. On SPARC, kvm_init is called + * before, so we have to attach the kzioseg after kvm is + * initialized, otherwise we'll try to allocate from the boot + * area since the kernel heap hasn't yet been configured. 
+ */ + rw_enter(&kas.a_lock, RW_WRITER); + + (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize), + &kzioseg); + (void) segkmem_zio_create(&kzioseg); + + /* create zio area covering new segment */ + segkmem_zio_init(segzio_base, mmu_ptob(segziosize)); + + rw_exit(&kas.a_lock); + } + + /* * Now create generic mapping segment. This mapping * goes SEGMAPSIZE beyond SEGMAPBASE. But if the total
--- a/usr/src/uts/sun4/sys/vm_machparam.h Tue Dec 19 22:06:32 2006 -0800 +++ b/usr/src/uts/sun4/sys/vm_machparam.h Tue Dec 19 23:13:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -22,10 +21,9 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ - /* - * Copyright (c) 1989,1999 by Sun Microsystems, Inc. - * All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ #ifndef _SYS_VM_MACHPARAM_H @@ -96,6 +94,11 @@ #endif /* _LP64 */ /* + * Define minimum size for zio segment + */ +#define SEGZIOMINSIZE (512L * 1024 * 1024L) /* 512M */ + +/* * The time for a process to be blocked before being very swappable. * This is a number of seconds which the system takes as being a non-trivial * amount of real time. You probably shouldn't change this;