changeset 12684:397e44ebb8a9

6710343 dnode cache should register a dnode_move() callback to limit fragmentation
6583724 dnode_create should not call kmem_cache_constructor directly
6374545 disk write cache flush code overloads buf_t.b_list pointer
author Tom Erickson <Tom.Erickson@Sun.COM>
date Thu, 24 Jun 2010 11:35:31 -0700
parents 92e6427b7b70
children 2d7ff21e61eb
files usr/src/cmd/zdb/zdb.c usr/src/cmd/zinject/translate.c usr/src/lib/libavl/mapfile-vers usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/uts/common/Makefile.files usr/src/uts/common/fs/dnlc.c usr/src/uts/common/fs/zfs/dbuf.c usr/src/uts/common/fs/zfs/dmu.c usr/src/uts/common/fs/zfs/dmu_object.c usr/src/uts/common/fs/zfs/dmu_objset.c usr/src/uts/common/fs/zfs/dmu_tx.c usr/src/uts/common/fs/zfs/dnode.c usr/src/uts/common/fs/zfs/dnode_sync.c usr/src/uts/common/fs/zfs/refcount.c usr/src/uts/common/fs/zfs/sa.c usr/src/uts/common/fs/zfs/sys/dbuf.h usr/src/uts/common/fs/zfs/sys/dmu.h usr/src/uts/common/fs/zfs/sys/dmu_objset.h usr/src/uts/common/fs/zfs/sys/dnode.h usr/src/uts/common/fs/zfs/sys/refcount.h usr/src/uts/common/fs/zfs/sys/sa_impl.h usr/src/uts/common/fs/zfs/sys/zfs_znode.h usr/src/uts/common/fs/zfs/sys/zrlock.h usr/src/uts/common/fs/zfs/zfs_znode.c usr/src/uts/common/fs/zfs/zrlock.c usr/src/uts/common/sys/dnlc.h usr/src/uts/common/sys/kmem.h usr/src/uts/intel/io/dktp/dcdev/dadk.c usr/src/uts/sun/io/dada/targets/dad.c
diffstat 29 files changed, 1491 insertions(+), 321 deletions(-)
--- a/usr/src/cmd/zdb/zdb.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Thu Jun 24 11:35:31 2010 -0700
@@ -1459,7 +1459,7 @@
 	}
 
 	if (object == 0) {
-		dn = os->os_meta_dnode;
+		dn = DMU_META_DNODE(os);
 	} else {
 		error = dmu_bonus_hold(os, object, FTAG, &db);
 		if (error)
@@ -1467,7 +1467,7 @@
 			    object, error);
 		bonus = db->db_data;
 		bsize = db->db_size;
-		dn = ((dmu_buf_impl_t *)db)->db_dnode;
+		dn = DB_DNODE((dmu_buf_impl_t *)db);
 	}
 	dmu_object_info_from_dnode(dn, &doi);
 
@@ -1631,8 +1631,8 @@
 
 	dump_object(os, 0, verbosity, &print_header);
 	object_count = 0;
-	if (os->os_userused_dnode &&
-	    os->os_userused_dnode->dn_type != 0) {
+	if (DMU_USERUSED_DNODE(os) != NULL &&
+	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
 	}
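
Both zdb hunks above swap direct dereferences of the objset's special dnode pointers (os->os_meta_dnode, db->db_dnode) for accessor macros. With this change the special dnodes live behind handles, so a raw pointer is no longer stable; the accessors hide the extra indirection. A minimal sketch of what such accessors can look like, assuming the handle keeps its dnode in a dnh_dnode member (a member name taken from the dnode.c hunks below, not quoted from dmu_objset.h):

/* Sketch: the special dnodes are now reached through embedded handles. */
#define	DMU_META_DNODE(os)	((os)->os_meta_dnode.dnh_dnode)
#define	DMU_USERUSED_DNODE(os)	((os)->os_userused_dnode.dnh_dnode)
#define	DMU_GROUPUSED_DNODE(os)	((os)->os_groupused_dnode.dnh_dnode)
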
--- a/usr/src/cmd/zinject/translate.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/cmd/zinject/translate.c	Thu Jun 24 11:35:31 2010 -0700
@@ -267,7 +267,7 @@
 	}
 
 	if (record->zi_object == 0) {
-		dn = os->os_meta_dnode;
+		dn = DMU_META_DNODE(os);
 	} else {
 		err = dnode_hold(os, record->zi_object, FTAG, &dn);
 		if (err != 0) {
@@ -318,7 +318,7 @@
 	ret = 0;
 out:
 	if (dn) {
-		if (dn != os->os_meta_dnode)
+		if (dn != DMU_META_DNODE(os))
 			dnode_rele(dn, FTAG);
 	}
 	if (os)
--- a/usr/src/lib/libavl/mapfile-vers	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/lib/libavl/mapfile-vers	Thu Jun 24 11:35:31 2010 -0700
@@ -19,8 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 
 #
@@ -47,6 +46,7 @@
 	avl_first;
 	avl_insert;
 	avl_insert_here;
+	avl_is_empty;
 	avl_last;
 	avl_nearest;
 	avl_numnodes;
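
avl_is_empty is newly exported from libavl; it gives callers a constant-time emptiness test instead of comparing avl_numnodes() against zero. A hedged usage sketch (the tree and txgoff index here are illustrative, borrowed from the dnode fields touched later in this change):

	/* Sketch: O(1) emptiness test on an initialized AVL tree. */
	if (avl_is_empty(&dn->dn_ranges[txgoff]))
		return;		/* no free ranges recorded for this txg */
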
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Jun 24 11:35:31 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
@@ -231,8 +230,10 @@
 } kmutex_t;
 
 #define	MUTEX_DEFAULT	USYNC_THREAD
-#undef MUTEX_HELD
+#undef	MUTEX_HELD
+#undef	MUTEX_NOT_HELD
 #define	MUTEX_HELD(m) _mutex_held(&(m)->m_lock)
+#define	MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
 
 /*
  * Argh -- we have to get cheesy here because the kernel and userland
@@ -323,10 +324,21 @@
 #define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
 #define	kmem_debugging()	0
-#define	kmem_cache_reap_now(c)
+#define	kmem_cache_reap_now(_c)		/* nothing */
+#define	kmem_cache_set_move(_c, _cb)	/* nothing */
+#define	POINTER_INVALIDATE(_pp)		/* nothing */
+#define	POINTER_IS_VALID(_p)	0
 
 typedef umem_cache_t kmem_cache_t;
 
+typedef enum kmem_cbrc {
+	KMEM_CBRC_YES,
+	KMEM_CBRC_NO,
+	KMEM_CBRC_LATER,
+	KMEM_CBRC_DONT_NEED,
+	KMEM_CBRC_DONT_KNOW
+} kmem_cbrc_t;
+
 /*
  * Task queues
  */
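
The kmem_cbrc_t codes stubbed into the userland context above mirror the kernel's movable-objects protocol: kmem hands the registered callback the old and new buffer addresses, and the callback reports whether the object was relocated. A sketch of the callback shape follows; object_is_known(), object_is_held(), invalidate(), example_t, and example_cache are hypothetical stand-ins, and the real dnode_move() with its ZFS-specific checks appears in the dnode.c hunks below:

/*
 * Hedged sketch of a kmem move callback. Return KMEM_CBRC_YES only after
 * the object has been copied and the old copy made unrecognizable.
 */
static kmem_cbrc_t
example_move(void *buf, void *newbuf, size_t size, void *arg)
{
	example_t *oep = buf;

	if (!object_is_known(oep))	/* freed, or never constructed */
		return (KMEM_CBRC_DONT_KNOW);
	if (object_is_held(oep))	/* active references preclude a move */
		return (KMEM_CBRC_LATER);

	bcopy(buf, newbuf, size);	/* relocate the object */
	invalidate(oep);		/* old copy must no longer look live */
	return (KMEM_CBRC_YES);
}

void
example_init(void)
{
	/* Register the callback once, right after creating the cache. */
	kmem_cache_set_move(example_cache, example_move);
}
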
--- a/usr/src/uts/common/Makefile.files	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/Makefile.files	Thu Jun 24 11:35:31 2010 -0700
@@ -1380,7 +1380,8 @@
 	zio_checksum.o		\
 	zio_compress.o		\
 	zio_inject.o		\
-	zle.o
+	zle.o			\
+	zrlock.o
 
 ZFS_SHARED_OBJS +=		\
 	zfs_namecheck.o		\
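
zrlock.o is the build entry for the new lock primitive backing dnode handles: readers pin a handle with zrl_add()/zrl_remove(), while dnode_move() can take the lock exclusively only when no reader holds it. A hedged sketch of the intended usage; dnh_zrlock, dnh_dnode, and dnode_handle_t are names that appear in the dnode.c hunks below, while the zrl_* semantics are as described here, not quoted:

	dnode_handle_t *dnh;	/* a held handle, obtained elsewhere */
	dnode_t *dn;

	/* Reader: pin the handle so the dnode cannot move (DB_DNODE_ENTER). */
	zrl_add(&dnh->dnh_zrlock);
	dn = dnh->dnh_dnode;
	/* ... dereference dn ... */
	zrl_remove(&dnh->dnh_zrlock);	/* unpin (DB_DNODE_EXIT) */

	/* Mover: exclusive access succeeds only when no reader holds it. */
	if (zrl_tryenter(&dnh->dnh_zrlock)) {
		/* safe to relocate the dnode behind this handle */
		zrl_exit(&dnh->dnh_zrlock);
	}
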
--- a/usr/src/uts/common/fs/dnlc.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/dnlc.c	Thu Jun 24 11:35:31 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -278,7 +277,8 @@
  */
 #define	DNLC_DIR_HASH(name, hash, namelen)			\
 	{							\
-		char Xc, *Xcp;					\
+		char Xc;					\
+		const char *Xcp;				\
 		hash = *name;					\
 		for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++)	\
 			hash = (hash << 4) + hash + Xc;		\
@@ -322,7 +322,8 @@
 
 /* Prototypes */
 static ncache_t *dnlc_get(uchar_t namlen);
-static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash);
+static ncache_t *dnlc_search(vnode_t *dp, const char *name, uchar_t namlen,
+    int hash);
 static void dnlc_dir_reclaim(void *unused);
 static void dnlc_dir_abort(dircache_t *dcp);
 static void dnlc_dir_adjust_fhash(dircache_t *dcp);
@@ -431,7 +432,7 @@
  * Add a name to the directory cache.
  */
 void
-dnlc_enter(vnode_t *dp, char *name, vnode_t *vp)
+dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp)
 {
 	ncache_t *ncp;
 	nc_hash_t *hp;
@@ -497,7 +498,7 @@
  * it just frees up the newly allocated dnlc entry.
  */
 void
-dnlc_update(vnode_t *dp, char *name, vnode_t *vp)
+dnlc_update(vnode_t *dp, const char *name, vnode_t *vp)
 {
 	ncache_t *ncp;
 	ncache_t *tcp;
@@ -579,7 +580,7 @@
  * lost before the caller can use the vnode.
  */
 vnode_t *
-dnlc_lookup(vnode_t *dp, char *name)
+dnlc_lookup(vnode_t *dp, const char *name)
 {
 	ncache_t *ncp;
 	nc_hash_t *hp;
@@ -660,7 +661,7 @@
  * Remove an entry in the directory name cache.
  */
 void
-dnlc_remove(vnode_t *dp, char *name)
+dnlc_remove(vnode_t *dp, const char *name)
 {
 	ncache_t *ncp;
 	nc_hash_t *hp;
@@ -968,7 +969,7 @@
  * ncache entry if found, NULL otherwise.
  */
 static ncache_t *
-dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash)
+dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, int hash)
 {
 	nc_hash_t *hp;
 	ncache_t *ncp;
@@ -1141,7 +1142,7 @@
 * Look up an entry in a complete or partial directory cache.
  */
 dcret_t
-dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handle)
+dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handle)
 {
 	dircache_t *dcp;
 	dcentry_t *dep;
@@ -1282,7 +1283,7 @@
 * Add a directory entry to a partial or complete directory cache.
  */
 dcret_t
-dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle)
+dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
 {
 	dircache_t *dcp;
 	dcentry_t **hp, *dep;
@@ -1583,7 +1584,7 @@
  * Return the handle if it's non null.
  */
 dcret_t
-dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep)
+dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep)
 {
 	dircache_t *dcp;
 	dcentry_t **prevpp, *te;
@@ -1782,7 +1783,7 @@
 * Update the handle of a directory cache entry.
  */
 dcret_t
-dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle)
+dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle)
 {
 	dircache_t *dcp;
 	dcentry_t *dep;
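
The dnlc changes above are a const-correctness sweep: every name parameter becomes const char *, so callers holding immutable name buffers no longer need casts. An illustrative caller (hypothetical, not part of this changeset), with dvp standing in for a held directory vnode:

	vnode_t *vp;

	/* With the const-qualified prototype, a string literal needs no cast. */
	vp = dnlc_lookup(dvp, "passwd");
	if (vp != NULL)
		VN_RELE(vp);	/* a hit returns a held vnode */
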
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Thu Jun 24 11:35:31 2010 -0700
@@ -217,6 +217,22 @@
 	db->db_evict_func = NULL;
 }
 
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+	if (db->db_level > 0) {
+		return (B_TRUE);
+	} else {
+		boolean_t is_metadata;
+
+		DB_DNODE_ENTER(db);
+		is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+		DB_DNODE_EXIT(db);
+
+		return (is_metadata);
+	}
+}
+
 void
 dbuf_evict(dmu_buf_impl_t *db)
 {
@@ -281,7 +297,7 @@
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -290,6 +306,8 @@
 		return;
 
 	ASSERT(db->db_objset != NULL);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
@@ -297,8 +315,9 @@
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
-		ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid ==
-		    DMU_SPILL_BLKID || list_head(&dn->dn_dbufs));
+		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+		    db->db_blkid == DMU_SPILL_BLKID ||
+		    !list_is_empty(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
@@ -355,7 +374,7 @@
 			 * have the struct_rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
-			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
@@ -380,6 +399,7 @@
 			}
 		}
 	}
+	DB_DNODE_EXIT(db);
 }
 #endif
 
@@ -424,8 +444,11 @@
 	mutex_enter(&db->db_mtx);
 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 		int blksz = db->db.db_size;
+		spa_t *spa;
+
 		mutex_exit(&db->db_mtx);
-		abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+		DB_GET_SPA(&spa, db);
+		abuf = arc_loan_buf(spa, blksz);
 		bcopy(db->db.db_data, abuf->b_data, blksz);
 	} else {
 		abuf = db->db_buf;
@@ -484,11 +507,14 @@
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
+	spa_t *spa;
 	zbookmark_t zb;
 	uint32_t aflags = ARC_NOWAIT;
 	arc_buf_t *pbuf;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -506,6 +532,7 @@
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+		DB_DNODE_EXIT(db);
 		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
@@ -524,6 +551,7 @@
 
 		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
 		    db->db.db_size, db, type));
+		DB_DNODE_EXIT(db);
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
 		*flags |= DB_RF_CACHED;
@@ -531,6 +559,9 @@
 		return;
 	}
 
+	spa = dn->dn_objset->os_spa;
+	DB_DNODE_EXIT(db);
+
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
@@ -549,7 +580,7 @@
 	else
 		pbuf = db->db_objset->os_phys_buf;
 
-	(void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+	(void) dsl_read(zio, spa, db->db_blkptr, pbuf,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
@@ -563,6 +594,7 @@
 	int err = 0;
 	int havepzio = (zio != NULL);
 	int prefetch;
+	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
@@ -573,46 +605,51 @@
 	if (db->db_state == DB_NOFILL)
 		return (EIO);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
-		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 	    DBUF_IS_CACHEABLE(db);
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&db->db_dnode->dn_struct_rwlock);
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
 	} else if (db->db_state == DB_UNCACHED) {
-		if (zio == NULL) {
-			zio = zio_root(db->db_dnode->dn_objset->os_spa,
-			    NULL, NULL, ZIO_FLAG_CANFAIL);
-		}
+		spa_t *spa = dn->dn_objset->os_spa;
+
+		if (zio == NULL)
+			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		dbuf_read_impl(db, zio, &flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
-			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, flags & DB_RF_CACHED);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&db->db_dnode->dn_struct_rwlock);
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
 
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&db->db_dnode->dn_struct_rwlock);
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
 
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
@@ -642,11 +679,12 @@
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
-		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    db->db.db_size, db, type));
+		DB_GET_SPA(&spa, db);
+		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 	} else if (db->db_state == DB_NOFILL) {
 		dbuf_set_data(db, NULL);
@@ -687,7 +725,7 @@
 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and it's referencing the dbuf data, either:
-	 * 	reset the reference to point to a new copy,
+	 *	reset the reference to point to a new copy,
 	 * or (if there are no active holders)
 	 *	just null out the current db_data pointer.
 	 */
@@ -700,8 +738,10 @@
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-		dr->dt.dl.dr_data = arc_buf_alloc(
-		    db->db_dnode->dn_objset->os_spa, size, db, type);
+		spa_t *spa;
+
+		DB_GET_SPA(&spa, db);
+		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
 		dbuf_set_data(db, NULL);
@@ -726,9 +766,12 @@
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
-	if (!BP_IS_HOLE(bp))
-		zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
+	if (!BP_IS_HOLE(bp)) {
+		spa_t *spa;
 
+		DB_GET_SPA(&spa, db);
+		zio_free(spa, txg, bp);
+	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	/*
 	 * Release the already-written buffer, so we leave it in
@@ -884,11 +927,15 @@
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+	dnode_t *dn;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
 	/* XXX does *this* func really need the lock? */
-	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
@@ -903,7 +950,7 @@
 	dbuf_will_dirty(db, tx);
 
 	/* create the data buffer for the new block */
-	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
 
 	/* copy old block data to the new block */
 	obuf = db->db_buf;
@@ -923,15 +970,17 @@
 	}
 	mutex_exit(&db->db_mtx);
 
-	dnode_willuse_space(db->db_dnode, size-osize, tx);
+	dnode_willuse_space(dn, size-osize, tx);
+	DB_DNODE_EXIT(db);
 }
 
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
-	objset_t *os = db->db_dnode->dn_objset;
+	objset_t *os;
 	zbookmark_t zb;
 
+	DB_GET_OBJSET(&os, db);
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -949,8 +998,8 @@
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
+	dnode_t *dn;
+	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	boolean_t do_free_accounting = B_FALSE;
@@ -960,6 +1009,8 @@
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
@@ -1014,6 +1065,8 @@
 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
 		drp = &dr->dr_next;
 	if (dr && dr->dr_txg == tx->tx_txg) {
+		DB_DNODE_EXIT(db);
+
 		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
 			/*
 			 * If this buffer has already been written out,
@@ -1049,6 +1102,7 @@
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
+	os = dn->dn_objset;
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	ASSERT(db->db.db_size != 0);
@@ -1137,6 +1191,7 @@
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
+		DB_DNODE_EXIT(db);
 		return (dr);
 	} else if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
@@ -1199,8 +1254,7 @@
 	} else {
 		ASSERT(db->db_level+1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
-		ASSERT(db->db_parent == NULL ||
-		    db->db_parent == db->db_dnode->dn_dbuf);
+		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1210,13 +1264,14 @@
 	}
 
 	dnode_setdirty(dn, tx);
+	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 static int
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	uint64_t txg = tx->tx_txg;
 	dbuf_dirty_record_t *dr, **drp;
 
@@ -1237,6 +1292,9 @@
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
 	/*
 	 * If this buffer is currently held, we cannot undirty
 	 * it, since one of the current holders may be in the
@@ -1249,6 +1307,7 @@
 		mutex_enter(&dn->dn_mtx);
 		dnode_clear_range(dn, db->db_blkid, 1, tx);
 		mutex_exit(&dn->dn_mtx);
+		DB_DNODE_EXIT(db);
 		return (0);
 	}
 
@@ -1270,6 +1329,7 @@
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
+	DB_DNODE_EXIT(db);
 
 	if (db->db_level == 0) {
 		if (db->db_state != DB_NOFILL) {
@@ -1315,8 +1375,10 @@
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+	DB_DNODE_ENTER(db);
+	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
+	DB_DNODE_EXIT(db);
 	(void) dbuf_read(db, NULL, rf);
 	(void) dbuf_dirty(db, tx);
 }
@@ -1378,7 +1440,6 @@
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
-	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
@@ -1442,7 +1503,7 @@
  * in this case.  For callers from the DMU we will usually see:
  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:
- * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
  * Sometimes, though, we will get a mix of these two:
  *	DMU: dbuf_clear()->arc_buf_evict()
  *	ARC: dbuf_do_evict()->dbuf_destroy()
@@ -1450,9 +1511,9 @@
 void
 dbuf_clear(dmu_buf_impl_t *db)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
-	dmu_buf_impl_t *dndb = dn->dn_dbuf;
+	dmu_buf_impl_t *dndb;
 	int dbuf_gone = FALSE;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1476,10 +1537,26 @@
 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dndb = dn->dn_dbuf;
 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
 		list_remove(&dn->dn_dbufs, db);
+		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+		membar_producer();
+		DB_DNODE_EXIT(db);
+		/*
+		 * Decrementing the dbuf count means that the hold corresponding
+		 * to the removed dbuf is no longer discounted in dnode_move(),
+		 * so the dnode cannot be moved until after we release the hold.
+		 * The membar_producer() ensures visibility of the decremented
+		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+		 * release any lock.
+		 */
 		dnode_rele(dn, db);
-		db->db_dnode = NULL;
+		db->db_dnode_handle = NULL;
+	} else {
+		DB_DNODE_EXIT(db);
 	}
 
 	if (db->db_buf)
@@ -1489,7 +1566,7 @@
 		mutex_exit(&db->db_mtx);
 
 	/*
-	 * If this dbuf is referened from an indirect dbuf,
+	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb)
@@ -1581,7 +1658,7 @@
 	db->db_blkid = blkid;
 	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
-	db->db_dnode = dn;
+	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
@@ -1638,6 +1715,7 @@
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
+	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
@@ -1677,15 +1755,24 @@
 		 * If this dbuf is still on the dn_dbufs list,
 		 * remove it from that list.
 		 */
-		if (db->db_dnode) {
-			dnode_t *dn = db->db_dnode;
+		if (db->db_dnode_handle != NULL) {
+			dnode_t *dn;
 
+			DB_DNODE_ENTER(db);
+			dn = DB_DNODE(db);
 			mutex_enter(&dn->dn_dbufs_mtx);
 			list_remove(&dn->dn_dbufs, db);
+			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
 			mutex_exit(&dn->dn_dbufs_mtx);
-
+			DB_DNODE_EXIT(db);
+			/*
+			 * Decrementing the dbuf count means that the hold
+			 * corresponding to the removed dbuf is no longer
+			 * discounted in dnode_move(), so the dnode cannot be
+			 * moved until after we release the hold.
+			 */
 			dnode_rele(dn, db);
-			db->db_dnode = NULL;
+			db->db_dnode_handle = NULL;
 		}
 		dbuf_hash_remove(db);
 	}
@@ -1824,7 +1911,7 @@
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 			dbuf_set_data(db,
-			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+			    arc_buf_alloc(dn->dn_objset->os_spa,
 			    db->db.db_size, db, type));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
@@ -1840,7 +1927,7 @@
 	if (parent)
 		dbuf_rele(parent, NULL);
 
-	ASSERT3P(db->db_dnode, ==, dn);
+	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
@@ -1877,6 +1964,8 @@
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+
 	if (db->db_blkid != DMU_SPILL_BLKID)
 		return (ENOTSUP);
 	if (blksz == 0)
@@ -1886,9 +1975,12 @@
 	else
 		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
-	rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dbuf_new_size(db, blksz, tx);
-	rw_exit(&db->db_dnode->dn_struct_rwlock);
+	rw_exit(&dn->dn_struct_rwlock);
+	DB_DNODE_EXIT(db);
 
 	return (0);
 }
@@ -1907,6 +1999,13 @@
 	ASSERT(holds > 1);
 }
 
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
 #pragma weak dmu_buf_rele = dbuf_rele
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
@@ -1927,6 +2026,11 @@
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
+	/*
+	 * Remove the reference to the dbuf before removing its hold on the
+	 * dnode so we can guarantee in dnode_move() that a referenced bonus
+	 * buffer has a corresponding dnode hold.
+	 */
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
@@ -1944,7 +2048,20 @@
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			mutex_exit(&db->db_mtx);
-			dnode_rele(db->db_dnode, db);
+
+			/*
+			 * If the dnode moves here, we cannot cross this barrier
+			 * until the move completes.
+			 */
+			DB_DNODE_ENTER(db);
+			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+			DB_DNODE_EXIT(db);
+			/*
+			 * The bonus buffer's dnode hold is no longer discounted
+			 * in dnode_move(). The dnode cannot move until after
+			 * the dnode_rele().
+			 */
+			dnode_rele(DB_DNODE(db), db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
@@ -2095,7 +2212,7 @@
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	zio_t *zio;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -2113,10 +2230,13 @@
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
-	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	ASSERT(db->db_buf != NULL);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
+	DB_DNODE_EXIT(db);
 
 	db->db_data_pending = dr;
 
@@ -2136,8 +2256,8 @@
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
+	dnode_t *dn;
+	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -2160,6 +2280,9 @@
 	}
 	DBUF_VERIFY(db);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
@@ -2179,6 +2302,8 @@
 		ASSERT3U(db->db_level, ==, 0);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+		DB_DNODE_EXIT(db);
+
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2197,6 +2322,8 @@
 		return;
 	}
 
+	os = dn->dn_objset;
+
 	/*
 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
@@ -2206,7 +2333,7 @@
 	dbuf_check_blkptr(dn, db);
 
 	/*
-	 * If this buffer is in the middle of an immdiate write,
+	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2243,10 +2370,20 @@
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
-	if (dn->dn_object == DMU_META_DNODE_OBJECT)
+	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
-	else
+		DB_DNODE_EXIT(db);
+	} else {
+		/*
+		 * Although zio_nowait() does not "wait for an IO", it does
+		 * initiate the IO. If this is an empty write it seems plausible
+		 * that the IO could actually be completed before the nowait
+		 * returns. We need to DB_DNODE_EXIT() first in case
+		 * zio_nowait() invalidates the dbuf.
+		 */
+		DB_DNODE_EXIT(db);
 		zio_nowait(dr->dr_zio);
+	}
 }
 
 void
@@ -2280,9 +2417,9 @@
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
-	dnode_t *dn = db->db_dnode;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
@@ -2290,12 +2427,15 @@
 
 	ASSERT(db->db_blkptr == bp);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
 	if (BP_IS_HOLE(bp)) {
 		ASSERT(bp->blk_fill == 0);
+		DB_DNODE_EXIT(db);
 		return;
 	}
 
@@ -2309,7 +2449,6 @@
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
-		dnode_t *dn = db->db_dnode;
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
@@ -2342,6 +2481,7 @@
 			fill += ibp->blk_fill;
 		}
 	}
+	DB_DNODE_EXIT(db);
 
 	bp->blk_fill = fill;
 
@@ -2355,8 +2495,6 @@
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
 	uint64_t txg = zio->io_txg;
 	dbuf_dirty_record_t **drp, *dr;
 
@@ -2366,8 +2504,13 @@
 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
-		dsl_dataset_t *ds = os->os_dsl_dataset;
-		dmu_tx_t *tx = os->os_synctx;
+		objset_t *os;
+		dsl_dataset_t *ds;
+		dmu_tx_t *tx;
+
+		DB_GET_OBJSET(&os, db);
+		ds = os->os_dsl_dataset;
+		tx = os->os_synctx;
 
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
@@ -2388,10 +2531,14 @@
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
-		dnode_t *dn = db->db_dnode;
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
+		DB_DNODE_EXIT(db);
 	}
 #endif
 
@@ -2406,6 +2553,10 @@
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2417,6 +2568,7 @@
 			    >> (db->db_level * epbs), >=, db->db_blkid);
 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
+		DB_DNODE_EXIT(db);
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
@@ -2472,8 +2624,8 @@
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
+	dnode_t *dn;
+	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_t zb;
@@ -2481,6 +2633,10 @@
 	zio_t *zio;
 	int wp_flag = 0;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	os = dn->dn_objset;
+
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 			/*
@@ -2525,6 +2681,7 @@
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+	DB_DNODE_EXIT(db);
 
 	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		ASSERT(db->db_state != DB_NOFILL);
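
Every conversion in dbuf.c above follows the same discipline: a dbuf no longer caches a raw dnode_t *, so each dereference must be bracketed by DB_DNODE_ENTER()/DB_DNODE_EXIT(), which pin the dnode handle against dnode_move() for the duration (DB_GET_SPA()/DB_GET_OBJSET() are the convenience forms when only one stable field is wanted). The canonical shape, restated as a sketch:

	dnode_t *dn;
	objset_t *os;

	DB_DNODE_ENTER(db);		/* pins db->db_dnode_handle */
	dn = DB_DNODE(db);
	os = dn->dn_objset;		/* stable while the handle is entered */
	DB_DNODE_EXIT(db);		/* dn is no longer safe to dereference */

Note the exit happens before anything that can block or drop the last hold, exactly as the dbuf_sync_leaf() comment about zio_nowait() explains above.
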
--- a/usr/src/uts/common/fs/zfs/dmu.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Thu Jun 24 11:35:31 2010 -0700
@@ -133,7 +133,7 @@
 	}
 
 	dnode_rele(dn, FTAG);
-	*dbp = &db->db;
+	*dbp = &db->db; /* NULL db plus first field offset is NULL */
 	return (err);
 }
 
@@ -144,31 +144,64 @@
 }
 
 int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int error;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 
-	if (dn->dn_bonus != (dmu_buf_impl_t *)db)
-		return (EINVAL);
-	if (newsize < 0 || newsize > db->db_size)
-		return (EINVAL);
-	dnode_setbonuslen(dn, newsize, tx);
-	return (0);
+	if (dn->dn_bonus != db) {
+		error = EINVAL;
+	} else if (newsize < 0 || newsize > db_fake->db_size) {
+		error = EINVAL;
+	} else {
+		dnode_setbonuslen(dn, newsize, tx);
+		error = 0;
+	}
+
+	DB_DNODE_EXIT(db);
+	return (error);
 }
 
 int
-dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx)
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int error;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (type > DMU_OT_NUMTYPES) {
+		error = EINVAL;
+	} else if (dn->dn_bonus != db) {
+		error = EINVAL;
+	} else {
+		dnode_setbonus_type(dn, type, tx);
+		error = 0;
+	}
 
-	if (type > DMU_OT_NUMTYPES)
-		return (EINVAL);
+	DB_DNODE_EXIT(db);
+	return (error);
+}
 
-	if (dn->dn_bonus != (dmu_buf_impl_t *)db)
-		return (EINVAL);
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	dmu_object_type_t type;
 
-	dnode_setbonus_type(dn, type, tx);
-	return (0);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	type = dn->dn_bonustype;
+	DB_DNODE_EXIT(db);
+
+	return (type);
 }
 
 int
@@ -208,11 +241,19 @@
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
-	rw_exit(&dn->dn_struct_rwlock);
 
 	/* as long as the bonus buf is held, the dnode will be held */
-	if (refcount_add(&db->db_holds, tag) == 1)
+	if (refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
+		(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+	}
+
+	/*
+	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
+	 * a dnode hold for every dbuf.
+	 */
+	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 
@@ -257,28 +298,45 @@
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
 	int err;
 
-	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA)
-		return (EINVAL);
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+		err = EINVAL;
+	} else {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
-	if (!dn->dn_have_spill) {
+		if (!dn->dn_have_spill) {
+			err = ENOENT;
+		} else {
+			err = dmu_spill_hold_by_dnode(dn,
+			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+		}
+
 		rw_exit(&dn->dn_struct_rwlock);
-		return (ENOENT);
 	}
-	err = dmu_spill_hold_by_dnode(dn,
-	    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
-	rw_exit(&dn->dn_struct_rwlock);
+
+	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
-	return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode,
-	    DB_RF_CANFAIL, tag, dbp));
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+	DB_DNODE_EXIT(db);
+
+	return (err);
 }
 
 /*
@@ -400,14 +458,18 @@
 }
 
 int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
 	int err;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
+	DB_DNODE_EXIT(db);
 
 	return (err);
 }
@@ -440,7 +502,7 @@
 		return;
 
 	if (len == 0) {  /* they're interested in the bonus buffer */
-		dn = os->os_meta_dnode;
+		dn = DMU_META_DNODE(os);
 
 		if (object == 0 || object >= DN_MAX_OBJECT)
 			return;
@@ -1001,11 +1063,19 @@
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+	dnode_t *dn;
+	int err;
+
 	if (size == 0)
 		return (0);
 
-	return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode,
-	    uio, size, tx));
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_write_uio_dnode(dn, uio, size, tx);
+	DB_DNODE_EXIT(db);
+
+	return (err);
 }
 
 int
@@ -1091,9 +1161,11 @@
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+	spa_t *spa;
 
-	return (arc_loan_buf(dn->dn_objset->os_spa, size));
+	DB_GET_SPA(&spa, db);
+	return (arc_loan_buf(spa, size));
 }
 
 /*
@@ -1115,23 +1187,35 @@
 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	uint32_t blksz = (uint32_t)arc_buf_size(buf);
 	uint64_t blkid;
 
+	DB_DNODE_ENTER(dbuf);
+	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
+	DB_DNODE_EXIT(dbuf);
 
 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
+		objset_t *os;
+		uint64_t object;
+
+		DB_DNODE_ENTER(dbuf);
+		dn = DB_DNODE(dbuf);
+		os = dn->dn_objset;
+		object = dn->dn_object;
+		DB_DNODE_EXIT(dbuf);
+
 		dbuf_rele(db, FTAG);
-		dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
-		    buf->b_data, tx);
+		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 	}
@@ -1150,7 +1234,6 @@
 {
 	dmu_sync_arg_t *dsa = varg;
 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error == 0) {
@@ -1161,7 +1244,6 @@
 			 */
 			BP_SET_LSIZE(bp, db->db_size);
 		} else {
-			ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
 			ASSERT(BP_GET_LEVEL(bp) == 0);
 			bp->blk_fill = 1;
 		}
@@ -1284,6 +1366,7 @@
 	dmu_sync_arg_t *dsa;
 	zbookmark_t zb;
 	zio_prop_t zp;
+	dnode_t *dn;
 
 	ASSERT(pio != NULL);
 	ASSERT(BP_IS_HOLE(bp));
@@ -1292,7 +1375,10 @@
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
-	dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
@@ -1574,9 +1660,13 @@
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
-	dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	DB_DNODE_ENTER(db);
+	dmu_object_info_from_dnode(DB_DNODE(db), doi);
+	DB_DNODE_EXIT(db);
 }
 
 /*
@@ -1584,14 +1674,20 @@
  * This is specifically optimized for zfs_getattr().
  */
 void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+    u_longlong_t *nblk512)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 
 	*blksize = dn->dn_datablksz;
 	/* add 1 for dnode space */
 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + 1;
+	DB_DNODE_EXIT(db);
 }
 
 void
@@ -1643,23 +1739,25 @@
 dmu_init(void)
 {
 	zfs_dbgmsg_init();
+	sa_cache_init();
+	xuio_stat_init();
+	dmu_objset_init();
+	dnode_init();
 	dbuf_init();
-	dnode_init();
 	zfetch_init();
 	arc_init();
 	l2arc_init();
-	xuio_stat_init();
-	sa_cache_init();
 }
 
 void
 dmu_fini(void)
 {
+	l2arc_fini();
 	arc_fini();
 	zfetch_fini();
+	dbuf_fini();
 	dnode_fini();
-	dbuf_fini();
-	l2arc_fini();
+	dmu_objset_fini();
 	xuio_stat_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
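
The dmu_init()/dmu_fini() hunks above reorder subsystem bring-up so teardown runs strictly in reverse: dnode_init() (which now registers the move callback) precedes dbuf_init(), and the new dmu_objset_init() sets up the os_lock barrier early. The rule being enforced, in miniature (a_*/b_* are generic placeholders, not DMU functions):

/* Miniature of the symmetric ordering enforced above. */
static void a_init(void) { /* lower layer */ }
static void b_init(void) { /* may take references into A */ }
static void b_fini(void) { /* drop references into A */ }
static void a_fini(void) { /* tear down A last */ }

void
subsys_init(void)
{
	a_init();
	b_init();
}

void
subsys_fini(void)
{
	b_fini();	/* strictly reverse order of subsys_init() */
	a_fini();
}
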
--- a/usr/src/uts/common/fs/zfs/dmu_object.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c	Thu Jun 24 11:35:31 2010 -0700
@@ -33,7 +33,7 @@
 {
 	uint64_t object;
 	uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
-	    (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
 	dnode_t *dn = NULL;
 	int restarted = B_FALSE;
 
@@ -49,7 +49,7 @@
 		 */
 		if (P2PHASE(object, L2_dnode_count) == 0) {
 			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
-			int error = dnode_next_offset(os->os_meta_dnode,
+			int error = dnode_next_offset(DMU_META_DNODE(os),
 			    DNODE_FIND_HOLE,
 			    &offset, 2, DNODES_PER_BLOCK >> 2, 0);
 			restarted = B_TRUE;
@@ -187,7 +187,7 @@
 	uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
 	int error;
 
-	error = dnode_next_offset(os->os_meta_dnode,
+	error = dnode_next_offset(DMU_META_DNODE(os),
 	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
 
 	*objectp = offset >> DNODE_SHIFT;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Jun 24 11:35:31 2010 -0700
@@ -41,9 +41,26 @@
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/sunddi.h>
 #include <sys/sa.h>
 
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+void
+dmu_objset_init(void)
+{
+	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+	rw_destroy(&os_lock);
+}
+
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
@@ -368,13 +385,16 @@
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-	os->os_meta_dnode = dnode_special_open(os,
-	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+	DMU_META_DNODE(os) = dnode_special_open(os,
+	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
+	    &os->os_meta_dnode);
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-		os->os_userused_dnode = dnode_special_open(os,
-		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
-		os->os_groupused_dnode = dnode_special_open(os,
-		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
+		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
+		    &os->os_userused_dnode);
+		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
+		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
+		    &os->os_groupused_dnode);
 	}
 
 	/*
@@ -470,8 +490,8 @@
 	mutex_enter(&os->os_lock);
 
 	/* process the mdn last, since the other dnodes have holds on it */
-	list_remove(&os->os_dnodes, os->os_meta_dnode);
-	list_insert_tail(&os->os_dnodes, os->os_meta_dnode);
+	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
+	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
 
 	/*
 	 * Find the first dnode with holds.  We have to do this dance
@@ -497,8 +517,9 @@
 		mutex_enter(&os->os_lock);
 		dn = next_dn;
 	}
+	dn = list_head(&os->os_dnodes);
 	mutex_exit(&os->os_lock);
-	return (list_head(&os->os_dnodes) != os->os_meta_dnode);
+	return (dn != DMU_META_DNODE(os));
 }
 
 void
@@ -539,16 +560,26 @@
 	 */
 	(void) dmu_objset_evict_dbufs(os);
 
-	dnode_special_close(os->os_meta_dnode);
-	if (os->os_userused_dnode) {
-		dnode_special_close(os->os_userused_dnode);
-		dnode_special_close(os->os_groupused_dnode);
+	dnode_special_close(&os->os_meta_dnode);
+	if (DMU_USERUSED_DNODE(os)) {
+		dnode_special_close(&os->os_userused_dnode);
+		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
 	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+
+	/*
+	 * This is a barrier to prevent the objset from going away in
+	 * dnode_move() until we can safely ensure that the objset is still in
+	 * use. We consider the objset valid before the barrier and invalid
+	 * after the barrier.
+	 */
+	rw_enter(&os_lock, RW_READER);
+	rw_exit(&os_lock);
+
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
@@ -575,7 +606,7 @@
 	VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os));
 	if (ds)
 		mutex_exit(&ds->ds_opening_lock);
-	mdn = os->os_meta_dnode;
+	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -1035,17 +1066,17 @@
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
-	os->os_meta_dnode->dn_zio = zio;
-	dnode_sync(os->os_meta_dnode, tx);
+	DMU_META_DNODE(os)->dn_zio = zio;
+	dnode_sync(DMU_META_DNODE(os), tx);
 
 	os->os_phys->os_flags = os->os_flags;
 
-	if (os->os_userused_dnode &&
-	    os->os_userused_dnode->dn_type != DMU_OT_NONE) {
-		os->os_userused_dnode->dn_zio = zio;
-		dnode_sync(os->os_userused_dnode, tx);
-		os->os_groupused_dnode->dn_zio = zio;
-		dnode_sync(os->os_groupused_dnode, tx);
+	if (DMU_USERUSED_DNODE(os) &&
+	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+		DMU_USERUSED_DNODE(os)->dn_zio = zio;
+		dnode_sync(DMU_USERUSED_DNODE(os), tx);
+		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
@@ -1063,7 +1094,7 @@
 	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
 	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
-	list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while (dr = list_head(list)) {
 		ASSERT(dr->dr_dbuf->db_level == 0);
 		list_remove(list, dr);
@@ -1085,7 +1116,7 @@
 	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
 }
 
-objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1097,8 +1128,8 @@
 dmu_objset_userused_enabled(objset_t *os)
 {
 	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
-	    used_cbs[os->os_phys->os_type] &&
-	    os->os_userused_dnode);
+	    used_cbs[os->os_phys->os_type] != NULL &&
+	    DMU_USERUSED_DNODE(os) != NULL);
 }
 
 static void
@@ -1132,7 +1163,7 @@
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		/* Allocate the user/groupused objects if necessary. */
-		if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
@@ -1201,13 +1232,23 @@
 		if (dr->dr_txg == tx->tx_txg)
 			break;
 
-	if (dr == NULL)
+	if (dr == NULL) {
 		data = NULL;
-	else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 &&
-	    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
-		data = dr->dt.dl.dr_data->b_data;
-	else
-		data = dr->dt.dl.dr_data;
+	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(dr->dr_dbuf);
+		dn = DB_DNODE(dr->dr_dbuf);
+
+		if (dn->dn_bonuslen == 0 &&
+		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+			data = dr->dt.dl.dr_data->b_data;
+		else
+			data = dr->dt.dl.dr_data;
+
+		DB_DNODE_EXIT(dr->dr_dbuf);
+	}
+
 	return (data);
 }
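
The rw_enter()/rw_exit() pair added to dmu_objset_evict() above is a pure barrier, not a critical section. Assuming (from the dnode.c side of this change) that dnode_move() holds os_lock while it validates a candidate dnode's objset, draining the lock here guarantees no mover can still be examining this objset once it is freed. The idiom in isolation, as a sketch of the two sides:

	/* Mover side (assumed): validate the objset under os_lock. */
	rw_enter(&os_lock, RW_WRITER);
	/* ... recheck that the candidate dnode's objset is still valid ... */
	rw_exit(&os_lock);

	/*
	 * Evict side: enter and immediately exit. This blocks until any
	 * in-flight mover has finished, and every later mover will see the
	 * objset pointer already invalidated.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);
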
 
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Thu Jun 24 11:35:31 2010 -0700
@@ -186,7 +186,7 @@
 		ASSERT(level != 0);
 		db = NULL;
 	} else {
-		ASSERT(db->db_dnode == dn);
+		ASSERT(DB_DNODE(db) == dn);
 		ASSERT(db->db_level == level);
 		ASSERT(db->db.db_size == space);
 		ASSERT(db->db_blkid == blkid);
@@ -384,7 +384,7 @@
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
 	dnode_t *dn = txh->txh_dnode;
-	dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
+	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 	uint64_t space = mdn->dn_datablksz +
 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 
@@ -787,18 +787,24 @@
 {
 	dmu_tx_hold_t *txh;
 	int match_object = FALSE, match_offset = FALSE;
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
 
-	if (tx->tx_anyobj)
+	if (tx->tx_anyobj) {
+		DB_DNODE_EXIT(db);
 		return;
+	}
 
 	/* XXX No checking on the meta dnode for now */
-	if (db->db.db_object == DMU_META_DNODE_OBJECT)
+	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+		DB_DNODE_EXIT(db);
 		return;
+	}
 
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
@@ -870,9 +876,12 @@
 				ASSERT(!"bad txh_type");
 			}
 		}
-		if (match_object && match_offset)
+		if (match_object && match_offset) {
+			DB_DNODE_EXIT(db);
 			return;
+		}
 	}
+	DB_DNODE_EXIT(db);
 	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 	    (u_longlong_t)db->db.db_object, db->db_level,
 	    (u_longlong_t)db->db_blkid);
@@ -1355,9 +1364,19 @@
 	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 
-	if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
-	    ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
 		ASSERT(tx->tx_txg == 0);
 		dmu_tx_hold_spill(tx, object);
+	} else {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		if (dn->dn_have_spill) {
+			ASSERT(tx->tx_txg == 0);
+			dmu_tx_hold_spill(tx, object);
+		}
+		DB_DNODE_EXIT(db);
 	}
 }
--- a/usr/src/uts/common/fs/zfs/dnode.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Thu Jun 24 11:35:31 2010 -0700
@@ -38,19 +38,33 @@
 static int free_range_compar(const void *node1, const void *node2);
 
 static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef	DEBUG
+#define	DNODE_STATS
+#endif	/* DEBUG */
+
+#ifdef	DNODE_STATS
+#define	DNODE_STAT_ADD(stat)			((stat)++)
+#else
+#define	DNODE_STAT_ADD(stat)			/* nothing */
+#endif	/* DNODE_STATS */
 
 static dnode_phys_t dnode_phys_zero;
 
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
 {
+	dnode_t *dn = arg;
 	int i;
-	dnode_t *dn = arg;
-	bzero(dn, sizeof (dnode_t));
 
 	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -59,8 +73,18 @@
 
 	refcount_create(&dn->dn_holds);
 	refcount_create(&dn->dn_tx_holds);
+	list_link_init(&dn->dn_link);
+
+	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 
 	for (i = 0; i < TXG_SIZE; i++) {
+		list_link_init(&dn->dn_dirty_link[i]);
 		avl_create(&dn->dn_ranges[i], free_range_compar,
 		    sizeof (free_range_t),
 		    offsetof(struct free_range, fr_node));
@@ -69,9 +93,27 @@
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_assigned_txg = 0;
+	dn->dn_dirtyctx = 0;
+	dn->dn_dirtyctx_firstset = NULL;
+	dn->dn_bonus = NULL;
+	dn->dn_have_spill = B_FALSE;
+	dn->dn_zio = NULL;
+	dn->dn_oldused = 0;
+	dn->dn_oldflags = 0;
+	dn->dn_olduid = 0;
+	dn->dn_oldgid = 0;
+	dn->dn_newuid = 0;
+	dn->dn_newgid = 0;
+	dn->dn_id_flags = 0;
+
+	dn->dn_dbufs_count = 0;
 	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
+	dn->dn_moved = 0;
 	return (0);
 }
 
@@ -88,27 +130,56 @@
 	cv_destroy(&dn->dn_notxholds);
 	refcount_destroy(&dn->dn_holds);
 	refcount_destroy(&dn->dn_tx_holds);
+	ASSERT(!list_link_active(&dn->dn_link));
 
 	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		avl_destroy(&dn->dn_ranges[i]);
 		list_destroy(&dn->dn_dirty_records[i]);
+		ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
+		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+		ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+		ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
+		ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+		ASSERT3U(dn->dn_next_blksz[i], ==, 0);
 	}
 
+	ASSERT3U(dn->dn_allocated_txg, ==, 0);
+	ASSERT3U(dn->dn_free_txg, ==, 0);
+	ASSERT3U(dn->dn_assigned_txg, ==, 0);
+	ASSERT3U(dn->dn_dirtyctx, ==, 0);
+	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+	ASSERT3P(dn->dn_bonus, ==, NULL);
+	ASSERT(!dn->dn_have_spill);
+	ASSERT3P(dn->dn_zio, ==, NULL);
+	ASSERT3U(dn->dn_oldused, ==, 0);
+	ASSERT3U(dn->dn_oldflags, ==, 0);
+	ASSERT3U(dn->dn_olduid, ==, 0);
+	ASSERT3U(dn->dn_oldgid, ==, 0);
+	ASSERT3U(dn->dn_newuid, ==, 0);
+	ASSERT3U(dn->dn_newgid, ==, 0);
+	ASSERT3U(dn->dn_id_flags, ==, 0);
+
+	ASSERT3U(dn->dn_dbufs_count, ==, 0);
 	list_destroy(&dn->dn_dbufs);
 }
 
 void
 dnode_init(void)
 {
+	ASSERT(dnode_cache == NULL);
 	dnode_cache = kmem_cache_create("dnode_t",
 	    sizeof (dnode_t),
 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+	kmem_cache_set_move(dnode_cache, dnode_move);
 }
 
 void
 dnode_fini(void)
 {
 	kmem_cache_destroy(dnode_cache);
+	dnode_cache = NULL;
 }
 
 
@@ -120,6 +191,7 @@
 
 	ASSERT(dn->dn_phys);
 	ASSERT(dn->dn_objset);
+	ASSERT(dn->dn_handle->dnh_dnode == dn);
 
 	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
 
@@ -298,18 +370,29 @@
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
-    uint64_t object)
+    uint64_t object, dnode_handle_t *dnh)
 {
 	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
-	(void) dnode_cons(dn, NULL, 0); /* XXX */
+
+	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+	dn->dn_moved = 0;
 
-	dn->dn_objset = os;
+	/*
+	 * Defer setting dn_objset until the dnode is ready to be a candidate
+	 * for the dnode_move() callback.
+	 */
 	dn->dn_object = object;
 	dn->dn_dbuf = db;
+	dn->dn_handle = dnh;
 	dn->dn_phys = dnp;
 
-	if (dnp->dn_datablkszsec)
+	if (dnp->dn_datablkszsec) {
 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	} else {
+		dn->dn_datablksz = 0;
+		dn->dn_datablkszsec = 0;
+		dn->dn_datablkshift = 0;
+	}
 	dn->dn_indblkshift = dnp->dn_indblkshift;
 	dn->dn_nlevels = dnp->dn_nlevels;
 	dn->dn_type = dnp->dn_type;
@@ -325,45 +408,65 @@
 	dmu_zfetch_init(&dn->dn_zfetch, dn);
 
 	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
 	mutex_enter(&os->os_lock);
 	list_insert_head(&os->os_dnodes, dn);
+	membar_producer();
+	/*
+	 * Everything else must be valid before assigning dn_objset makes the
+	 * dnode eligible for dnode_move().
+	 */
+	dn->dn_objset = os;
 	mutex_exit(&os->os_lock);
 
 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 	return (dn);
 }
 
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
 static void
 dnode_destroy(dnode_t *dn)
 {
 	objset_t *os = dn->dn_objset;
 
-#ifdef ZFS_DEBUG
-	int i;
-
-	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-		ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
-		ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
-	}
-	ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
 	mutex_enter(&os->os_lock);
+	POINTER_INVALIDATE(&dn->dn_objset);
 	list_remove(&os->os_dnodes, dn);
 	mutex_exit(&os->os_lock);
 
-	if (dn->dn_dirtyctx_firstset) {
+	/* the dnode can no longer move, so we can release the handle */
+	zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_assigned_txg = 0;
+
+	dn->dn_dirtyctx = 0;
+	if (dn->dn_dirtyctx_firstset != NULL) {
 		kmem_free(dn->dn_dirtyctx_firstset, 1);
 		dn->dn_dirtyctx_firstset = NULL;
 	}
-	dmu_zfetch_rele(&dn->dn_zfetch);
-	if (dn->dn_bonus) {
+	if (dn->dn_bonus != NULL) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
 		dbuf_evict(dn->dn_bonus);
 		dn->dn_bonus = NULL;
 	}
+	dn->dn_zio = NULL;
+
+	dn->dn_have_spill = B_FALSE;
+	dn->dn_oldused = 0;
+	dn->dn_oldflags = 0;
+	dn->dn_olduid = 0;
+	dn->dn_oldgid = 0;
+	dn->dn_newuid = 0;
+	dn->dn_newgid = 0;
+	dn->dn_id_flags = 0;
+
+	dmu_zfetch_rele(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 }
@@ -408,6 +511,7 @@
 	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 
 	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
 		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 		ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
@@ -522,9 +626,304 @@
 	mutex_exit(&dn->dn_mtx);
 }
 
+#ifdef	DNODE_STATS
+static struct {
+	uint64_t dms_dnode_invalid;
+	uint64_t dms_dnode_recheck1;
+	uint64_t dms_dnode_recheck2;
+	uint64_t dms_dnode_special;
+	uint64_t dms_dnode_handle;
+	uint64_t dms_dnode_rwlock;
+	uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif	/* DNODE_STATS */
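
DNODE_STAT_ADD() is presumably the usual opt-in counter idiom: a bare,
unlocked increment when DNODE_STATS is defined and a no-op otherwise. A
minimal sketch, assuming that convention:

	#ifdef	DNODE_STATS
	#define	DNODE_STAT_ADD(stat)	((stat)++)	/* advisory; unlocked */
	#else
	#define	DNODE_STAT_ADD(stat)			/* nothing */
	#endif	/* DNODE_STATS */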
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+	int i;
+
+	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+	/* Copy fields. */
+	ndn->dn_objset = odn->dn_objset;
+	ndn->dn_object = odn->dn_object;
+	ndn->dn_dbuf = odn->dn_dbuf;
+	ndn->dn_handle = odn->dn_handle;
+	ndn->dn_phys = odn->dn_phys;
+	ndn->dn_type = odn->dn_type;
+	ndn->dn_bonuslen = odn->dn_bonuslen;
+	ndn->dn_bonustype = odn->dn_bonustype;
+	ndn->dn_nblkptr = odn->dn_nblkptr;
+	ndn->dn_checksum = odn->dn_checksum;
+	ndn->dn_compress = odn->dn_compress;
+	ndn->dn_nlevels = odn->dn_nlevels;
+	ndn->dn_indblkshift = odn->dn_indblkshift;
+	ndn->dn_datablkshift = odn->dn_datablkshift;
+	ndn->dn_datablkszsec = odn->dn_datablkszsec;
+	ndn->dn_datablksz = odn->dn_datablksz;
+	ndn->dn_maxblkid = odn->dn_maxblkid;
+	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+	    sizeof (odn->dn_next_nblkptr));
+	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+	    sizeof (odn->dn_next_nlevels));
+	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+	    sizeof (odn->dn_next_indblkshift));
+	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+	    sizeof (odn->dn_next_bonustype));
+	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+	    sizeof (odn->dn_rm_spillblk));
+	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+	    sizeof (odn->dn_next_bonuslen));
+	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+	    sizeof (odn->dn_next_blksz));
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_move_tail(&ndn->dn_dirty_records[i],
+		    &odn->dn_dirty_records[i]);
+	}
+	bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+	ndn->dn_allocated_txg = odn->dn_allocated_txg;
+	ndn->dn_free_txg = odn->dn_free_txg;
+	ndn->dn_assigned_txg = odn->dn_assigned_txg;
+	ndn->dn_dirtyctx = odn->dn_dirtyctx;
+	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+	ASSERT(list_is_empty(&ndn->dn_dbufs));
+	list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+	ndn->dn_dbufs_count = odn->dn_dbufs_count;
+	ndn->dn_bonus = odn->dn_bonus;
+	ndn->dn_have_spill = odn->dn_have_spill;
+	ndn->dn_zio = odn->dn_zio;
+	ndn->dn_oldused = odn->dn_oldused;
+	ndn->dn_oldflags = odn->dn_oldflags;
+	ndn->dn_olduid = odn->dn_olduid;
+	ndn->dn_oldgid = odn->dn_oldgid;
+	ndn->dn_newuid = odn->dn_newuid;
+	ndn->dn_newgid = odn->dn_newgid;
+	ndn->dn_id_flags = odn->dn_id_flags;
+	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+	ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+	ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+	/*
+	 * Update back pointers. Updating the handle fixes the back pointer of
+	 * every descendant dbuf as well as the bonus dbuf.
+	 */
+	ASSERT(ndn->dn_handle->dnh_dnode == odn);
+	ndn->dn_handle->dnh_dnode = ndn;
+	if (ndn->dn_zfetch.zf_dnode == odn) {
+		ndn->dn_zfetch.zf_dnode = ndn;
+	}
+
+	/*
+	 * Invalidate the original dnode by clearing all of its back pointers.
+	 */
+	odn->dn_dbuf = NULL;
+	odn->dn_handle = NULL;
+	list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+	odn->dn_dbufs_count = 0;
+	odn->dn_bonus = NULL;
+	odn->dn_zfetch.zf_dnode = NULL;
+
+	/*
+	 * Set the low bit of the objset pointer to ensure that dnode_move()
+	 * recognizes the dnode as invalid in any subsequent callback.
+	 */
+	POINTER_INVALIDATE(&odn->dn_objset);
+
+	/*
+	 * Satisfy the destructor.
+	 */
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_create(&odn->dn_dirty_records[i],
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
+		odn->dn_ranges[i].avl_root = NULL;
+		odn->dn_ranges[i].avl_numnodes = 0;
+		odn->dn_next_nlevels[i] = 0;
+		odn->dn_next_indblkshift[i] = 0;
+		odn->dn_next_bonustype[i] = 0;
+		odn->dn_rm_spillblk[i] = 0;
+		odn->dn_next_bonuslen[i] = 0;
+		odn->dn_next_blksz[i] = 0;
+	}
+	odn->dn_allocated_txg = 0;
+	odn->dn_free_txg = 0;
+	odn->dn_assigned_txg = 0;
+	odn->dn_dirtyctx = 0;
+	odn->dn_dirtyctx_firstset = NULL;
+	odn->dn_have_spill = B_FALSE;
+	odn->dn_zio = NULL;
+	odn->dn_oldused = 0;
+	odn->dn_oldflags = 0;
+	odn->dn_olduid = 0;
+	odn->dn_oldgid = 0;
+	odn->dn_newuid = 0;
+	odn->dn_newgid = 0;
+	odn->dn_id_flags = 0;
+
+	/*
+	 * Mark the dnode.
+	 */
+	ndn->dn_moved = 1;
+	odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef	_KERNEL
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+	dnode_t *odn = buf, *ndn = newbuf;
+	objset_t *os;
+	int64_t refcount;
+	uint32_t dbufs;
+
+	/*
+	 * The dnode is on the objset's list of known dnodes if the objset
+	 * pointer is valid. We set the low bit of the objset pointer when
+	 * freeing the dnode to invalidate it, and the memory patterns written
+	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+	 * A newly created dnode sets the objset pointer last of all to indicate
+	 * that the dnode is known and in a valid state to be moved by this
+	 * function.
+	 */
+	os = odn->dn_objset;
+	if (!POINTER_IS_VALID(os)) {
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * Ensure that the objset does not go away during the move.
+	 */
+	rw_enter(&os_lock, RW_WRITER);
+	if (os != odn->dn_objset) {
+		rw_exit(&os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * If the dnode is still valid, then so is the objset. We know that no
+	 * valid objset can be freed while we hold os_lock, so we can safely
+	 * ensure that the objset remains in use.
+	 */
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * Recheck the objset pointer in case the dnode was removed just before
+	 * acquiring the lock.
+	 */
+	if (os != odn->dn_objset) {
+		mutex_exit(&os->os_lock);
+		rw_exit(&os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * At this point we know that as long as we hold os->os_lock, the dnode
+	 * cannot be freed and fields within the dnode can be safely accessed.
+	 * The objset listing this dnode cannot go away as long as this dnode is
+	 * on its list.
+	 */
+	rw_exit(&os_lock);
+	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+		return (KMEM_CBRC_NO);
+	}
+	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+	/*
+	 * Lock the dnode handle to prevent the dnode from obtaining any new
+	 * holds. This also prevents the descendant dbufs and the bonus dbuf
+	 * from accessing the dnode, so that we can discount their holds. The
+	 * handle is safe to access because we know that while the dnode cannot
+	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
+	 * safely move any dnode referenced only by dbufs.
+	 */
+	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+		return (KMEM_CBRC_LATER);
+	}
+
+	/*
+	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+	 * We need to guarantee that there is a hold for every dbuf in order to
+	 * determine whether the dnode is actively referenced. Falsely matching
+	 * a dbuf to an active hold would lead to an unsafe move. It's possible
+	 * that a thread already having an active dnode hold is about to add a
+	 * dbuf, and we can't compare hold and dbuf counts while the add is in
+	 * progress.
+	 */
+	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+		zrl_exit(&odn->dn_handle->dnh_zrlock);
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+		return (KMEM_CBRC_LATER);
+	}
+
+	/*
+	 * A dbuf may be removed (evicted) without an active dnode hold. In that
+	 * case, the dbuf count is decremented under the handle lock before the
+	 * dbuf's hold is released. This order ensures that if we count the hold
+	 * after the dbuf is removed but before its hold is released, we will
+	 * treat the unmatched hold as active and exit safely. If we count the
+	 * hold before the dbuf is removed, the hold is discounted, and the
+	 * removal is blocked until the move completes.
+	 */
+	refcount = refcount_count(&odn->dn_holds);
+	ASSERT(refcount >= 0);
+	dbufs = odn->dn_dbufs_count;
+
+	/* We can't have more dbufs than dnode holds. */
+	ASSERT3U(dbufs, <=, refcount);
+	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+	    uint32_t, dbufs);
+
+	if (refcount > dbufs) {
+		rw_exit(&odn->dn_struct_rwlock);
+		zrl_exit(&odn->dn_handle->dnh_zrlock);
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+		return (KMEM_CBRC_LATER);
+	}
+
+	rw_exit(&odn->dn_struct_rwlock);
+
+	/*
+	 * At this point we know that anyone with a hold on the dnode is not
+	 * actively referencing it. The dnode is known and in a valid state to
+	 * move. We're holding the locks needed to execute the critical section.
+	 */
+	dnode_move_impl(odn, ndn);
+
+	list_link_replace(&odn->dn_link, &ndn->dn_link);
+	/* If the dnode was safe to move, the refcount cannot have changed. */
+	ASSERT(refcount == refcount_count(&ndn->dn_holds));
+	ASSERT(dbufs == ndn->dn_dbufs_count);
+	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+	mutex_exit(&os->os_lock);
+
+	return (KMEM_CBRC_YES);
+}
+#endif	/* _KERNEL */
+
 void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
 {
+	dnode_t *dn = dnh->dnh_dnode;
+
 	/*
 	 * Wait for final references to the dnode to clear.  This can
 	 * only happen if the arc is asynchronously evicting state that
@@ -533,13 +932,19 @@
 	 */
 	while (refcount_count(&dn->dn_holds) > 0)
 		delay(1);
-	dnode_destroy(dn);
+	zrl_add(&dnh->dnh_zrlock);
+	dnode_destroy(dn); /* implicit zrl_remove() */
+	zrl_destroy(&dnh->dnh_zrlock);
+	dnh->dnh_dnode = NULL;
 }
 
 dnode_t *
-dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+    dnode_handle_t *dnh)
 {
-	dnode_t *dn = dnode_create(os, dnp, NULL, object);
+	dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
+	dnh->dnh_dnode = dn;
+	zrl_init(&dnh->dnh_zrlock);
 	DNODE_VERIFY(dn);
 	return (dn);
 }
@@ -547,34 +952,43 @@
 static void
 dnode_buf_pageout(dmu_buf_t *db, void *arg)
 {
-	dnode_t **children_dnodes = arg;
+	dnode_children_t *children_dnodes = arg;
 	int i;
 	int epb = db->db_size >> DNODE_SHIFT;
 
+	ASSERT(epb == children_dnodes->dnc_count);
+
 	for (i = 0; i < epb; i++) {
-		dnode_t *dn = children_dnodes[i];
-		int n;
+		dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+		dnode_t *dn;
 
-		if (dn == NULL)
+		/*
+		 * The dnode handle lock guards against the dnode moving to
+		 * another valid address, so there is no need here to guard
+		 * against changes to or from NULL.
+		 */
+		if (dnh->dnh_dnode == NULL) {
+			zrl_destroy(&dnh->dnh_zrlock);
 			continue;
-#ifdef ZFS_DEBUG
+		}
+
+		zrl_add(&dnh->dnh_zrlock);
+		dn = dnh->dnh_dnode;
 		/*
 		 * If there are holds on this dnode, then there should
 		 * be holds on the dnode's containing dbuf as well; thus
-		 * it wouldn't be eligable for eviction and this function
+		 * it wouldn't be eligible for eviction and this function
 		 * would not have been called.
 		 */
 		ASSERT(refcount_is_zero(&dn->dn_holds));
-		ASSERT(list_head(&dn->dn_dbufs) == NULL);
 		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
-		for (n = 0; n < TXG_SIZE; n++)
-			ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
-		children_dnodes[i] = NULL;
-		dnode_destroy(dn);
+		dnode_destroy(dn); /* implicit zrl_remove() */
+		zrl_destroy(&dnh->dnh_zrlock);
+		dnh->dnh_dnode = NULL;
 	}
-	kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+	kmem_free(children_dnodes, sizeof (dnode_children_t) +
+	    (epb - 1) * sizeof (dnode_handle_t));
 }
 
 /*
@@ -593,7 +1007,8 @@
 	uint64_t blk;
 	dnode_t *mdn, *dn;
 	dmu_buf_impl_t *db;
-	dnode_t **children_dnodes;
+	dnode_children_t *children_dnodes;
+	dnode_handle_t *dnh;
 
 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
@@ -607,7 +1022,7 @@
 
 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
 		dn = (object == DMU_USERUSED_OBJECT) ?
-		    os->os_userused_dnode : os->os_groupused_dnode;
+		    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
 		if (dn == NULL)
 			return (ENOENT);
 		type = dn->dn_type;
@@ -624,7 +1039,8 @@
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return (EINVAL);
 
-	mdn = os->os_meta_dnode;
+	mdn = DMU_META_DNODE(os);
+	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
 
 	DNODE_VERIFY(mdn);
 
@@ -651,26 +1067,39 @@
 
 	idx = object & (epb-1);
 
+	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
 	children_dnodes = dmu_buf_get_user(&db->db);
 	if (children_dnodes == NULL) {
-		dnode_t **winner;
-		children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
-		    KM_SLEEP);
+		int i;
+		dnode_children_t *winner;
+		children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+		    (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+		children_dnodes->dnc_count = epb;
+		dnh = &children_dnodes->dnc_children[0];
+		for (i = 0; i < epb; i++) {
+			zrl_init(&dnh[i].dnh_zrlock);
+			dnh[i].dnh_dnode = NULL;
+		}
 		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
 		    dnode_buf_pageout)) {
-			kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+			kmem_free(children_dnodes, sizeof (dnode_children_t) +
+			    (epb - 1) * sizeof (dnode_handle_t));
 			children_dnodes = winner;
 		}
 	}
+	ASSERT(children_dnodes->dnc_count == epb);
 
-	if ((dn = children_dnodes[idx]) == NULL) {
-		dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+	dnh = &children_dnodes->dnc_children[idx];
+	zrl_add(&dnh->dnh_zrlock);
+	if ((dn = dnh->dnh_dnode) == NULL) {
+		dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
 		dnode_t *winner;
 
-		dn = dnode_create(os, dnp, db, object);
-		winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+		dn = dnode_create(os, phys, db, object, dnh);
+		winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
 		if (winner != NULL) {
-			dnode_destroy(dn);
+			zrl_add(&dnh->dnh_zrlock);
+			dnode_destroy(dn); /* implicit zrl_remove() */
 			dn = winner;
 		}
 	}
@@ -682,13 +1111,16 @@
 	    ((flag & DNODE_MUST_BE_FREE) &&
 	    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
 		mutex_exit(&dn->dn_mtx);
+		zrl_remove(&dnh->dnh_zrlock);
 		dbuf_rele(db, FTAG);
 		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	if (refcount_add(&dn->dn_holds, tag) == 1)
-		dbuf_add_ref(db, dn);
+		dbuf_add_ref(db, dnh);
+	/* Now we can rely on the hold to prevent the dnode from moving. */
+	zrl_remove(&dnh->dnh_zrlock);
 
 	DNODE_VERIFY(dn);
 	ASSERT3P(dn->dn_dbuf, ==, db);
@@ -730,13 +1162,37 @@
 dnode_rele(dnode_t *dn, void *tag)
 {
 	uint64_t refs;
+	/* Get while the hold prevents the dnode from moving. */
+	dmu_buf_impl_t *db = dn->dn_dbuf;
+	dnode_handle_t *dnh = dn->dn_handle;
 
 	mutex_enter(&dn->dn_mtx);
 	refs = refcount_remove(&dn->dn_holds, tag);
 	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
+	 * indirectly by dbuf_rele() while relying on the dnode handle to
+	 * prevent the dnode from moving, since releasing the last hold could
+	 * result in the dnode's parent dbuf evicting its dnode handles. For
+	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
+	 * other direct or indirect hold on the dnode must first drop the dnode
+	 * handle.
+	 */
+	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
-	if (refs == 0 && dn->dn_dbuf)
-		dbuf_rele(dn->dn_dbuf, dn);
+	if (refs == 0 && db != NULL) {
+		/*
+		 * Another thread could add a hold to the dnode handle in
+		 * dnode_hold_impl() while holding the parent dbuf. Since the
+		 * hold on the parent dbuf prevents the handle from being
+		 * destroyed, the hold on the handle is OK. We can't yet assert
+		 * that the handle has zero references, but that will be
+		 * asserted anyway when the handle gets destroyed.
+		 */
+		dbuf_rele(db, dnh);
+	}
 }
 
 void
@@ -755,7 +1211,7 @@
 #ifdef ZFS_DEBUG
 	mutex_enter(&dn->dn_mtx);
 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
-	/* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
 	mutex_exit(&dn->dn_mtx);
 #endif
 
@@ -794,7 +1250,7 @@
 	/*
 	 * The dnode maintains a hold on its containing dbuf as
 	 * long as there are holds on it.  Each instantiated child
-	 * dbuf maintaines a hold on the dnode.  When the last child
+	 * dbuf maintains a hold on the dnode.  When the last child
 	 * drops its hold, the dnode will drop its hold on the
 	 * containing dbuf. We add a "dirty hold" here so that the
 	 * dnode will hang around after we finish processing its
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Thu Jun 24 11:35:31 2010 -0700
@@ -76,7 +76,11 @@
 
 		if (child == NULL)
 			continue;
-		ASSERT3P(child->db_dnode, ==, dn);
+#ifdef	DEBUG
+		DB_DNODE_ENTER(child);
+		ASSERT3P(DB_DNODE(child), ==, dn);
+		DB_DNODE_EXIT(child);
+#endif	/* DEBUG */
 		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
 			ASSERT(child->db_parent->db_level == db->db_level);
 			ASSERT(child->db_blkptr !=
@@ -135,15 +139,18 @@
 	int off, num;
 	int i, err, epbs;
 	uint64_t txg = tx->tx_txg;
+	dnode_t *dn;
 
-	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	off = start - (db->db_blkid * 1<<epbs);
 	num = end - start + 1;
 
 	ASSERT3U(off, >=, 0);
 	ASSERT3U(num, >=, 0);
 	ASSERT3U(db->db_level, >, 0);
-	ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
 	ASSERT(db->db_blkptr != NULL);
 
@@ -155,10 +162,10 @@
 
 		ASSERT(db->db_level == 1);
 
-		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1,
 		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
-		rw_exit(&db->db_dnode->dn_struct_rwlock);
+		rw_exit(&dn->dn_struct_rwlock);
 		if (err == ENOENT)
 			continue;
 		ASSERT(err == 0);
@@ -200,6 +207,7 @@
 
 		dbuf_rele(child, FTAG);
 	}
+	DB_DNODE_EXIT(db);
 }
 #endif
 
@@ -209,7 +217,7 @@
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
     dmu_tx_t *tx)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	blkptr_t *bp;
 	dmu_buf_impl_t *subdb;
 	uint64_t start, end, dbstart, dbend, i;
@@ -230,7 +238,9 @@
 	dbuf_release_bp(db);
 	bp = (blkptr_t *)db->db.db_data;
 
-	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	shift = (db->db_level - 1) * epbs;
 	dbstart = db->db_blkid << epbs;
 	start = blkid >> shift;
@@ -253,6 +263,7 @@
 		blocks_freed = free_blocks(dn, bp, end-start+1, tx);
 		arc_buf_freeze(db->db_buf);
 		ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+		DB_DNODE_EXIT(db);
 		return (all ? ALL : blocks_freed);
 	}
 
@@ -272,6 +283,7 @@
 		}
 		dbuf_rele(subdb, FTAG);
 	}
+	DB_DNODE_EXIT(db);
 	arc_buf_freeze(db->db_buf);
 #ifdef ZFS_DEBUG
 	bp -= (end-start)+1;
@@ -375,7 +387,11 @@
 		for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
 			list_remove(&dn->dn_dbufs, db);
 			list_insert_tail(&dn->dn_dbufs, db);
-			ASSERT3P(db->db_dnode, ==, dn);
+#ifdef	DEBUG
+			DB_DNODE_ENTER(db);
+			ASSERT3P(DB_DNODE(db), ==, dn);
+			DB_DNODE_EXIT(db);
+#endif	/* DEBUG */
 
 			mutex_enter(&db->db_mtx);
 			if (db->db_state == DB_EVICTING) {
--- a/usr/src/uts/common/fs/zfs/refcount.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/refcount.c	Thu Jun 24 11:35:31 2010 -0700
@@ -25,7 +25,7 @@
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef	ZFS_DEBUG
 
 #ifdef _KERNEL
 int reference_tracking_enable = FALSE; /* runs out of memory too easily */
@@ -189,4 +189,35 @@
 	return (refcount_remove_many(rc, 1, holder));
 }
 
-#endif
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+	int64_t count, removed_count;
+	list_t list, removed;
+
+	list_create(&list, sizeof (reference_t),
+	    offsetof(reference_t, ref_link));
+	list_create(&removed, sizeof (reference_t),
+	    offsetof(reference_t, ref_link));
+
+	mutex_enter(&src->rc_mtx);
+	count = src->rc_count;
+	removed_count = src->rc_removed_count;
+	src->rc_count = 0;
+	src->rc_removed_count = 0;
+	list_move_tail(&list, &src->rc_list);
+	list_move_tail(&removed, &src->rc_removed);
+	mutex_exit(&src->rc_mtx);
+
+	mutex_enter(&dst->rc_mtx);
+	dst->rc_count += count;
+	dst->rc_removed_count += removed_count;
+	list_move_tail(&dst->rc_list, &list);
+	list_move_tail(&dst->rc_removed, &removed);
+	mutex_exit(&dst->rc_mtx);
+
+	list_destroy(&list);
+	list_destroy(&removed);
+}
+
+#endif	/* ZFS_DEBUG */
--- a/usr/src/uts/common/fs/zfs/sa.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sa.c	Thu Jun 24 11:35:31 2010 -0700
@@ -1612,6 +1612,8 @@
     uint16_t buflen, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+	dnode_t *dn;
 	sa_bulk_attr_t *attr_desc;
 	void *old_data[2];
 	int bonus_attr_count = 0;
@@ -1629,7 +1631,9 @@
 
 	/* First make of copy of the old data */
 
-	if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) {
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if (dn->dn_bonuslen != 0) {
 		bonus_data_size = hdl->sa_bonus->db_size;
 		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
 		bcopy(hdl->sa_bonus->db_data, old_data[0],
@@ -1638,6 +1642,7 @@
 	} else {
 		old_data[0] = NULL;
 	}
+	DB_DNODE_EXIT(db);
 
 	/* Bring spill buffer online if it isn't currently */
 
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Thu Jun 24 11:35:31 2010 -0700
@@ -32,6 +32,7 @@
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
+#include <sys/zrlock.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -82,9 +83,6 @@
  * etc.
  */
 
-#define	LIST_LINK_INACTIVE(link) \
-	((link)->list_next == NULL && (link)->list_prev == NULL)
-
 struct dmu_buf_impl;
 
 typedef enum override_states {
@@ -149,15 +147,17 @@
 	struct objset *db_objset;
 
 	/*
-	 * the dnode we belong to (NULL when evicted)
+	 * handle to safely access the dnode we belong to (NULL when evicted)
 	 */
-	struct dnode *db_dnode;
+	struct dnode_handle *db_dnode_handle;
 
 	/*
 	 * our parent buffer; if the dnode points to us directly,
-	 * db_parent == db_dnode->dn_dbuf
+	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
 	 * only accessed by sync thread ???
 	 * (NULL when evicted)
+	 * May change from NULL to non-NULL under the protection of db_mtx
+	 * (see dbuf_check_blkptr())
 	 */
 	struct dmu_buf_impl *db_parent;
 
@@ -284,24 +284,46 @@
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
+#define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
+#define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
+#define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
+#define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
+#define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#define	DB_GET_SPA(_spa_p, _db) {		\
+	dnode_t *__dn;				\
+	DB_DNODE_ENTER(_db);			\
+	__dn = DB_DNODE(_db);			\
+	*(_spa_p) = __dn->dn_objset->os_spa;	\
+	DB_DNODE_EXIT(_db);			\
+}
+#define	DB_GET_OBJSET(_os_p, _db) {		\
+	dnode_t *__dn;				\
+	DB_DNODE_ENTER(_db);			\
+	__dn = DB_DNODE(_db);			\
+	*(_os_p) = __dn->dn_objset;		\
+	DB_DNODE_EXIT(_db);			\
+}
+
 void dbuf_init(void);
 void dbuf_fini(void);
 
-#define	DBUF_IS_METADATA(db)	\
-	((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 
-#define	DBUF_GET_BUFC_TYPE(db)	\
-	(DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define	DBUF_IS_METADATA(_db)	\
+	(dbuf_is_metadata(_db))
+
+#define	DBUF_GET_BUFC_TYPE(_db)	\
+	(DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
-#define	DBUF_IS_CACHEABLE(db)						\
-	((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
-	(DBUF_IS_METADATA(db) &&					\
-	((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+#define	DBUF_IS_CACHEABLE(_db)						\
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(DBUF_IS_METADATA(_db) &&					\
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
-#define	DBUF_IS_L2CACHEABLE(db)						\
-	((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
-	(DBUF_IS_METADATA(db) &&					\
-	((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define	DBUF_IS_L2CACHEABLE(_db)					\
+	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
+	(DBUF_IS_METADATA(_db) &&					\
+	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #ifdef ZFS_DEBUG
 
@@ -332,7 +354,7 @@
 	sprintf_blkptr(__blkbuf, bp);				\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
-	} 							\
+	}							\
 _NOTE(CONSTCOND) } while (0)
 
 #define	DBUF_VERIFY(db)	dbuf_verify(db)
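
Taken together, these macros replace every direct db->db_dnode dereference
with a pin/access/unpin sequence, so dnode_move() cannot relocate the dnode
mid-access. A minimal sketch of the consumer pattern (the helper name is
hypothetical):

	static uint64_t
	example_db_object(dmu_buf_impl_t *db)
	{
		dnode_t *dn;
		uint64_t obj;

		DB_DNODE_ENTER(db);	/* zrl_add(): dnode_move() backs off */
		dn = DB_DNODE(db);
		obj = dn->dn_object;
		DB_DNODE_EXIT(db);	/* zrl_remove(): dnode may move again */
		return (obj);
	}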
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Jun 24 11:35:31 2010 -0700
@@ -335,6 +335,7 @@
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Thu Jun 24 11:35:31 2010 -0700
@@ -40,6 +40,8 @@
 extern "C" {
 #endif
 
+extern krwlock_t os_lock;
+
 struct dsl_dataset;
 struct dmu_tx;
 
@@ -68,9 +70,15 @@
 	spa_t *os_spa;
 	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
-	dnode_t *os_meta_dnode;
-	dnode_t *os_userused_dnode;
-	dnode_t *os_groupused_dnode;
+	/*
+	 * The following "special" dnodes have no parent and are exempt from
+	 * dnode_move(), but they root their descendents in this objset using
+	 * handles anyway, so that all access to dnodes from dbufs consistently
+	 * uses handles.
+	 */
+	dnode_handle_t os_meta_dnode;
+	dnode_handle_t os_userused_dnode;
+	dnode_handle_t os_groupused_dnode;
 	zilog_t *os_zil;
 
 	/* can change, under dsl_dir's locks: */
@@ -113,6 +121,9 @@
 #define	DMU_META_OBJSET		0
 #define	DMU_META_DNODE_OBJECT	0
 #define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define	DMU_META_DNODE(os)	((os)->os_meta_dnode.dnh_dnode)
+#define	DMU_USERUSED_DNODE(os)	((os)->os_userused_dnode.dnh_dnode)
+#define	DMU_GROUPUSED_DNODE(os)	((os)->os_groupused_dnode.dnh_dnode)
 
 #define	DMU_OS_IS_L2CACHEABLE(os)				\
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
@@ -161,6 +172,9 @@
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
 
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h	Thu Jun 24 11:35:31 2010 -0700
@@ -32,6 +32,7 @@
 #include <sys/zio.h>
 #include <sys/refcount.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -156,6 +157,7 @@
 	struct objset *dn_objset;
 	uint64_t dn_object;
 	struct dmu_buf_impl *dn_dbuf;
+	struct dnode_handle *dn_handle;
 	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
 
 	/*
@@ -172,6 +174,7 @@
 	uint8_t dn_nlevels;
 	uint8_t dn_indblkshift;
 	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
+	uint8_t dn_moved;		/* Has this dnode been moved? */
 	uint16_t dn_datablkszsec;	/* in 512b sectors */
 	uint32_t dn_datablksz;		/* in bytes */
 	uint64_t dn_maxblkid;
@@ -183,6 +186,9 @@
 	uint16_t dn_next_bonuslen[TXG_SIZE];
 	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */
 
+	/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+	uint32_t dn_dbufs_count;	/* count of dn_dbufs */
+
 	/* protected by os_lock: */
 	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
 
@@ -202,8 +208,11 @@
 	refcount_t dn_holds;
 
 	kmutex_t dn_dbufs_mtx;
-	list_t dn_dbufs;		/* linked list of descendent dbuf_t's */
+	list_t dn_dbufs;		/* descendent dbufs */
+
+	/* protected by dn_struct_rwlock */
 	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
+
 	boolean_t dn_have_spill;	/* have spill or are spilling */
 
 	/* parent IO for current sync write */
@@ -220,6 +229,22 @@
 	struct zfetch	dn_zfetch;
 } dnode_t;
 
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+	/* Protects dnh_dnode from modification by dnode_move(). */
+	zrlock_t dnh_zrlock;
+	dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+	size_t dnc_count;		/* number of children */
+	dnode_handle_t dnc_children[1];	/* sized dynamically */
+} dnode_children_t;
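
Note that dnode_children_t uses the pre-C99 one-element-array idiom, so a
buffer holding epb handles is sized with an (epb - 1) adjustment, as in
dnode_hold_impl() and dnode_buf_pageout() earlier in this diff:

	/* Sketch: one handle per dnode in a dnode block of epb entries. */
	dnode_children_t *dnc = kmem_alloc(sizeof (dnode_children_t) +
	    (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
	dnode_handle_t *dnh = &dnc->dnc_children[idx];	/* idx < epb */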
+
 typedef struct free_range {
 	avl_node_t fr_node;
 	uint64_t fr_blkid;
@@ -227,8 +252,8 @@
 } free_range_t;
 
 dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
-    uint64_t object);
-void dnode_special_close(dnode_t *dn);
+    uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
 
 void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
 void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h	Thu Jun 24 11:35:31 2010 -0700
@@ -40,7 +40,7 @@
  */
 #define	FTAG ((char *)__func__)
 
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef	ZFS_DEBUG
 typedef struct reference {
 	list_node_t ref_link;
 	void *ref_holder;
@@ -67,11 +67,12 @@
 int64_t refcount_remove(refcount_t *rc, void *holder_tag);
 int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
 
 void refcount_init(void);
 void refcount_fini(void);
 
-#else /* DEBUG */
+#else	/* ZFS_DEBUG */
 
 typedef struct refcount {
 	uint64_t rc_count;
@@ -97,7 +98,7 @@
 #define	refcount_init()
 #define	refcount_fini()
 
-#endif /* DEBUG */
+#endif	/* ZFS_DEBUG */
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h	Thu Jun 24 11:35:31 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_SA_IMPL_H
@@ -232,7 +231,7 @@
 	((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
 
 #define	SA_BONUSTYPE_FROM_DB(db) \
-	(((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+	(dmu_get_bonustype((dmu_buf_t *)db))
 
 #define	SA_BLKPTR_SPACE	(DN_MAX_BONUSLEN - sizeof (blkptr_t))
 
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Thu Jun 24 11:35:31 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_FS_ZFS_ZNODE_H
@@ -188,6 +187,7 @@
 	uint8_t		z_unlinked;	/* file has been unlinked */
 	uint8_t		z_atime_dirty;	/* atime needs to be synced */
 	uint8_t		z_zn_prefetch;	/* Prefetch znodes? */
+	uint8_t		z_moved;	/* Has this znode been moved? */
 	uint_t		z_blksz;	/* block size in bytes */
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/zrlock.h	Thu Jun 24 11:35:31 2010 -0700
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_ZRLOCK_H
+#define	_SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+	kmutex_t zr_mtx;
+	volatile int32_t zr_refcount;
+	kcondvar_t zr_cv;
+	uint16_t zr_pad;
+#ifdef	ZFS_DEBUG
+	kthread_t *zr_owner;
+	const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef	ZFS_DEBUG
+#define	zrl_add(_z)	zrl_add_debug((_z), __func__)
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *);
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef	ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Thu Jun 24 11:35:31 2010 -0700
@@ -81,9 +81,6 @@
 #define	ZNODE_STAT_ADD(stat)			/* nothing */
 #endif	/* ZNODE_STATS */
 
-#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
-#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
 /*
  * Functions needed for userland (ie: libzpool) are not put under
  * #ifdef_KERNEL; the rest of the functions have dependencies
@@ -136,6 +133,7 @@
 
 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
+	zp->z_moved = 0;
 	return (0);
 }
 
@@ -228,6 +226,12 @@
 	 */
 	ozp->z_sa_hdl = NULL;
 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+	/*
+	 * Mark the znode.
+	 */
+	nzp->z_moved = 1;
+	ozp->z_moved = (uint8_t)-1;
 }
 
 /*ARGSUSED*/
@@ -478,6 +482,8 @@
 	vattr.va_gid = crgetgid(kcred);
 
 	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+	sharezp->z_moved = 0;
 	sharezp->z_unlinked = 0;
 	sharezp->z_atime_dirty = 0;
 	sharezp->z_zfsvfs = zfsvfs;
@@ -627,6 +633,7 @@
 
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+	zp->z_moved = 0;
 
 	/*
 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
@@ -759,7 +766,7 @@
 {
 	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
 	uint64_t	mode, size, links, parent, pflags;
-	uint64_t 	dzp_pflags = 0;
+	uint64_t	dzp_pflags = 0;
 	uint64_t	rdev = 0;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	dmu_buf_t	*db;
@@ -794,7 +801,7 @@
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
-	 * be to needed allocate a new object, so we accept the small chance
+	 * be needed to allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
@@ -1807,6 +1814,8 @@
 	vattr.va_gid = crgetgid(cr);
 
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+	rootzp->z_moved = 0;
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_is_sa = USE_SA(version, os);
@@ -1843,7 +1852,6 @@
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
 	rootzp->z_zfsvfs = &zfsvfs;
 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids));
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/zrlock.c	Thu Jun 24 11:35:31 2010 -0700
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define	ZRL_LOCKED	((uint32_t)-1)
+#define	ZRL_DESTROYED	-2
+
+void
+zrl_init(zrlock_t *zrl)
+{
+	mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+	zrl->zr_refcount = 0;
+	cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef	ZFS_DEBUG
+	zrl->zr_owner = NULL;
+	zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+	ASSERT(zrl->zr_refcount == 0);
+
+	mutex_destroy(&zrl->zr_mtx);
+	zrl->zr_refcount = ZRL_DESTROYED;
+	cv_destroy(&zrl->zr_cv);
+}
+
+void
+#ifdef	ZFS_DEBUG
+zrl_add_debug(zrlock_t *zrl, const char *zc)
+#else
+zrl_add(zrlock_t *zrl)
+#endif
+{
+	uint32_t n = (uint32_t)zrl->zr_refcount;
+
+	while (n != ZRL_LOCKED) {
+		uint32_t cas = atomic_cas_32(
+		    (uint32_t *)&zrl->zr_refcount, n, n + 1);
+		if (cas == n) {
+			ASSERT((int32_t)n >= 0);
+#ifdef	ZFS_DEBUG
+			if (zrl->zr_owner == curthread) {
+				DTRACE_PROBE2(zrlock__reentry,
+				    zrlock_t *, zrl, uint32_t, n);
+			}
+			zrl->zr_owner = curthread;
+			zrl->zr_caller = zc;
+#endif
+			return;
+		}
+		n = cas;
+	}
+
+	mutex_enter(&zrl->zr_mtx);
+	while (zrl->zr_refcount == ZRL_LOCKED) {
+		cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+	}
+	ASSERT(zrl->zr_refcount >= 0);
+	zrl->zr_refcount++;
+#ifdef	ZFS_DEBUG
+	zrl->zr_owner = curthread;
+	zrl->zr_caller = zc;
+#endif
+	mutex_exit(&zrl->zr_mtx);
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+	uint32_t n;
+
+	n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+	ASSERT((int32_t)n >= 0);
+#ifdef	ZFS_DEBUG
+	if (zrl->zr_owner == curthread) {
+		zrl->zr_owner = NULL;
+		zrl->zr_caller = NULL;
+	}
+#endif
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+	uint32_t n = (uint32_t)zrl->zr_refcount;
+
+	if (n == 0) {
+		uint32_t cas = atomic_cas_32(
+		    (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+		if (cas == 0) {
+#ifdef	ZFS_DEBUG
+			ASSERT(zrl->zr_owner == NULL);
+			zrl->zr_owner = curthread;
+#endif
+			return (1);
+		}
+	}
+
+	ASSERT((int32_t)n > ZRL_DESTROYED);
+
+	return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+	ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+
+	mutex_enter(&zrl->zr_mtx);
+#ifdef	ZFS_DEBUG
+	ASSERT(zrl->zr_owner == curthread);
+	zrl->zr_owner = NULL;
+	membar_producer();	/* make sure the owner store happens first */
+#endif
+	zrl->zr_refcount = 0;
+	cv_broadcast(&zrl->zr_cv);
+	mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+	ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+	int n = (int)zrl->zr_refcount;
+	return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+	ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+	return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+	ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+	return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef	ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+	return (zrl->zr_owner);
+}
+#endif
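
In practice the two halves of this API pair up as in dnode.c earlier in the
diff: readers pin with zrl_add()/zrl_remove(), while dnode_move() uses
zrl_tryenter()/zrl_exit() and reports KMEM_CBRC_LATER if any reader holds
the lock. A condensed sketch (names hypothetical):

	zrlock_t lock;		/* guards some relocatable object */

	void
	reader(void)
	{
		zrl_add(&lock);		/* never fails; waits only if locked */
		/* ... dereference the protected pointer ... */
		zrl_remove(&lock);
	}

	int
	try_move(void)
	{
		if (!zrl_tryenter(&lock))
			return (0);	/* references exist; retry later */
		/* ... relocate the object and fix up pointers ... */
		zrl_exit(&lock);
		return (1);
	}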
--- a/usr/src/uts/common/sys/dnlc.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/sys/dnlc.h	Thu Jun 24 11:35:31 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -39,8 +38,6 @@
 #ifndef _SYS_DNLC_H
 #define	_SYS_DNLC_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -163,7 +160,8 @@
  */
 #define	DNLCHASH(name, dvp, hash, namlen)			\
 	{							\
-		char Xc, *Xcp;					\
+		char Xc;					\
+		const char *Xcp;				\
 		hash = (int)((uintptr_t)(dvp)) >> 8;		\
 		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)	\
 			(hash) = ((hash) << 4) + (hash) + Xc;	\
@@ -181,13 +179,13 @@
 #define	DNLC_NO_VNODE &negative_cache_vnode
 
 void	dnlc_init(void);
-void	dnlc_enter(vnode_t *, char *, vnode_t *);
-void	dnlc_update(vnode_t *, char *, vnode_t *);
-vnode_t	*dnlc_lookup(vnode_t *, char *);
+void	dnlc_enter(vnode_t *, const char *, vnode_t *);
+void	dnlc_update(vnode_t *, const char *, vnode_t *);
+vnode_t	*dnlc_lookup(vnode_t *, const char *);
 void	dnlc_purge(void);
 void	dnlc_purge_vp(vnode_t *);
 int	dnlc_purge_vfsp(vfs_t *, int);
-void	dnlc_remove(vnode_t *, char *);
+void	dnlc_remove(vnode_t *, const char *);
 int	dnlc_fs_purge1(struct vnodeops *);
 vnode_t	*dnlc_reverse_lookup(vnode_t *, char *, size_t);
 void	dnlc_reduce_cache(void *);
@@ -296,7 +294,7 @@
  * For example, "handle" for ufs holds the inumber and a directory
  * entry offset. Returns DOK, DNOCACHE, DTOOBIG.
  */
-dcret_t dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle);
+dcret_t dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle);
 
 /*
  * dnlc_dir_add_space adds free space (length and file system specific
@@ -322,21 +320,22 @@
  * and returns the file system handle specified on dnlc_dir_add_entry()
  * in "handlep". Returns DFOUND, DNOENT, DNOCACHE.
  */
-dcret_t dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handlep);
+dcret_t dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handlep);
 
 /*
  * dnlc_dir_update() amends the handle for an entry in a directory cache
  * "handle" is the new file system specific handle for the file "name".
  * Returns DFOUND, DNOENT, DNOCACHE.
  */
-dcret_t dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle);
+dcret_t dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle);
 
 /*
  * dnlc_dir_rem_entry() removes an entry form a directory cache.
  * Returns the handle if "handlep" non null.
  * Returns DFOUND, DNOENT, DNOCACHE.
  */
-dcret_t dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep);
+dcret_t dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name,
+    uint64_t *handlep);
 
 /*
  * dnlc_dir_rem_space_by_len() looks up and returns free space in a
--- a/usr/src/uts/common/sys/kmem.h	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/common/sys/kmem.h	Thu Jun 24 11:35:31 2010 -0700
@@ -95,6 +95,15 @@
 
 #ifdef _KERNEL
 
+/*
+ * Helps clients implementing the move() callback to recognize known objects by
+ * testing a client-designated pointer member. Takes advantage of the fact that
+ * any scribbling to freed memory done by kmem is guaranteed to set one of the
+ * two low order bits.
+ */
+#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
+#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
+
 extern int kmem_ready;
 extern pgcnt_t kmem_reapahead;
 
--- a/usr/src/uts/intel/io/dktp/dcdev/dadk.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/intel/io/dktp/dcdev/dadk.c	Thu Jun 24 11:35:31 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -868,11 +867,7 @@
 				    sizeof (struct dk_callback), KM_SLEEP);
 
 				bcopy(dkc, dkc2, sizeof (*dkc2));
-				/*
-				 * Borrow b_list to carry private data
-				 * to the b_iodone func.
-				 */
-				bp->b_list = (struct buf *)dkc2;
+				bp->b_private = dkc2;
 				bp->b_iodone = dadk_flushdone;
 				is_sync = 0;
 			}
@@ -988,7 +983,7 @@
 int
 dadk_flushdone(struct buf *bp)
 {
-	struct dk_callback *dkc = (struct dk_callback *)bp->b_list;
+	struct dk_callback *dkc = bp->b_private;
 
 	ASSERT(dkc != NULL && dkc->dkc_callback != NULL);
 
--- a/usr/src/uts/sun/io/dada/targets/dad.c	Thu Jun 24 09:34:22 2010 -0700
+++ b/usr/src/uts/sun/io/dada/targets/dad.c	Thu Jun 24 11:35:31 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 
@@ -3465,6 +3464,7 @@
 		bp->b_un.b_addr = 0;
 		bp->b_iodone = NULL;
 		bp->b_list = NULL;
+		bp->b_private = NULL;
 
 		if ((flag & FKIOCTL) && dkc != NULL &&
 		    dkc->dkc_callback != NULL) {
@@ -3472,7 +3472,7 @@
 			    kmem_zalloc(sizeof (*dkc2), KM_SLEEP);
 			bcopy(dkc, dkc2, sizeof (*dkc2));
 
-			bp->b_list = (struct buf *)dkc2;
+			bp->b_private = dkc2;
 			bp->b_iodone = dcdflushdone;
 			is_sync = 0;
 		}
@@ -3500,7 +3500,7 @@
 	struct dcd_disk *un = ddi_get_soft_state(dcd_state,
 	    DCDUNIT(bp->b_edev));
 	struct dcd_pkt *pkt = BP_PKT(bp);
-	struct dk_callback *dkc = (struct dk_callback *)bp->b_list;
+	struct dk_callback *dkc = bp->b_private;
 
 	ASSERT(un != NULL);
 	ASSERT(bp == un->un_sbufp);
@@ -3514,7 +3514,7 @@
 		(*dkc->dkc_callback)(dkc->dkc_cookie, geterror(bp));
 		kmem_free(dkc, sizeof (*dkc));
 		bp->b_iodone = NULL;
-		bp->b_list = NULL;
+		bp->b_private = NULL;
 	}
 
 	/*