changeset 13973:4972ab336f54

3464 zfs synctask code needs restructuring Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org>
author Matthew Ahrens <mahrens@delphix.com>
date Thu, 28 Feb 2013 12:44:05 -0800
parents e844373201f4
children 9eec6e773689
files usr/src/cmd/mdb/common/modules/zfs/zfs.c usr/src/cmd/ndmpd/ndmp/ndmpd_chkpnt.c usr/src/cmd/truss/expound.c usr/src/cmd/zdb/zdb.c usr/src/cmd/zfs/zfs_main.c usr/src/cmd/zhack/zhack.c usr/src/cmd/ztest/ztest.c usr/src/common/nvpair/fnvpair.c usr/src/lib/libnvpair/mapfile-vers usr/src/lib/libzfs/common/libzfs.h usr/src/lib/libzfs/common/libzfs_dataset.c usr/src/lib/libzfs/common/libzfs_sendrecv.c usr/src/lib/libzfs_core/common/libzfs_core.c usr/src/lib/libzfs_core/common/libzfs_core.h usr/src/lib/libzfs_core/common/mapfile-vers usr/src/lib/libzpool/common/kernel.c usr/src/lib/libzpool/common/llib-lzpool usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/man/man1m/zfs.1m usr/src/uts/common/Makefile.files usr/src/uts/common/fs/zfs/arc.c usr/src/uts/common/fs/zfs/bplist.c usr/src/uts/common/fs/zfs/bpobj.c usr/src/uts/common/fs/zfs/dbuf.c usr/src/uts/common/fs/zfs/dmu.c usr/src/uts/common/fs/zfs/dmu_diff.c usr/src/uts/common/fs/zfs/dmu_objset.c usr/src/uts/common/fs/zfs/dmu_send.c usr/src/uts/common/fs/zfs/dmu_traverse.c usr/src/uts/common/fs/zfs/dmu_tx.c usr/src/uts/common/fs/zfs/dnode.c usr/src/uts/common/fs/zfs/dnode_sync.c usr/src/uts/common/fs/zfs/dsl_dataset.c usr/src/uts/common/fs/zfs/dsl_deleg.c usr/src/uts/common/fs/zfs/dsl_destroy.c usr/src/uts/common/fs/zfs/dsl_dir.c usr/src/uts/common/fs/zfs/dsl_pool.c usr/src/uts/common/fs/zfs/dsl_prop.c usr/src/uts/common/fs/zfs/dsl_scan.c usr/src/uts/common/fs/zfs/dsl_synctask.c usr/src/uts/common/fs/zfs/dsl_userhold.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/refcount.c usr/src/uts/common/fs/zfs/rrwlock.c usr/src/uts/common/fs/zfs/sa.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_history.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/space_map.c usr/src/uts/common/fs/zfs/sys/arc.h usr/src/uts/common/fs/zfs/sys/dbuf.h usr/src/uts/common/fs/zfs/sys/dmu.h usr/src/uts/common/fs/zfs/sys/dmu_objset.h usr/src/uts/common/fs/zfs/sys/dmu_send.h 
usr/src/uts/common/fs/zfs/sys/dmu_tx.h usr/src/uts/common/fs/zfs/sys/dsl_dataset.h usr/src/uts/common/fs/zfs/sys/dsl_destroy.h usr/src/uts/common/fs/zfs/sys/dsl_dir.h usr/src/uts/common/fs/zfs/sys/dsl_pool.h usr/src/uts/common/fs/zfs/sys/dsl_prop.h usr/src/uts/common/fs/zfs/sys/dsl_synctask.h usr/src/uts/common/fs/zfs/sys/dsl_userhold.h usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/refcount.h usr/src/uts/common/fs/zfs/sys/rrwlock.h usr/src/uts/common/fs/zfs/sys/space_map.h usr/src/uts/common/fs/zfs/sys/txg.h usr/src/uts/common/fs/zfs/sys/zfeature.h usr/src/uts/common/fs/zfs/sys/zfs_debug.h usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h usr/src/uts/common/fs/zfs/sys/zfs_znode.h usr/src/uts/common/fs/zfs/sys/zil.h usr/src/uts/common/fs/zfs/txg.c usr/src/uts/common/fs/zfs/zfs_ctldir.c usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zfs_vfsops.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/fs/zfs/zvol.c usr/src/uts/common/sys/nvpair.h
diffstat 80 files changed, 6418 insertions(+), 5961 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Thu Feb 28 12:44:05 2013 -0800
@@ -859,7 +859,7 @@
 	    NULL) != argc)
 		return (DCMD_USAGE);
 
-	if (mdb_lookup_by_name("zfs_dbgmsgs", &sym)) {
+	if (mdb_lookup_by_obj(ZFS_OBJ_NAME, "zfs_dbgmsgs", &sym)) {
 		mdb_warn("can't find zfs_dbgmsgs");
 		return (DCMD_ERR);
 	}
--- a/usr/src/cmd/ndmpd/ndmp/ndmpd_chkpnt.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/ndmpd/ndmp/ndmpd_chkpnt.c	Thu Feb 28 12:44:05 2013 -0800
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 /*
@@ -202,8 +203,7 @@
 	}
 
 	p = strchr(snapname, '@') + 1;
-	if (zfs_hold(zhp, p, jname, recursive, B_TRUE, B_FALSE,
-	    cleanup_fd, 0, 0) != 0) {
+	if (zfs_hold(zhp, p, jname, recursive, B_FALSE, cleanup_fd) != 0) {
 		NDMP_LOG(LOG_ERR, "Cannot hold snapshot %s", p);
 		zfs_close(zhp);
 		return (-1);
--- a/usr/src/cmd/truss/expound.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/truss/expound.c	Thu Feb 28 12:44:05 2013 -0800
@@ -22,6 +22,7 @@
 /*
  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -4876,9 +4877,7 @@
 	if (zc.zc_value[0])
 		(void) printf("    zc_value=%s\n", zc.zc_value);
 	if (zc.zc_string[0])
-		(void) printf("    zc_strign=%s\n", zc.zc_string);
-	if (zc.zc_top_ds[0])
-		(void) printf("    zc_top_ds=%s\n", zc.zc_top_ds);
+		(void) printf("    zc_string=%s\n", zc.zc_string);
 	if (zc.zc_guid != 0) {
 		(void) printf("    zc_guid=%llu\n",
 		    (u_longlong_t)zc.zc_guid);
--- a/usr/src/cmd/zdb/zdb.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/zdb/zdb.c	Thu Feb 28 12:44:05 2013 -0800
@@ -1658,7 +1658,9 @@
 	int print_header = 1;
 	int i, error;
 
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	if (dds.dds_type < DMU_OST_NUMTYPES)
 		type = objset_types[dds.dds_type];
@@ -2109,7 +2111,6 @@
 
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
-
 	}
 
 	zcb->zcb_readfails = 0;
@@ -2297,8 +2298,10 @@
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    count_block_cb, &zcb, NULL);
-	(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
-	    count_block_cb, &zcb, NULL);
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+		    count_block_cb, &zcb, NULL);
+	}
 	if (spa_feature_is_active(spa,
 	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
--- a/usr/src/cmd/zfs/zfs_main.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/zfs/zfs_main.c	Thu Feb 28 12:44:05 2013 -0800
@@ -898,6 +898,7 @@
 	boolean_t	cb_parsable;
 	boolean_t	cb_dryrun;
 	nvlist_t	*cb_nvl;
+	nvlist_t	*cb_batchedsnaps;
 
 	/* first snap in contiguous run */
 	char		*cb_firstsnap;
@@ -994,9 +995,27 @@
 		zfs_close(zhp);
 		return (0);
 	}
-
-	if (!cb->cb_dryrun) {
-		if (zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
+	if (cb->cb_dryrun) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/*
+	 * We batch up all contiguous snapshots (even of different
+	 * filesystems) and destroy them with one ioctl.  We can't
+	 * simply do all snap deletions and then all fs deletions,
+	 * because we must delete a clone before its origin.
+	 */
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+		fnvlist_add_boolean(cb->cb_batchedsnaps, name);
+	} else {
+		int error = zfs_destroy_snaps_nvl(g_zfs,
+		    cb->cb_batchedsnaps, B_FALSE);
+		fnvlist_free(cb->cb_batchedsnaps);
+		cb->cb_batchedsnaps = fnvlist_alloc();
+
+		if (error != 0 ||
+		    zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
 		    zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
 			zfs_close(zhp);
 			return (-1);
@@ -1152,8 +1171,10 @@
 zfs_do_destroy(int argc, char **argv)
 {
 	destroy_cbdata_t cb = { 0 };
+	int rv = 0;
+	int err = 0;
 	int c;
-	zfs_handle_t *zhp;
+	zfs_handle_t *zhp = NULL;
 	char *at;
 	zfs_type_t type = ZFS_TYPE_DATASET;
 
@@ -1207,11 +1228,9 @@
 
 	at = strchr(argv[0], '@');
 	if (at != NULL) {
-		int err = 0;
 
 		/* Build the list of snaps to destroy in cb_nvl. */
-		if (nvlist_alloc(&cb.cb_nvl, NV_UNIQUE_NAME, 0) != 0)
-			nomem();
+		cb.cb_nvl = fnvlist_alloc();
 
 		*at = '\0';
 		zhp = zfs_open(g_zfs, argv[0],
@@ -1222,17 +1241,15 @@
 		cb.cb_snapspec = at + 1;
 		if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
 		    cb.cb_error) {
-			zfs_close(zhp);
-			nvlist_free(cb.cb_nvl);
-			return (1);
+			rv = 1;
+			goto out;
 		}
 
 		if (nvlist_empty(cb.cb_nvl)) {
 			(void) fprintf(stderr, gettext("could not find any "
 			    "snapshots to destroy; check snapshot names.\n"));
-			zfs_close(zhp);
-			nvlist_free(cb.cb_nvl);
-			return (1);
+			rv = 1;
+			goto out;
 		}
 
 		if (cb.cb_verbose) {
@@ -1251,18 +1268,26 @@
 		}
 
 		if (!cb.cb_dryrun) {
-			if (cb.cb_doclones)
+			if (cb.cb_doclones) {
+				cb.cb_batchedsnaps = fnvlist_alloc();
 				err = destroy_clones(&cb);
+				if (err == 0) {
+					err = zfs_destroy_snaps_nvl(g_zfs,
+					    cb.cb_batchedsnaps, B_FALSE);
+				}
+				if (err != 0) {
+					rv = 1;
+					goto out;
+				}
+			}
 			if (err == 0) {
-				err = zfs_destroy_snaps_nvl(zhp, cb.cb_nvl,
+				err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
 				    cb.cb_defer_destroy);
 			}
 		}
 
-		zfs_close(zhp);
-		nvlist_free(cb.cb_nvl);
 		if (err != 0)
-			return (1);
+			rv = 1;
 	} else {
 		/* Open the given dataset */
 		if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
@@ -1283,8 +1308,8 @@
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
 			    "to destroy the pool itself\n"), zfs_get_name(zhp));
-			zfs_close(zhp);
-			return (1);
+			rv = 1;
+			goto out;
 		}
 
 		/*
@@ -1294,30 +1319,42 @@
 		if (!cb.cb_doclones &&
 		    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
 		    &cb) != 0) {
-			zfs_close(zhp);
-			return (1);
+			rv = 1;
+			goto out;
 		}
 
 		if (cb.cb_error) {
-			zfs_close(zhp);
-			return (1);
+			rv = 1;
+			goto out;
 		}
 
+		cb.cb_batchedsnaps = fnvlist_alloc();
 		if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
 		    &cb) != 0) {
-			zfs_close(zhp);
-			return (1);
+			rv = 1;
+			goto out;
 		}
 
 		/*
 		 * Do the real thing.  The callback will close the
 		 * handle regardless of whether it succeeds or not.
 		 */
-		if (destroy_callback(zhp, &cb) != 0)
-			return (1);
-	}
-
-	return (0);
+		err = destroy_callback(zhp, &cb);
+		zhp = NULL;
+		if (err == 0) {
+			err = zfs_destroy_snaps_nvl(g_zfs,
+			    cb.cb_batchedsnaps, cb.cb_defer_destroy);
+		}
+		if (err != 0)
+			rv = 1;
+	}
+
+out:
+	fnvlist_free(cb.cb_batchedsnaps);
+	fnvlist_free(cb.cb_nvl);
+	if (zhp != NULL)
+		zfs_close(zhp);
+	return (rv);
 }
 
 static boolean_t
@@ -5052,28 +5089,12 @@
 	return (error);
 }
 
-/*
- * zfs allow [-r] [-t] <tag> <snap> ...
- *
- *	-r	Recursively hold
- *	-t	Temporary hold (hidden option)
- *
- * Apply a user-hold with the given tag to the list of snapshots.
- */
 static int
 zfs_do_allow(int argc, char **argv)
 {
 	return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
 }
 
-/*
- * zfs unallow [-r] [-t] <tag> <snap> ...
- *
- *	-r	Recursively hold
- *	-t	Temporary hold (hidden option)
- *
- * Apply a user-hold with the given tag to the list of snapshots.
- */
 static int
 zfs_do_unallow(int argc, char **argv)
 {
@@ -5087,7 +5108,6 @@
 	int i;
 	const char *tag;
 	boolean_t recursive = B_FALSE;
-	boolean_t temphold = B_FALSE;
 	const char *opts = holding ? "rt" : "r";
 	int c;
 
@@ -5097,9 +5117,6 @@
 		case 'r':
 			recursive = B_TRUE;
 			break;
-		case 't':
-			temphold = B_TRUE;
-			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -5148,7 +5165,7 @@
 		}
 		if (holding) {
 			if (zfs_hold(zhp, delim+1, tag, recursive,
-			    temphold, B_FALSE, -1, 0, 0) != 0)
+			    B_FALSE, -1) != 0)
 				++errors;
 		} else {
 			if (zfs_release(zhp, delim+1, tag, recursive) != 0)
@@ -5164,7 +5181,6 @@
  * zfs hold [-r] [-t] <tag> <snap> ...
  *
  *	-r	Recursively hold
- *	-t	Temporary hold (hidden option)
  *
  * Apply a user-hold with the given tag to the list of snapshots.
  */
--- a/usr/src/cmd/zhack/zhack.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/zhack/zhack.c	Thu Feb 28 12:44:05 2013 -0800
@@ -46,6 +46,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfeature.h>
+#include <sys/dmu_tx.h>
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include <libzfs.h>
@@ -273,10 +274,10 @@
 }
 
 static void
-feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+feature_enable_sync(void *arg, dmu_tx_t *tx)
 {
-	spa_t *spa = arg1;
-	zfeature_info_t *feature = arg2;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	zfeature_info_t *feature = arg;
 
 	spa_feature_enable(spa, feature, tx);
 	spa_history_log_internal(spa, "zhack enable feature", tx,
@@ -344,8 +345,8 @@
 	if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
 		fatal("feature already enabled: %s", feature.fi_guid);
 
-	VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL,
-	    feature_enable_sync, spa, &feature, 5));
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    feature_enable_sync, &feature, 5));
 
 	spa_close(spa, FTAG);
 
@@ -353,10 +354,10 @@
 }
 
 static void
-feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+feature_incr_sync(void *arg, dmu_tx_t *tx)
 {
-	spa_t *spa = arg1;
-	zfeature_info_t *feature = arg2;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	zfeature_info_t *feature = arg;
 
 	spa_feature_incr(spa, feature, tx);
 	spa_history_log_internal(spa, "zhack feature incr", tx,
@@ -364,10 +365,10 @@
 }
 
 static void
-feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+feature_decr_sync(void *arg, dmu_tx_t *tx)
 {
-	spa_t *spa = arg1;
-	zfeature_info_t *feature = arg2;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	zfeature_info_t *feature = arg;
 
 	spa_feature_decr(spa, feature, tx);
 	spa_history_log_internal(spa, "zhack feature decr", tx,
@@ -442,8 +443,8 @@
 	if (decr && !spa_feature_is_active(spa, &feature))
 		fatal("feature refcount already 0: %s", feature.fi_guid);
 
-	VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL,
-	    decr ? feature_decr_sync : feature_incr_sync, spa, &feature, 5));
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    decr ? feature_decr_sync : feature_incr_sync, &feature, 5));
 
 	spa_close(spa, FTAG);
 }
--- a/usr/src/cmd/ztest/ztest.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/cmd/ztest/ztest.c	Thu Feb 28 12:44:05 2013 -0800
@@ -103,10 +103,12 @@
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
 #include <sys/dsl_scan.h>
 #include <sys/zio_checksum.h>
 #include <sys/refcount.h>
 #include <sys/zfeature.h>
+#include <sys/dsl_userhold.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
@@ -365,7 +367,7 @@
 	{ ztest_scrub,				1,	&zopt_rarely	},
 	{ ztest_spa_upgrade,			1,	&zopt_rarely	},
 	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
-	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
+	{ ztest_vdev_attach_detach,		1,	&zopt_sometimes	},
 	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
 	{ ztest_vdev_add_remove,		1,
 	    &ztest_opts.zo_vdevtime				},
@@ -1006,9 +1008,8 @@
 	uint64_t curval;
 	int error;
 
-	error = dsl_prop_set(osname, propname,
-	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL),
-	    sizeof (value), 1, &value);
+	error = dsl_prop_set_int(osname, propname,
+	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
 
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
@@ -1016,8 +1017,7 @@
 	}
 	ASSERT0(error);
 
-	VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
-	    1, &curval, setpoint), ==, 0);
+	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
 
 	if (ztest_opts.zo_verbose >= 6) {
 		VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
@@ -2479,8 +2479,7 @@
 	int error;
 
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
-	leaves =
-	    MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -3180,7 +3179,7 @@
 	/*
 	 * Verify that the dataset contains a directory object.
 	 */
-	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
 	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
@@ -3188,12 +3187,16 @@
 		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
 		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
 	}
-	dmu_objset_rele(os, FTAG);
+	dmu_objset_disown(os, FTAG);
 
 	/*
 	 * Destroy the dataset.
 	 */
-	VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
+	if (strchr(name, '@') != NULL) {
+		VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
+	} else {
+		VERIFY0(dsl_destroy_head(name));
+	}
 	return (0);
 }
 
@@ -3203,16 +3206,17 @@
 	char snapname[MAXNAMELEN];
 	int error;
 
-	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
-	    (u_longlong_t)id);
-
-	error = dmu_objset_snapshot_one(osname, strchr(snapname, '@') + 1);
+	(void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
+
+	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
 		return (B_FALSE);
 	}
-	if (error != 0 && error != EEXIST)
-		fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+	if (error != 0 && error != EEXIST) {
+		fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
+		    snapname, error);
+	}
 	return (B_TRUE);
 }
 
@@ -3225,7 +3229,7 @@
 	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
 	    (u_longlong_t)id);
 
-	error = dmu_objset_destroy(snapname, B_FALSE);
+	error = dsl_destroy_snapshot(snapname, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
 	return (B_TRUE);
@@ -3271,7 +3275,8 @@
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 */
-	VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
+	VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+	    FTAG, &os));
 
 	/*
 	 * Verify that we can create a new dataset.
@@ -3286,8 +3291,7 @@
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
-	VERIFY3U(0, ==,
-	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
 
 	ztest_zd_init(&zdtmp, NULL, os);
 
@@ -3363,21 +3367,21 @@
 	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
 	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
-	error = dmu_objset_destroy(clone2name, B_FALSE);
+	error = dsl_destroy_head(clone2name);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
-	error = dmu_objset_destroy(snap3name, B_FALSE);
+		fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
+	error = dsl_destroy_snapshot(snap3name, B_FALSE);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
-	error = dmu_objset_destroy(snap2name, B_FALSE);
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
+	error = dsl_destroy_snapshot(snap2name, B_FALSE);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
-	error = dmu_objset_destroy(clone1name, B_FALSE);
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
+	error = dsl_destroy_head(clone1name);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
-	error = dmu_objset_destroy(snap1name, B_FALSE);
+		fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
+	error = dsl_destroy_snapshot(snap1name, B_FALSE);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
 }
 
 /*
@@ -3386,8 +3390,7 @@
 void
 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *clone;
-	dsl_dataset_t *ds;
+	objset_t *os;
 	char snap1name[MAXNAMELEN];
 	char clone1name[MAXNAMELEN];
 	char snap2name[MAXNAMELEN];
@@ -3415,12 +3418,7 @@
 		fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
 	}
 
-	error = dmu_objset_hold(snap1name, FTAG, &clone);
-	if (error)
-		fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
-
-	error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
-	dmu_objset_rele(clone, FTAG);
+	error = dmu_objset_clone(clone1name, snap1name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3447,12 +3445,7 @@
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
 	}
 
-	error = dmu_objset_hold(snap3name, FTAG, &clone);
-	if (error)
-		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
-
-	error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
-	dmu_objset_rele(clone, FTAG);
+	error = dmu_objset_clone(clone2name, snap3name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3461,14 +3454,14 @@
 		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
 	}
 
-	error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
+	error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
 	if (error)
-		fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
+		fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
 	error = dsl_dataset_promote(clone2name, NULL);
 	if (error != EBUSY)
 		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
 		    error);
-	dsl_dataset_disown(ds, FTAG);
+	dmu_objset_disown(os, FTAG);
 
 out:
 	ztest_dsl_dataset_cleanup(osname, id);
@@ -4280,7 +4273,7 @@
 	}
 
 	count = -1ULL;
-	VERIFY(zap_count(os, object, &count) == 0);
+	VERIFY0(zap_count(os, object, &count));
 	ASSERT(count != -1ULL);
 
 	/*
@@ -4591,6 +4584,22 @@
 	(void) rw_unlock(&ztest_name_lock);
 }
 
+static int
+user_release_one(const char *snapname, const char *holdname)
+{
+	nvlist_t *snaps, *holds;
+	int error;
+
+	snaps = fnvlist_alloc();
+	holds = fnvlist_alloc();
+	fnvlist_add_boolean(holds, holdname);
+	fnvlist_add_nvlist(snaps, snapname, holds);
+	fnvlist_free(holds);
+	error = dsl_dataset_user_release(snaps, NULL);
+	fnvlist_free(snaps);
+	return (error);
+}
+
 /*
  * Test snapshot hold/release and deferred destroy.
  */
@@ -4605,22 +4614,30 @@
 	char clonename[100];
 	char tag[100];
 	char osname[MAXNAMELEN];
+	nvlist_t *holds;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	dmu_objset_name(os, osname);
 
-	(void) snprintf(snapname, 100, "sh1_%llu", id);
-	(void) snprintf(fullname, 100, "%s@%s", osname, snapname);
-	(void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
-	(void) snprintf(tag, 100, "%tag_%llu", id);
+	(void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
+	(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
+	(void) snprintf(clonename, sizeof (clonename),
+	    "%s/ch1_%llu", osname, id);
+	(void) snprintf(tag, sizeof (tag), "tag_%llu", id);
 
 	/*
 	 * Clean up from any previous run.
 	 */
-	(void) dmu_objset_destroy(clonename, B_FALSE);
-	(void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
-	(void) dmu_objset_destroy(fullname, B_FALSE);
+	error = dsl_destroy_head(clonename);
+	if (error != ENOENT)
+		ASSERT0(error);
+	error = user_release_one(fullname, tag);
+	if (error != ESRCH && error != ENOENT)
+		ASSERT0(error);
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
+	if (error != ENOENT)
+		ASSERT0(error);
 
 	/*
 	 * Create snapshot, clone it, mark snap for deferred destroy,
@@ -4635,12 +4652,7 @@
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
-	error = dmu_objset_hold(fullname, FTAG, &origin);
-	if (error)
-		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
-
-	error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
-	dmu_objset_rele(origin, FTAG);
+	error = dmu_objset_clone(clonename, fullname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_clone");
@@ -4649,15 +4661,15 @@
 		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
 	}
 
-	error = dmu_objset_destroy(fullname, B_TRUE);
+	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
-		fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
-	error = dmu_objset_destroy(clonename, B_FALSE);
+	error = dsl_destroy_head(clonename);
 	if (error)
-		fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);
+		fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
 
 	error = dmu_objset_hold(fullname, FTAG, &origin);
 	if (error != ENOENT)
@@ -4677,28 +4689,31 @@
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
-	error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE,
-	    B_TRUE, -1);
+	holds = fnvlist_alloc();
+	fnvlist_add_string(holds, fullname, tag);
+	error = dsl_dataset_user_hold(holds, 0, NULL);
+	fnvlist_free(holds);
+
 	if (error)
 		fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
 
-	error = dmu_objset_destroy(fullname, B_FALSE);
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (error != EBUSY) {
-		fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
+		fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
 		    fullname, error);
 	}
 
-	error = dmu_objset_destroy(fullname, B_TRUE);
+	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
-		fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
-	error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+	error = user_release_one(fullname, tag);
 	if (error)
-		fatal(0, "dsl_dataset_user_release(%s)", fullname, tag);
-
-	VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);
+		fatal(0, "user_release_one(%s)", fullname, tag);
+
+	VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
 
 out:
 	(void) rw_unlock(&ztest_name_lock);
@@ -4952,8 +4967,12 @@
 	 */
 	for (int i = 0; i < copies; i++) {
 		uint64_t offset = i * blocksize;
-		VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &db,
-		    DMU_READ_NO_PREFETCH));
+		int error = dmu_buf_hold(os, object, offset, FTAG, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error != 0) {
+			fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
+			    os, (long long)object, (long long) offset, error);
+		}
 		ASSERT(db->db_offset == offset);
 		ASSERT(db->db_size == blocksize);
 		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
@@ -5163,6 +5182,7 @@
 	nvlist_t *config, *newconfig;
 	uint64_t pool_guid;
 	spa_t *spa;
+	int error;
 
 	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
@@ -5207,7 +5227,12 @@
 	/*
 	 * Import it under the new name.
 	 */
-	VERIFY3U(0, ==, spa_import(newname, config, NULL, 0));
+	error = spa_import(newname, config, NULL, 0);
+	if (error != 0) {
+		dump_nvlist(config, 0);
+		fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
+		    oldname, newname, error);
+	}
 
 	ztest_walk_pool_directory("pools after import");
 
@@ -5414,7 +5439,7 @@
 	}
 	ASSERT(error == 0 || error == EEXIST);
 
-	VERIFY0(dmu_objset_hold(name, zd, &os));
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
 	(void) rw_unlock(&ztest_name_lock);
 
 	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
@@ -5455,7 +5480,7 @@
 	ztest_ds_t *zd = &ztest_ds[d];
 
 	zil_close(zd->zd_zilog);
-	dmu_objset_rele(zd->zd_os, zd);
+	dmu_objset_disown(zd->zd_os, zd);
 
 	ztest_zd_fini(zd);
 }
@@ -5499,13 +5524,14 @@
 	 * Open our pool.
 	 */
 	kernel_init(FREAD | FWRITE);
-	VERIFY(spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0);
+	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	spa->spa_debug = B_TRUE;
 	ztest_spa = spa;
 
-	VERIFY3U(0, ==, dmu_objset_hold(ztest_opts.zo_pool, FTAG, &os));
+	VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
+	    DMU_OST_ANY, B_TRUE, FTAG, &os));
 	zs->zs_guid = dmu_objset_fsid_guid(os);
-	dmu_objset_rele(os, FTAG);
+	dmu_objset_disown(os, FTAG);
 
 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
--- a/usr/src/common/nvpair/fnvpair.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/common/nvpair/fnvpair.c	Thu Feb 28 12:44:05 2013 -0800
@@ -26,6 +26,7 @@
 #include <sys/nvpair.h>
 #include <sys/kmem.h>
 #include <sys/debug.h>
+#include <sys/param.h>
 #ifndef _KERNEL
 #include <stdlib.h>
 #endif
@@ -114,6 +115,18 @@
 	VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
 }
 
+size_t
+fnvlist_num_pairs(nvlist_t *nvl)
+{
+	size_t count = 0;
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL;
+	    pair = nvlist_next_nvpair(nvl, pair))
+		count++;
+	return (count);
+}
+
 void
 fnvlist_add_boolean(nvlist_t *nvl, const char *name)
 {
--- a/usr/src/lib/libnvpair/mapfile-vers	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libnvpair/mapfile-vers	Thu Feb 28 12:44:05 2013 -0800
@@ -49,6 +49,7 @@
     fnvlist_unpack;
     fnvlist_dup;
     fnvlist_merge;
+    fnvlist_num_pairs;
     fnvlist_add_boolean;
     fnvlist_add_boolean_value;
     fnvlist_add_byte;
--- a/usr/src/lib/libzfs/common/libzfs.h	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzfs/common/libzfs.h	Thu Feb 28 12:44:05 2013 -0800
@@ -550,7 +550,7 @@
 extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
 extern int zfs_destroy(zfs_handle_t *, boolean_t);
 extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
-extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t);
+extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
 extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
@@ -593,8 +593,8 @@
     sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
 
 extern int zfs_promote(zfs_handle_t *);
-extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
-    boolean_t, boolean_t, int, uint64_t, uint64_t);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *,
+    boolean_t, boolean_t, int);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
 extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
 extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c	Thu Feb 28 12:44:05 2013 -0800
@@ -1973,10 +1973,7 @@
 	    NULL, NULL, 0, B_TRUE) != 0)
 		goto out;
 	if (strcmp(gca->buf, gca->origin) == 0) {
-		if (nvlist_add_boolean(gca->value, zfs_get_name(zhp)) != 0) {
-			zfs_close(zhp);
-			return (no_memory(zhp->zfs_hdl));
-		}
+		fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
 		gca->numclones--;
 	}
 
@@ -3142,45 +3139,49 @@
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
 		    zhp->zfs_name, snapname);
 	} else {
-		ret = zfs_destroy_snaps_nvl(zhp, dd.nvl, defer);
+		ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
 	}
 	nvlist_free(dd.nvl);
 	return (ret);
 }
 
 /*
- * Destroys all the snapshots named in the nvlist.  They must be underneath
- * the zhp (either snapshots of it, or snapshots of its descendants).
+ * Destroys all the snapshots named in the nvlist.
  */
 int
-zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer)
+zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
 {
 	int ret;
 	nvlist_t *errlist;
 
 	ret = lzc_destroy_snaps(snaps, defer, &errlist);
 
-	if (ret != 0) {
-		for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
-		    pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
-			char errbuf[1024];
-			(void) snprintf(errbuf, sizeof (errbuf),
-			    dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
-			    nvpair_name(pair));
-
-			switch (fnvpair_value_int32(pair)) {
-			case EEXIST:
-				zfs_error_aux(zhp->zfs_hdl,
-				    dgettext(TEXT_DOMAIN,
-				    "snapshot is cloned"));
-				ret = zfs_error(zhp->zfs_hdl, EZFS_EXISTS,
-				    errbuf);
-				break;
-			default:
-				ret = zfs_standard_error(zhp->zfs_hdl, errno,
-				    errbuf);
-				break;
-			}
+	if (ret == 0)
+		return (0);
+
+	if (nvlist_next_nvpair(errlist, NULL) == NULL) {
+		char errbuf[1024];
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
+
+		ret = zfs_standard_error(hdl, ret, errbuf);
+	}
+	for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
+		char errbuf[1024];
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
+		    nvpair_name(pair));
+
+		switch (fnvpair_value_int32(pair)) {
+		case EEXIST:
+			zfs_error_aux(hdl,
+			    dgettext(TEXT_DOMAIN, "snapshot is cloned"));
+			ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
+			break;
+		default:
+			ret = zfs_standard_error(hdl, errno, errbuf);
+			break;
 		}
 	}
 
@@ -4047,7 +4048,7 @@
 
 		zc.zc_nvlist_dst_size = sizeof (buf);
 		if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
-			char errbuf[ZFS_MAXNAMELEN + 32];
+			char errbuf[1024];
 
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN,
@@ -4069,37 +4070,83 @@
 	return (0);
 }
 
+struct holdarg {
+	nvlist_t *nvl;
+	const char *snapname;
+	const char *tag;
+	boolean_t recursive;
+};
+
+static int
+zfs_hold_one(zfs_handle_t *zhp, void *arg)
+{
+	struct holdarg *ha = arg;
+	zfs_handle_t *szhp;
+	char name[ZFS_MAXNAMELEN];
+	int rv = 0;
+
+	(void) snprintf(name, sizeof (name),
+	    "%s@%s", zhp->zfs_name, ha->snapname);
+
+	szhp = make_dataset_handle(zhp->zfs_hdl, name);
+	if (szhp) {
+		fnvlist_add_string(ha->nvl, name, ha->tag);
+		zfs_close(szhp);
+	}
+
+	if (ha->recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha);
+	zfs_close(zhp);
+	return (rv);
+}
+
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
-    boolean_t recursive, boolean_t temphold, boolean_t enoent_ok,
-    int cleanup_fd, uint64_t dsobj, uint64_t createtxg)
+    boolean_t recursive, boolean_t enoent_ok, int cleanup_fd)
 {
-	zfs_cmd_t zc = { 0 };
+	int ret;
+	struct holdarg ha;
+	nvlist_t *errors;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-	ASSERT(!recursive || dsobj == 0);
-
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-	if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
-	    >= sizeof (zc.zc_string))
-		return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
-	zc.zc_cookie = recursive;
-	zc.zc_temphold = temphold;
-	zc.zc_cleanup_fd = cleanup_fd;
-	zc.zc_sendobj = dsobj;
-	zc.zc_createtxg = createtxg;
-
-	if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
-		char errbuf[ZFS_MAXNAMELEN+32];
-
-		/*
-		 * if it was recursive, the one that actually failed will be in
-		 * zc.zc_name.
-		 */
-		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot hold '%s@%s'"), zc.zc_name, snapname);
-		switch (errno) {
+	char errbuf[1024];
+	nvpair_t *elem;
+
+	ha.nvl = fnvlist_alloc();
+	ha.snapname = snapname;
+	ha.tag = tag;
+	ha.recursive = recursive;
+	(void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
+	ret = lzc_hold(ha.nvl, cleanup_fd, &errors);
+	fnvlist_free(ha.nvl);
+
+	if (ret == 0)
+		return (0);
+
+	if (nvlist_next_nvpair(errors, NULL) == NULL) {
+		/* no hold-specific errors */
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot hold"));
+		switch (ret) {
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		default:
+			(void) zfs_standard_error(hdl, ret, errbuf);
+		}
+	}
+
+	for (elem = nvlist_next_nvpair(errors, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(errors, elem)) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot hold snapshot '%s'"), nvpair_name(elem));
+		switch (fnvpair_value_int32(elem)) {
 		case E2BIG:
 			/*
 			 * Temporary tags wind up having the ds object id
@@ -4107,66 +4154,122 @@
 			 * above, it's still possible for the tag to wind
 			 * up being slightly too long.
 			 */
-			return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf));
-		case ENOTSUP:
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "pool must be upgraded"));
-			return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+			(void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
+			break;
 		case EINVAL:
-			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
 		case EEXIST:
-			return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
+			(void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
+			break;
 		case ENOENT:
 			if (enoent_ok)
 				return (ENOENT);
 			/* FALLTHROUGH */
 		default:
-			return (zfs_standard_error_fmt(hdl, errno, errbuf));
+			(void) zfs_standard_error(hdl,
+			    fnvpair_value_int32(elem), errbuf);
 		}
 	}
 
-	return (0);
+	fnvlist_free(errors);
+	return (ret);
+}
+
+struct releasearg {
+	nvlist_t *nvl;
+	const char *snapname;
+	const char *tag;
+	boolean_t recursive;
+};
+
+static int
+zfs_release_one(zfs_handle_t *zhp, void *arg)
+{
+	struct holdarg *ha = arg;
+	zfs_handle_t *szhp;
+	char name[ZFS_MAXNAMELEN];
+	int rv = 0;
+
+	(void) snprintf(name, sizeof (name),
+	    "%s@%s", zhp->zfs_name, ha->snapname);
+
+	szhp = make_dataset_handle(zhp->zfs_hdl, name);
+	if (szhp) {
+		nvlist_t *holds = fnvlist_alloc();
+		fnvlist_add_boolean(holds, ha->tag);
+		fnvlist_add_nvlist(ha->nvl, name, holds);
+		zfs_close(szhp);
+	}
+
+	if (ha->recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
+	zfs_close(zhp);
+	return (rv);
 }
 
 int
 zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive)
 {
-	zfs_cmd_t zc = { 0 };
+	int ret;
+	struct holdarg ha;
+	nvlist_t *errors;
+	nvpair_t *elem;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-	if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
-	    >= sizeof (zc.zc_string))
-		return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
-	zc.zc_cookie = recursive;
-
-	if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
-		char errbuf[ZFS_MAXNAMELEN+32];
-
-		/*
-		 * if it was recursive, the one that actually failed will be in
-		 * zc.zc_name.
-		 */
+	ha.nvl = fnvlist_alloc();
+	ha.snapname = snapname;
+	ha.tag = tag;
+	ha.recursive = recursive;
+	(void) zfs_release_one(zfs_handle_dup(zhp), &ha);
+	ret = lzc_release(ha.nvl, &errors);
+	fnvlist_free(ha.nvl);
+
+	if (ret == 0)
+		return (0);
+
+	if (nvlist_next_nvpair(errors, NULL) == NULL) {
+		/* no release-specific errors */
+		char errbuf[1024];
+
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot release '%s' from '%s@%s'"), tag, zc.zc_name,
-		    snapname);
+		    "cannot release"));
 		switch (errno) {
-		case ESRCH:
-			return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
-			return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
-		case EINVAL:
-			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
 		default:
-			return (zfs_standard_error_fmt(hdl, errno, errbuf));
+			(void) zfs_standard_error_fmt(hdl, errno, errbuf);
 		}
 	}
 
-	return (0);
+	for (elem = nvlist_next_nvpair(errors, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(errors, elem)) {
+		char errbuf[1024];
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot release hold from snapshot '%s'"),
+		    nvpair_name(elem));
+		switch (fnvpair_value_int32(elem)) {
+		case ESRCH:
+			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
+			break;
+		case EINVAL:
+			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		default:
+			(void) zfs_standard_error_fmt(hdl,
+			    fnvpair_value_int32(elem), errbuf);
+		}
+	}
+
+	fnvlist_free(errors);
+	return (ret);
 }
 
 int
@@ -4177,7 +4280,7 @@
 	int nvsz = 2048;
 	void *nvbuf;
 	int err = 0;
-	char errbuf[ZFS_MAXNAMELEN+32];
+	char errbuf[1024];
 
 	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
 	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
@@ -4242,7 +4345,7 @@
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char *nvbuf;
-	char errbuf[ZFS_MAXNAMELEN+32];
+	char errbuf[1024];
 	size_t nvsz;
 	int err;
 
@@ -4293,38 +4396,18 @@
 int
 zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 {
-	zfs_cmd_t zc = { 0 };
-	libzfs_handle_t *hdl = zhp->zfs_hdl;
-	int nvsz = 2048;
-	void *nvbuf;
-	int err = 0;
-	char errbuf[ZFS_MAXNAMELEN+32];
-
-	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
-
-tryagain:
-
-	nvbuf = malloc(nvsz);
-	if (nvbuf == NULL) {
-		err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
-		goto out;
-	}
-
-	zc.zc_nvlist_dst_size = nvsz;
-	zc.zc_nvlist_dst = (uintptr_t)nvbuf;
-
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
-
-	if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) {
+	int err;
+	char errbuf[1024];
+
+	err = lzc_get_holds(zhp->zfs_name, nvl);
+
+	if (err != 0) {
+		libzfs_handle_t *hdl = zhp->zfs_hdl;
+
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
-		    zc.zc_name);
-		switch (errno) {
-		case ENOMEM:
-			free(nvbuf);
-			nvsz = zc.zc_nvlist_dst_size;
-			goto tryagain;
-
+		    zhp->zfs_name);
+		switch (err) {
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
@@ -4340,19 +4423,8 @@
 			err = zfs_standard_error_fmt(hdl, errno, errbuf);
 			break;
 		}
-	} else {
-		/* success */
-		int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
-		if (rc) {
-			(void) snprintf(errbuf, sizeof (errbuf),
-			    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
-			    zc.zc_name);
-			err = zfs_standard_error_fmt(hdl, rc, errbuf);
-		}
 	}
 
-	free(nvbuf);
-out:
 	return (err);
 }
 
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Thu Feb 28 12:44:05 2013 -0800
@@ -972,9 +972,7 @@
 	 */
 	if (pzhp) {
 		error = zfs_hold(pzhp, thissnap, sdd->holdtag,
-		    B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd,
-		    zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID),
-		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG));
+		    B_FALSE, B_TRUE, sdd->cleanup_fd);
 		zfs_close(pzhp);
 	}
 
@@ -1713,12 +1711,11 @@
 		err = ENOENT;
 	}
 
-	if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) {
+	if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) {
 		seq++;
 
-		(void) strncpy(newname, name, baselen);
-		(void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen,
-		    "recv-%u-%u", getpid(), seq);
+		(void) snprintf(newname, ZFS_MAXNAMELEN, "%.*srecv-%u-%u",
+		    baselen, name, getpid(), seq);
 		(void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value));
 
 		if (flags->verbose) {
@@ -2643,7 +2640,6 @@
 	/*
 	 * Determine name of destination snapshot, store in zc_value.
 	 */
-	(void) strcpy(zc.zc_top_ds, tosnap);
 	(void) strcpy(zc.zc_value, tosnap);
 	(void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
 	free(cp);
--- a/usr/src/lib/libzfs_core/common/libzfs_core.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.c	Thu Feb 28 12:44:05 2013 -0800
@@ -132,6 +132,7 @@
 	zc.zc_nvlist_src_size = size;
 
 	if (resultp != NULL) {
+		*resultp = NULL;
 		zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
 		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
 		    malloc(zc.zc_nvlist_dst_size);
@@ -159,8 +160,6 @@
 	if (zc.zc_nvlist_dst_filled) {
 		*resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
 		    zc.zc_nvlist_dst_size);
-	} else if (resultp != NULL) {
-		*resultp = NULL;
 	}
 
 out:
@@ -209,7 +208,7 @@
  * The value will be the (int32) error code.
  *
  * The return value will be 0 if all snapshots were created, otherwise it will
- * be the errno of a (undetermined) snapshot that failed.
+ * be the errno of an (unspecified) snapshot that failed.
  */
 int
 lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
@@ -258,7 +257,7 @@
  * The return value will be 0 if all snapshots were destroyed (or marked for
  * later destruction if 'defer' is set) or didn't exist to begin with.
  *
- * Otherwise the return value will be the errno of a (undetermined) snapshot
+ * Otherwise the return value will be the errno of an (unspecified) snapshot
  * that failed, no snapshots will be destroyed, and the errlist will have an
  * entry for each snapshot that failed.  The value in the errlist will be
  * the (int32) error code.
@@ -333,6 +332,101 @@
 }
 
 /*
+ * Create "user holds" on snapshots.  If there is a hold on a snapshot,
+ * the snapshot cannot be destroyed.  (However, it can be marked for deletion
+ * by lzc_destroy_snaps(defer=B_TRUE).)
+ *
+ * The keys in the nvlist are snapshot names.
+ * The snapshots must all be in the same pool.
+ * The value is the name of the hold (string type).
+ *
+ * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
+ * In this case, when the cleanup_fd is closed (including on process
+ * termination), the holds will be released.  If the system is shut down
+ * uncleanly, the holds will be released when the pool is next opened
+ * or imported.
+ *
+ * The return value will be 0 if all holds were created. Otherwise the return
+ * value will be the errno of a (unspecified) hold that failed, no holds will
+ * be created, and the errlist will have an entry for each hold that
+ * failed (name = snapshot).  The value in the errlist will be the error
+ * code (int32).
+ */
+int
+lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
+{
+	char pool[MAXNAMELEN];
+	nvlist_t *args;
+	nvpair_t *elem;
+	int error;
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(holds, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
+
+	args = fnvlist_alloc();
+	fnvlist_add_nvlist(args, "holds", holds);
+	if (cleanup_fd != -1)
+		fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
+
+	error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
+	nvlist_free(args);
+	return (error);
+}
+
+/*
+ * Release "user holds" on snapshots.  If the snapshot has been marked for
+ * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
+ * any clones, and all the user holds are removed, then the snapshot will be
+ * destroyed.
+ *
+ * The keys in the nvlist are snapshot names.
+ * The snapshots must all be in the same pool.
+ * The value is a nvlist whose keys are the holds to remove.
+ *
+ * The return value will be 0 if all holds were removed.
+ * Otherwise the return value will be the errno of an (unspecified) release
+ * that failed, no holds will be released, and the errlist will have an
+ * entry for each snapshot that has failed releases (name = snapshot).
+ * The value in the errlist will be the error code (int32) of a failed release.
+ */
+int
+lzc_release(nvlist_t *holds, nvlist_t **errlist)
+{
+	char pool[MAXNAMELEN];
+	nvpair_t *elem;
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(holds, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
+
+	return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
+}
+
+/*
+ * Retrieve list of user holds on the specified snapshot.
+ *
+ * On success, *holdsp will be set to a nvlist which the caller must free.
+ * The keys are the names of the holds, and the value is the creation time
+ * of the hold (uint64) in seconds since the epoch.
+ */
+int
+lzc_get_holds(const char *snapname, nvlist_t **holdsp)
+{
+	int error;
+	nvlist_t *innvl = fnvlist_alloc();
+	error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp);
+	fnvlist_free(innvl);
+	return (error);
+}
+
+/*
  * If fromsnap is NULL, a full (non-incremental) stream will be sent.
  */
 int
--- a/usr/src/lib/libzfs_core/common/libzfs_core.h	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.h	Thu Feb 28 12:44:05 2013 -0800
@@ -46,6 +46,10 @@
 int lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
     uint64_t *usedp);
 
+int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist);
+int lzc_release(nvlist_t *holds, nvlist_t **errlist);
+int lzc_get_holds(const char *snapname, nvlist_t **holdsp);
+
 int lzc_send(const char *snapname, const char *fromsnap, int fd);
 int lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
     boolean_t force, int fd);
--- a/usr/src/lib/libzfs_core/common/mapfile-vers	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzfs_core/common/mapfile-vers	Thu Feb 28 12:44:05 2013 -0800
@@ -45,7 +45,10 @@
 	lzc_create;
 	lzc_destroy_snaps;
 	lzc_exists;
+	lzc_get_holds;
+	lzc_hold;
 	lzc_receive;
+	lzc_release;
 	lzc_send;
 	lzc_send_space;
 	lzc_snaprange_space;
--- a/usr/src/lib/libzpool/common/kernel.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzpool/common/kernel.c	Thu Feb 28 12:44:05 2013 -0800
@@ -34,6 +34,7 @@
 #include <sys/stat.h>
 #include <sys/processor.h>
 #include <sys/zfs_context.h>
+#include <sys/rrwlock.h>
 #include <sys/zmod.h>
 #include <sys/utsname.h>
 #include <sys/systeminfo.h>
@@ -859,6 +860,8 @@
 void
 kernel_init(int mode)
 {
+	extern uint_t rrw_tsd_key;
+
 	umem_nofail_callback(umem_out_of_memory);
 
 	physmem = sysconf(_SC_PHYS_PAGES);
@@ -877,6 +880,8 @@
 	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	spa_init(mode);
+
+	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 }
 
 void
--- a/usr/src/lib/libzpool/common/llib-lzpool	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzpool/common/llib-lzpool	Thu Feb 28 12:44:05 2013 -0800
@@ -57,6 +57,9 @@
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfeature.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Feb 28 12:44:05 2013 -0800
@@ -61,6 +61,8 @@
 #include <dirent.h>
 #include <time.h>
 #include <procfs.h>
+#include <pthread.h>
+#include <sys/debug.h>
 #include <libsysevent.h>
 #include <sys/note.h>
 #include <sys/types.h>
@@ -224,6 +226,9 @@
 #undef RW_WRITE_HELD
 #define	RW_WRITE_HELD(x)	_rw_write_held(&(x)->rw_lock)
 
+#undef RW_LOCK_HELD
+#define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
+
 extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
 extern void rw_destroy(krwlock_t *rwlp);
 extern void rw_enter(krwlock_t *rwlp, krw_t rw);
@@ -253,6 +258,14 @@
 extern void cv_broadcast(kcondvar_t *cv);
 
 /*
+ * Thread-specific data
+ */
+#define	tsd_get(k) pthread_getspecific(k)
+#define	tsd_set(k, v) pthread_setspecific(k, v)
+#define	tsd_create(kp, d) pthread_key_create(kp, d)
+#define	tsd_destroy(kp) /* nothing */
+
+/*
  * kstat creation, installation and deletion
  */
 extern kstat_t *kstat_create(const char *, int,
@@ -519,7 +532,7 @@
 #define	INGLOBALZONE(z)			(1)
 
 extern char *kmem_asprintf(const char *fmt, ...);
-#define	strfree(str) kmem_free((str), strlen(str)+1)
+#define	strfree(str) kmem_free((str), strlen(str) + 1)
 
 /*
  * Hostname information
--- a/usr/src/man/man1m/zfs.1m	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/man/man1m/zfs.1m	Thu Feb 28 12:44:05 2013 -0800
@@ -1868,7 +1868,9 @@
 .ad
 .sp .6
 .RS 4n
-Recursively destroy all dependents.
+Recursively destroy all clones of these snapshots, including the clones,
+snapshots, and children.  If this flag is specified, the \fB-d\fR flag will
+have no effect.
 .RE
 
 .sp
@@ -1904,7 +1906,7 @@
 .RE
 
 .sp
-Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR
+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR
 options, as they can destroy large portions of a pool and cause unexpected
 behavior for mounted file systems in use.
 .RE
--- a/usr/src/uts/common/Makefile.files	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/Makefile.files	Thu Feb 28 12:44:05 2013 -0800
@@ -1346,8 +1346,10 @@
 	dsl_dir.o		\
 	dsl_dataset.o		\
 	dsl_deadlist.o		\
+	dsl_destroy.o		\
 	dsl_pool.o		\
 	dsl_synctask.o		\
+	dsl_userhold.o		\
 	dmu_zfetch.o		\
 	dsl_deleg.o		\
 	dsl_prop.o		\
@@ -1358,6 +1360,7 @@
 	lzjb.o			\
 	metaslab.o		\
 	refcount.o		\
+	rrwlock.o		\
 	sa.o			\
 	sha256.o		\
 	spa.o			\
@@ -1417,7 +1420,6 @@
 	zfs_onexit.o		\
 	zfs_replay.o		\
 	zfs_rlock.o		\
-	rrwlock.o		\
 	zfs_vfsops.o		\
 	zfs_vnops.o		\
 	zvol.o
--- a/usr/src/uts/common/fs/zfs/arc.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/arc.c	Thu Feb 28 12:44:05 2013 -0800
@@ -1633,12 +1633,12 @@
 	}
 }
 
-int
+boolean_t
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
-	int no_callback = (buf->b_efunc == NULL);
+	boolean_t no_callback = (buf->b_efunc == NULL);
 
 	if (hdr->b_state == arc_anon) {
 		ASSERT(hdr->b_datacnt == 1);
@@ -1843,7 +1843,7 @@
 		ARCSTAT_INCR(arcstat_mutex_miss, missed);
 
 	/*
-	 * We have just evicted some date into the ghost state, make
+	 * We have just evicted some data into the ghost state, make
 	 * sure we also adjust the ghost state size if necessary.
 	 */
 	if (arc_no_grow &&
@@ -2622,7 +2622,7 @@
 {
 	if (zio == NULL || zio->io_error == 0)
 		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
-	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+	VERIFY(arc_buf_remove_ref(buf, arg));
 }
 
 /* a generic arc_done_func_t */
@@ -2631,7 +2631,7 @@
 {
 	arc_buf_t **bufp = arg;
 	if (zio && zio->io_error) {
-		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+		VERIFY(arc_buf_remove_ref(buf, arg));
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
--- a/usr/src/uts/common/fs/zfs/bplist.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/bplist.c	Thu Feb 28 12:44:05 2013 -0800
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bplist.h>
@@ -52,6 +53,12 @@
 	mutex_exit(&bpl->bpl_lock);
 }
 
+/*
+ * To aid debugging, we keep the most recently removed entry.  This way if
+ * we are in the callback, we can easily locate the entry.
+ */
+static bplist_entry_t *bplist_iterate_last_removed;
+
 void
 bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 {
@@ -59,6 +66,7 @@
 
 	mutex_enter(&bpl->bpl_lock);
 	while (bpe = list_head(&bpl->bpl_list)) {
+		bplist_iterate_last_removed = bpe;
 		list_remove(&bpl->bpl_list, bpe);
 		mutex_exit(&bpl->bpl_lock);
 		func(arg, &bpe->bpe_blk, tx);
--- a/usr/src/uts/common/fs/zfs/bpobj.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/bpobj.c	Thu Feb 28 12:44:05 2013 -0800
@@ -392,6 +392,10 @@
 		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 	}
 
+	dmu_object_info_t doi;
+	ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+
 	mutex_enter(&bpo->bpo_lock);
 	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Thu Feb 28 12:44:05 2013 -0800
@@ -39,7 +39,7 @@
 #include <sys/sa_impl.h>
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
-static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
 /*
@@ -499,7 +499,7 @@
 	} else {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		VERIFY(arc_buf_remove_ref(buf, db));
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
@@ -828,10 +828,12 @@
 			continue;
 
 		/* found a level 0 buffer in the range */
-		if (dbuf_undirty(db, tx))
+		mutex_enter(&db->db_mtx);
+		if (dbuf_undirty(db, tx)) {
+			/* mutex has been dropped and dbuf destroyed */
 			continue;
+		}
 
-		mutex_enter(&db->db_mtx);
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
@@ -958,7 +960,7 @@
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
-	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+	VERIFY(arc_buf_remove_ref(obuf, db));
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
@@ -1258,7 +1260,10 @@
 	return (dr);
 }
 
-static int
+/*
+ * Return TRUE if this evicted the dbuf.
+ */
+static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
@@ -1267,18 +1272,17 @@
 
 	ASSERT(txg != 0);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT0(db->db_level);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
 
-	mutex_enter(&db->db_mtx);
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg <= txg)
 			break;
-	if (dr == NULL || dr->dr_txg < txg) {
-		mutex_exit(&db->db_mtx);
-		return (0);
-	}
+	if (dr == NULL || dr->dr_txg < txg)
+		return (B_FALSE);
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 
@@ -1286,24 +1290,12 @@
 	dn = DB_DNODE(db);
 
 	/*
-	 * If this buffer is currently held, we cannot undirty
-	 * it, since one of the current holders may be in the
-	 * middle of an update.  Note that users of dbuf_undirty()
-	 * should not place a hold on the dbuf before the call.
-	 * Also note: we can get here with a spill block, so
-	 * test for that similar to how dbuf_dirty does.
+	 * Note:  This code will probably work even if there are concurrent
+	 * holders, but it is untested in that scenario, as the ZPL and
+	 * ztest have additional locking (the range locks) that prevents
+	 * that type of concurrent access.
 	 */
-	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
-		mutex_exit(&db->db_mtx);
-		/* Make sure we don't toss this buffer at sync phase */
-		if (db->db_blkid != DMU_SPILL_BLKID) {
-			mutex_enter(&dn->dn_mtx);
-			dnode_clear_range(dn, db->db_blkid, 1, tx);
-			mutex_exit(&dn->dn_mtx);
-		}
-		DB_DNODE_EXIT(db);
-		return (0);
-	}
+	ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
@@ -1332,21 +1324,13 @@
 	}
 	DB_DNODE_EXIT(db);
 
-	if (db->db_level == 0) {
-		if (db->db_state != DB_NOFILL) {
-			dbuf_unoverride(dr);
+	if (db->db_state != DB_NOFILL) {
+		dbuf_unoverride(dr);
 
-			ASSERT(db->db_buf != NULL);
-			ASSERT(dr->dt.dl.dr_data != NULL);
-			if (dr->dt.dl.dr_data != db->db_buf)
-				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-				    db) == 1);
-		}
-	} else {
 		ASSERT(db->db_buf != NULL);
-		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-		mutex_destroy(&dr->dt.di.dr_mtx);
-		list_destroy(&dr->dt.di.dr_children);
+		ASSERT(dr->dt.dl.dr_data != NULL);
+		if (dr->dt.dl.dr_data != db->db_buf)
+			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
@@ -1358,13 +1342,12 @@
 
 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
 		dbuf_set_data(db, NULL);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		VERIFY(arc_buf_remove_ref(buf, db));
 		dbuf_evict(db);
-		return (1);
+		return (B_TRUE);
 	}
 
-	mutex_exit(&db->db_mtx);
-	return (0);
+	return (B_FALSE);
 }
 
 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
@@ -1463,7 +1446,7 @@
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		VERIFY(arc_buf_remove_ref(buf, db));
 		xuio_stat_wbuf_copied();
 		return;
 	}
@@ -1481,10 +1464,10 @@
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
-			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			arc_release(db->db_buf, db);
-			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		}
 		db->db_buf = NULL;
 	}
@@ -2067,10 +2050,10 @@
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_set_data(db, NULL);
-			VERIFY(arc_buf_remove_ref(buf, db) == 1);
+			VERIFY(arc_buf_remove_ref(buf, db));
 			dbuf_evict(db);
 		} else {
-			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
 
 			/*
 			 * A dbuf will be eligible for eviction if either the
@@ -2567,7 +2550,7 @@
 		if (db->db_state != DB_NOFILL) {
 			if (dr->dt.dl.dr_data != db->db_buf)
 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-				    db) == 1);
+				    db));
 			else if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
--- a/usr/src/uts/common/fs/zfs/dmu.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Thu Feb 28 12:44:05 2013 -0800
@@ -1194,7 +1194,7 @@
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
-	VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
+	VERIFY(arc_buf_remove_ref(buf, FTAG));
 }
 
 /*
--- a/usr/src/uts/common/fs/zfs/dmu_diff.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu_diff.c	Thu Feb 28 12:44:05 2013 -0800
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -155,51 +156,49 @@
 }
 
 int
-dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp)
+dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+    struct vnode *vp, offset_t *offp)
 {
 	struct diffarg da;
-	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-	dsl_dataset_t *fromds = fromsnap->os_dsl_dataset;
-	dsl_dataset_t *findds;
-	dsl_dataset_t *relds;
-	int err = 0;
+	dsl_dataset_t *fromsnap;
+	dsl_dataset_t *tosnap;
+	dsl_pool_t *dp;
+	int error;
+	uint64_t fromtxg;
 
-	/* make certain we are looking at snapshots */
-	if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds))
+	if (strchr(tosnap_name, '@') == NULL ||
+	    strchr(fromsnap_name, '@') == NULL)
 		return (EINVAL);
 
-	/* fromsnap must be earlier and from the same lineage as tosnap */
-	if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)
-		return (EXDEV);
-
-	relds = NULL;
-	findds = ds;
-
-	while (fromds->ds_dir != findds->ds_dir) {
-		dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	error = dsl_pool_hold(tosnap_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
 
-		if (!dsl_dir_is_clone(findds->ds_dir)) {
-			if (relds)
-				dsl_dataset_rele(relds, FTAG);
-			return (EXDEV);
-		}
-
-		rw_enter(&dp->dp_config_rwlock, RW_READER);
-		err = dsl_dataset_hold_obj(dp,
-		    findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds);
-		rw_exit(&dp->dp_config_rwlock);
-
-		if (relds)
-			dsl_dataset_rele(relds, FTAG);
-
-		if (err)
-			return (EXDEV);
-
-		relds = findds;
+	error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
 	}
 
-	if (relds)
-		dsl_dataset_rele(relds, FTAG);
+	error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
+	if (error != 0) {
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	if (!dsl_dataset_is_before(tosnap, fromsnap)) {
+		dsl_dataset_rele(fromsnap, FTAG);
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (EXDEV);
+	}
+
+	fromtxg = fromsnap->ds_phys->ds_creation_txg;
+	dsl_dataset_rele(fromsnap, FTAG);
+
+	dsl_dataset_long_hold(tosnap, FTAG);
+	dsl_pool_rele(dp, FTAG);
 
 	da.da_vp = vp;
 	da.da_offp = offp;
@@ -207,15 +206,18 @@
 	da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
 	da.da_err = 0;
 
-	err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg,
+	error = traverse_dataset(tosnap, fromtxg,
 	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
 
-	if (err) {
-		da.da_err = err;
+	if (error != 0) {
+		da.da_err = error;
 	} else {
 		/* we set the da.da_err we return as side-effect */
 		(void) write_record(&da);
 	}
 
+	dsl_dataset_long_rele(tosnap, FTAG);
+	dsl_dataset_rele(tosnap, FTAG);
+
 	return (da.da_err);
 }
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Feb 28 12:44:05 2013 -0800
@@ -44,6 +44,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
@@ -280,7 +281,7 @@
 		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
-		if (err) {
+		if (err != 0) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
@@ -320,34 +321,49 @@
 	 * checksum/compression/copies.
 	 */
 	if (ds) {
-		err = dsl_prop_register(ds, "primarycache",
+		err = dsl_prop_register(ds,
+		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
-		if (err == 0)
-			err = dsl_prop_register(ds, "secondarycache",
+		if (err == 0) {
+			err = dsl_prop_register(ds,
+			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
+		}
 		if (!dsl_dataset_is_snapshot(ds)) {
-			if (err == 0)
-				err = dsl_prop_register(ds, "checksum",
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 				    checksum_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "compression",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    compression_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "copies",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_COPIES),
 				    copies_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "dedup",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_DEDUP),
 				    dedup_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "logbias",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 				    logbias_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "sync",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_SYNC),
 				    sync_changed_cb, os);
+			}
 		}
-		if (err) {
+		if (err != 0) {
 			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
-			    &os->os_phys_buf) == 1);
+			    &os->os_phys_buf));
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
@@ -425,44 +441,66 @@
 	return (err);
 }
 
-/* called from zpl */
+/*
+ * Holds the pool while the objset is held.  Therefore only one objset
+ * can be held at a time.
+ */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
+	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
-	err = dsl_dataset_hold(name, tag, &ds);
-	if (err)
+	err = dsl_pool_hold(name, tag, &dp);
+	if (err != 0)
 		return (err);
+	err = dsl_dataset_hold(dp, name, tag, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, tag);
+		return (err);
+	}
 
 	err = dmu_objset_from_ds(ds, osp);
-	if (err)
+	if (err != 0) {
 		dsl_dataset_rele(ds, tag);
+		dsl_pool_rele(dp, tag);
+	}
 
 	return (err);
 }
 
-/* called from zpl */
+/*
+ * dsl_pool must not be held when this is called.
+ * Upon successful return, there will be a longhold on the dataset,
+ * and the dsl_pool will not be held.
+ */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
+	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
-	err = dsl_dataset_own(name, B_FALSE, tag, &ds);
-	if (err)
+	err = dsl_pool_hold(name, FTAG, &dp);
+	if (err != 0)
 		return (err);
+	err = dsl_dataset_own(dp, name, tag, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
 
 	err = dmu_objset_from_ds(ds, osp);
-	if (err) {
+	dsl_pool_rele(dp, FTAG);
+	if (err != 0) {
 		dsl_dataset_disown(ds, tag);
 	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-		dmu_objset_disown(*osp, tag);
+		dsl_dataset_disown(ds, tag);
 		return (EINVAL);
 	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
-		dmu_objset_disown(*osp, tag);
+		dsl_dataset_disown(ds, tag);
 		return (EROFS);
 	}
 	return (err);
@@ -471,7 +509,9 @@
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
+	dsl_pool_t *dp = dmu_objset_pool(os);
 	dsl_dataset_rele(os->os_dsl_dataset, tag);
+	dsl_pool_rele(dp, tag);
 }
 
 void
@@ -480,7 +520,7 @@
 	dsl_dataset_disown(os->os_dsl_dataset, tag);
 }
 
-int
+void
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	dnode_t *dn;
@@ -515,9 +555,7 @@
 		mutex_enter(&os->os_lock);
 		dn = next_dn;
 	}
-	dn = list_head(&os->os_dnodes);
 	mutex_exit(&os->os_lock);
-	return (dn != DMU_META_DNODE(os));
 }
 
 void
@@ -530,33 +568,37 @@
 
 	if (ds) {
 		if (!dsl_dataset_is_snapshot(ds)) {
-			VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 			    checksum_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "compression",
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 			    compression_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "copies",
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_COPIES),
 			    copies_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_DEDUP),
 			    dedup_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "logbias",
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 			    logbias_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "sync",
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_SYNC),
 			    sync_changed_cb, os));
 		}
-		VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
+		VERIFY0(dsl_prop_unregister(ds,
+		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os));
-		VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
+		VERIFY0(dsl_prop_unregister(ds,
+		    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 		    secondary_cache_changed_cb, os));
 	}
 
 	if (os->os_sa)
 		sa_tear_down(os);
 
-	/*
-	 * We should need only a single pass over the dnode list, since
-	 * nothing can be added to the list at this point.
-	 */
-	(void) dmu_objset_evict_dbufs(os);
+	dmu_objset_evict_dbufs(os);
 
 	dnode_special_close(&os->os_meta_dnode);
 	if (DMU_USERUSED_DNODE(os)) {
@@ -567,7 +609,7 @@
 
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
-	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
 	/*
 	 * This is a barrier to prevent the objset from going away in
@@ -599,10 +641,11 @@
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
+
 	if (ds != NULL)
-		VERIFY(0 == dmu_objset_from_ds(ds, &os));
+		VERIFY0(dmu_objset_from_ds(ds, &os));
 	else
-		VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
+		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 
 	mdn = DMU_META_DNODE(os);
 
@@ -650,359 +693,181 @@
 	return (os);
 }
 
-struct oscarg {
-	void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-	void *userarg;
-	dsl_dataset_t *clone_origin;
-	const char *lastname;
-	dmu_objset_type_t type;
-	uint64_t flags;
-	cred_t *cr;
-};
+typedef struct dmu_objset_create_arg {
+	const char *doca_name;
+	cred_t *doca_cred;
+	void (*doca_userfunc)(objset_t *os, void *arg,
+	    cred_t *cr, dmu_tx_t *tx);
+	void *doca_userarg;
+	dmu_objset_type_t doca_type;
+	uint64_t doca_flags;
+} dmu_objset_create_arg_t;
 
 /*ARGSUSED*/
 static int
-dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	struct oscarg *oa = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	int err;
-	uint64_t ddobj;
+	dmu_objset_create_arg_t *doca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *pdd;
+	const char *tail;
+	int error;
 
-	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-	    oa->lastname, sizeof (uint64_t), 1, &ddobj);
-	if (err != ENOENT)
-		return (err ? err : EEXIST);
+	if (strchr(doca->doca_name, '@') != NULL)
+		return (EINVAL);
 
-	if (oa->clone_origin != NULL) {
-		/* You can't clone across pools. */
-		if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
-			return (EXDEV);
-
-		/* You can only clone snapshots, not the head datasets. */
-		if (!dsl_dataset_is_snapshot(oa->clone_origin))
-			return (EINVAL);
+	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
+	if (error != 0)
+		return (error);
+	if (tail == NULL) {
+		dsl_dir_rele(pdd, FTAG);
+		return (EEXIST);
 	}
+	dsl_dir_rele(pdd, FTAG);
 
 	return (0);
 }
 
 static void
-dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	spa_t *spa = dd->dd_pool->dp_spa;
-	struct oscarg *oa = arg2;
+	dmu_objset_create_arg_t *doca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *pdd;
+	const char *tail;
+	dsl_dataset_t *ds;
 	uint64_t obj;
-	dsl_dataset_t *ds;
 	blkptr_t *bp;
-
-	ASSERT(dmu_tx_is_syncing(tx));
+	objset_t *os;
 
-	obj = dsl_dataset_create_sync(dd, oa->lastname,
-	    oa->clone_origin, oa->flags, oa->cr, tx);
+	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
-	VERIFY3U(0, ==, dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds));
+	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
+	    doca->doca_cred, tx);
+
+	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 	bp = dsl_dataset_get_blkptr(ds);
-	if (BP_IS_HOLE(bp)) {
-		objset_t *os =
-		    dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
+	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
+	    ds, bp, doca->doca_type, tx);
 
-		if (oa->userfunc)
-			oa->userfunc(os, oa->userarg, oa->cr, tx);
+	if (doca->doca_userfunc != NULL) {
+		doca->doca_userfunc(os, doca->doca_userarg,
+		    doca->doca_cred, tx);
 	}
 
-	if (oa->clone_origin == NULL) {
-		spa_history_log_internal_ds(ds, "create", tx, "");
-	} else {
-		char namebuf[MAXNAMELEN];
-		dsl_dataset_name(oa->clone_origin, namebuf);
-		spa_history_log_internal_ds(ds, "clone", tx,
-		    "origin=%s (%llu)", namebuf, oa->clone_origin->ds_object);
-	}
+	spa_history_log_internal_ds(ds, "create", tx, "");
 	dsl_dataset_rele(ds, FTAG);
+	dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
-	dsl_dir_t *pdd;
-	const char *tail;
-	int err = 0;
-	struct oscarg oa = { 0 };
-
-	ASSERT(strchr(name, '@') == NULL);
-	err = dsl_dir_open(name, FTAG, &pdd, &tail);
-	if (err)
-		return (err);
-	if (tail == NULL) {
-		dsl_dir_close(pdd, FTAG);
-		return (EEXIST);
-	}
-
-	oa.userfunc = func;
-	oa.userarg = arg;
-	oa.lastname = tail;
-	oa.type = type;
-	oa.flags = flags;
-	oa.cr = CRED();
-
-	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
-	    dmu_objset_create_sync, pdd, &oa, 5);
-	dsl_dir_close(pdd, FTAG);
-	return (err);
-}
+	dmu_objset_create_arg_t doca;
 
-int
-dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
-{
-	dsl_dir_t *pdd;
-	const char *tail;
-	int err = 0;
-	struct oscarg oa = { 0 };
-
-	ASSERT(strchr(name, '@') == NULL);
-	err = dsl_dir_open(name, FTAG, &pdd, &tail);
-	if (err)
-		return (err);
-	if (tail == NULL) {
-		dsl_dir_close(pdd, FTAG);
-		return (EEXIST);
-	}
+	doca.doca_name = name;
+	doca.doca_cred = CRED();
+	doca.doca_flags = flags;
+	doca.doca_userfunc = func;
+	doca.doca_userarg = arg;
+	doca.doca_type = type;
 
-	oa.lastname = tail;
-	oa.clone_origin = clone_origin;
-	oa.flags = flags;
-	oa.cr = CRED();
-
-	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
-	    dmu_objset_create_sync, pdd, &oa, 5);
-	dsl_dir_close(pdd, FTAG);
-	return (err);
-}
-
-int
-dmu_objset_destroy(const char *name, boolean_t defer)
-{
-	dsl_dataset_t *ds;
-	int error;
-
-	error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
-	if (error == 0) {
-		error = dsl_dataset_destroy(ds, FTAG, defer);
-		/* dsl_dataset_destroy() closes the ds. */
-	}
-
-	return (error);
+	return (dsl_sync_task(name,
+	    dmu_objset_create_check, dmu_objset_create_sync, &doca, 5));
 }
 
-typedef struct snapallarg {
-	dsl_sync_task_group_t *saa_dstg;
-	boolean_t saa_needsuspend;
-	nvlist_t *saa_props;
+typedef struct dmu_objset_clone_arg {
+	const char *doca_clone;
+	const char *doca_origin;
+	cred_t *doca_cred;
+} dmu_objset_clone_arg_t;
 
-	/* the following are used only if 'temporary' is set: */
-	boolean_t saa_temporary;
-	const char *saa_htag;
-	struct dsl_ds_holdarg *saa_ha;
-	dsl_dataset_t *saa_newds;
-} snapallarg_t;
+/*ARGSUSED*/
+static int
+dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_objset_clone_arg_t *doca = arg;
+	dsl_dir_t *pdd;
+	const char *tail;
+	int error;
+	dsl_dataset_t *origin;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 
-typedef struct snaponearg {
-	const char *soa_longname; /* long snap name */
-	const char *soa_snapname; /* short snap name */
-	snapallarg_t *soa_saa;
-} snaponearg_t;
+	if (strchr(doca->doca_clone, '@') != NULL)
+		return (EINVAL);
 
-static int
-snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	objset_t *os = arg1;
-	snaponearg_t *soa = arg2;
-	snapallarg_t *saa = soa->soa_saa;
-	int error;
+	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
+	if (error != 0)
+		return (error);
+	if (tail == NULL) {
+		dsl_dir_rele(pdd, FTAG);
+		return (EEXIST);
+	}
+	/* You can't clone across pools. */
+	if (pdd->dd_pool != dp) {
+		dsl_dir_rele(pdd, FTAG);
+		return (EXDEV);
+	}
+	dsl_dir_rele(pdd, FTAG);
 
-	/* The props have already been checked by zfs_check_userprops(). */
-
-	error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
-	    soa->soa_snapname, tx);
-	if (error)
+	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
+	if (error != 0)
 		return (error);
 
-	if (saa->saa_temporary) {
-		/*
-		 * Ideally we would just call
-		 * dsl_dataset_user_hold_check() and
-		 * dsl_dataset_destroy_check() here.  However the
-		 * dataset we want to hold and destroy is the snapshot
-		 * that we just confirmed we can create, but it won't
-		 * exist until after these checks are run.  Do any
-		 * checks we can here and if more checks are added to
-		 * those routines in the future, similar checks may be
-		 * necessary here.
-		 */
-		if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
-			return (ENOTSUP);
-		/*
-		 * Not checking number of tags because the tag will be
-		 * unique, as it will be the only tag.
-		 */
-		if (strlen(saa->saa_htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
-			return (E2BIG);
-
-		saa->saa_ha = kmem_alloc(sizeof (struct dsl_ds_holdarg),
-		    KM_SLEEP);
-		saa->saa_ha->temphold = B_TRUE;
-		saa->saa_ha->htag = saa->saa_htag;
-	}
-	return (error);
-}
-
-static void
-snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	objset_t *os = arg1;
-	dsl_dataset_t *ds = os->os_dsl_dataset;
-	snaponearg_t *soa = arg2;
-	snapallarg_t *saa = soa->soa_saa;
-
-	dsl_dataset_snapshot_sync(ds, soa->soa_snapname, tx);
-
-	if (saa->saa_props != NULL) {
-		dsl_props_arg_t pa;
-		pa.pa_props = saa->saa_props;
-		pa.pa_source = ZPROP_SRC_LOCAL;
-		dsl_props_set_sync(ds->ds_prev, &pa, tx);
+	/* You can't clone across pools. */
+	if (origin->ds_dir->dd_pool != dp) {
+		dsl_dataset_rele(origin, FTAG);
+		return (EXDEV);
 	}
 
-	if (saa->saa_temporary) {
-		struct dsl_ds_destroyarg da;
-
-		dsl_dataset_user_hold_sync(ds->ds_prev, saa->saa_ha, tx);
-		kmem_free(saa->saa_ha, sizeof (struct dsl_ds_holdarg));
-		saa->saa_ha = NULL;
-		saa->saa_newds = ds->ds_prev;
-
-		da.ds = ds->ds_prev;
-		da.defer = B_TRUE;
-		dsl_dataset_destroy_sync(&da, FTAG, tx);
+	/* You can only clone snapshots, not the head datasets. */
+	if (!dsl_dataset_is_snapshot(origin)) {
+		dsl_dataset_rele(origin, FTAG);
+		return (EINVAL);
 	}
-}
-
-static int
-snapshot_one_impl(const char *snapname, void *arg)
-{
-	char fsname[MAXPATHLEN];
-	snapallarg_t *saa = arg;
-	snaponearg_t *soa;
-	objset_t *os;
-	int err;
-
-	(void) strlcpy(fsname, snapname, sizeof (fsname));
-	strchr(fsname, '@')[0] = '\0';
-
-	err = dmu_objset_hold(fsname, saa, &os);
-	if (err != 0)
-		return (err);
-
-	/*
-	 * If the objset is in an inconsistent state (eg, in the process
-	 * of being destroyed), don't snapshot it.
-	 */
-	if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
-		dmu_objset_rele(os, saa);
-		return (EBUSY);
-	}
-
-	if (saa->saa_needsuspend) {
-		err = zil_suspend(dmu_objset_zil(os));
-		if (err) {
-			dmu_objset_rele(os, saa);
-			return (err);
-		}
-	}
-
-	soa = kmem_zalloc(sizeof (*soa), KM_SLEEP);
-	soa->soa_saa = saa;
-	soa->soa_longname = snapname;
-	soa->soa_snapname = strchr(snapname, '@') + 1;
-
-	dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync,
-	    os, soa, 3);
+	dsl_dataset_rele(origin, FTAG);
 
 	return (0);
 }
 
-/*
- * The snapshots must all be in the same pool.
- */
-int
-dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+static void
+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_sync_task_t *dst;
-	snapallarg_t saa = { 0 };
-	spa_t *spa;
-	int rv = 0;
-	int err;
-	nvpair_t *pair;
+	dmu_objset_clone_arg_t *doca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *pdd;
+	const char *tail;
+	dsl_dataset_t *origin, *ds;
+	uint64_t obj;
+	char namebuf[MAXNAMELEN];
 
-	pair = nvlist_next_nvpair(snaps, NULL);
-	if (pair == NULL)
-		return (0);
+	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
+	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 
-	err = spa_open(nvpair_name(pair), &spa, FTAG);
-	if (err)
-		return (err);
-	saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	saa.saa_props = props;
-	saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
+	    doca->doca_cred, tx);
 
-	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
-	    pair = nvlist_next_nvpair(snaps, pair)) {
-		err = snapshot_one_impl(nvpair_name(pair), &saa);
-		if (err != 0) {
-			if (errors != NULL) {
-				fnvlist_add_int32(errors,
-				    nvpair_name(pair), err);
-			}
-			rv = err;
-		}
-	}
+	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+	dsl_dataset_name(origin, namebuf);
+	spa_history_log_internal_ds(ds, "clone", tx,
+	    "origin=%s (%llu)", namebuf, origin->ds_object);
+	dsl_dataset_rele(ds, FTAG);
+	dsl_dataset_rele(origin, FTAG);
+	dsl_dir_rele(pdd, FTAG);
+}
 
-	/*
-	 * If any call to snapshot_one_impl() failed, don't execute the
-	 * sync task.  The error handling code below will clean up the
-	 * snaponearg_t from any successful calls to
-	 * snapshot_one_impl().
-	 */
-	if (rv == 0)
-		err = dsl_sync_task_group_wait(saa.saa_dstg);
-	if (err != 0)
-		rv = err;
+int
+dmu_objset_clone(const char *clone, const char *origin)
+{
+	dmu_objset_clone_arg_t doca;
 
-	for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst;
-	    dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) {
-		objset_t *os = dst->dst_arg1;
-		snaponearg_t *soa = dst->dst_arg2;
-		if (dst->dst_err != 0) {
-			if (errors != NULL) {
-				fnvlist_add_int32(errors,
-				    soa->soa_longname, dst->dst_err);
-			}
-			rv = dst->dst_err;
-		}
+	doca.doca_clone = clone;
+	doca.doca_origin = origin;
+	doca.doca_cred = CRED();
 
-		if (saa.saa_needsuspend)
-			zil_resume(dmu_objset_zil(os));
-		dmu_objset_rele(os, &saa);
-		kmem_free(soa, sizeof (*soa));
-	}
-
-	dsl_sync_task_group_destroy(saa.saa_dstg);
-	spa_close(spa, FTAG);
-	return (rv);
+	return (dsl_sync_task(clone,
+	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5));
 }
 
 int
@@ -1013,59 +878,12 @@
 	nvlist_t *snaps = fnvlist_alloc();
 
 	fnvlist_add_boolean(snaps, longsnap);
-	err = dmu_objset_snapshot(snaps, NULL, NULL);
+	strfree(longsnap);
+	err = dsl_dataset_snapshot(snaps, NULL, NULL);
 	fnvlist_free(snaps);
-	strfree(longsnap);
 	return (err);
 }
 
-int
-dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd)
-{
-	dsl_sync_task_t *dst;
-	snapallarg_t saa = { 0 };
-	spa_t *spa;
-	minor_t minor;
-	int err;
-
-	err = spa_open(snapname, &spa, FTAG);
-	if (err)
-		return (err);
-	saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	saa.saa_htag = tag;
-	saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
-	saa.saa_temporary = B_TRUE;
-
-	if (cleanup_fd < 0) {
-		spa_close(spa, FTAG);
-		return (EINVAL);
-	}
-	if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
-		spa_close(spa, FTAG);
-		return (err);
-	}
-
-	err = snapshot_one_impl(snapname, &saa);
-
-	if (err == 0)
-		err = dsl_sync_task_group_wait(saa.saa_dstg);
-
-	for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst;
-	    dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) {
-		objset_t *os = dst->dst_arg1;
-		dsl_register_onexit_hold_cleanup(saa.saa_newds, tag, minor);
-		if (saa.saa_needsuspend)
-			zil_resume(dmu_objset_zil(os));
-		dmu_objset_rele(os, &saa);
-	}
-
-	zfs_onexit_fd_rele(cleanup_fd);
-	dsl_sync_task_group_destroy(saa.saa_dstg);
-	spa_close(spa, FTAG);
-	return (err);
-}
-
-
 static void
 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
@@ -1101,9 +919,9 @@
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
-	ASSERT(bp == os->os_rootbp);
-	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
-	ASSERT(BP_GET_LEVEL(bp) == 0);
+	ASSERT3P(bp, ==, os->os_rootbp);
+	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+	ASSERT0(BP_GET_LEVEL(bp));
 
 	/*
 	 * Update rootbp fill count: it should be the number of objects
@@ -1210,7 +1028,7 @@
 
 	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while (dr = list_head(list)) {
-		ASSERT(dr->dr_dbuf->db_level == 0);
+		ASSERT0(dr->dr_dbuf->db_level);
 		list_remove(list, dr);
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
@@ -1505,12 +1323,12 @@
 			return (EINTR);
 
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
-		if (objerr)
+		if (objerr != 0)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, TXG_WAIT);
-		if (objerr) {
+		if (objerr != 0) {
 			dmu_tx_abort(tx);
 			continue;
 		}
@@ -1593,6 +1411,8 @@
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
+	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+
 	if (ds->ds_phys->ds_snapnames_zapobj == 0)
 		return (ENOENT);
 
@@ -1659,64 +1479,34 @@
 	return (0);
 }
 
-struct findarg {
-	int (*func)(const char *, void *);
-	void *arg;
-};
-
-/* ARGSUSED */
-static int
-findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-	struct findarg *fa = arg;
-	return (fa->func(dsname, fa->arg));
-}
-
 /*
- * Find all objsets under name, and for each, call 'func(child_name, arg)'.
- * Perhaps change all callers to use dmu_objset_find_spa()?
+ * Find objsets under and including ddobj, call func(ds) on each.
  */
 int
-dmu_objset_find(char *name, int func(const char *, void *), void *arg,
-    int flags)
-{
-	struct findarg fa;
-	fa.func = func;
-	fa.arg = arg;
-	return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
-}
-
-/*
- * Find all objsets under name, call func on each
- */
-int
-dmu_objset_find_spa(spa_t *spa, const char *name,
-    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
-	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
-	char *child;
 	uint64_t thisobj;
 	int err;
 
-	if (name == NULL)
-		name = spa_name(spa);
-	err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
-	if (err)
+	ASSERT(dsl_pool_config_held(dp));
+
+	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+	if (err != 0)
 		return (err);
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
-		dsl_dir_close(dd, FTAG);
+		dsl_dir_rele(dd, FTAG);
 		return (0);
 	}
 
 	thisobj = dd->dd_phys->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-	dp = dd->dd_pool;
 
 	/*
 	 * Iterate over all children.
@@ -1726,19 +1516,19 @@
 		    dd->dd_phys->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
-			ASSERT(attr->za_integer_length == sizeof (uint64_t));
-			ASSERT(attr->za_num_integers == 1);
+			ASSERT3U(attr->za_integer_length, ==,
+			    sizeof (uint64_t));
+			ASSERT3U(attr->za_num_integers, ==, 1);
 
-			child = kmem_asprintf("%s/%s", name, attr->za_name);
-			err = dmu_objset_find_spa(spa, child, func, arg, flags);
-			strfree(child);
-			if (err)
+			err = dmu_objset_find_dp(dp, attr->za_first_integer,
+			    func, arg, flags);
+			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
-		if (err) {
-			dsl_dir_close(dd, FTAG);
+		if (err != 0) {
+			dsl_dir_rele(dd, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
@@ -1748,11 +1538,8 @@
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
-		if (!dsl_pool_sync_context(dp))
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
+		dsl_dataset_t *ds;
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
-		if (!dsl_pool_sync_context(dp))
-			rw_exit(&dp->dp_config_rwlock);
 
 		if (err == 0) {
 			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -1761,64 +1548,166 @@
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
-				ASSERT(attr->za_integer_length ==
+				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
-				ASSERT(attr->za_num_integers == 1);
+				ASSERT3U(attr->za_num_integers, ==, 1);
 
-				child = kmem_asprintf("%s@%s",
-				    name, attr->za_name);
-				err = func(spa, attr->za_first_integer,
-				    child, arg);
-				strfree(child);
-				if (err)
+				err = dsl_dataset_hold_obj(dp,
+				    attr->za_first_integer, FTAG, &ds);
+				if (err != 0)
+					break;
+				err = func(dp, ds, arg);
+				dsl_dataset_rele(ds, FTAG);
+				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
-	dsl_dir_close(dd, FTAG);
+	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 
-	if (err)
+	if (err != 0)
 		return (err);
 
 	/*
-	 * Apply to self if appropriate.
+	 * Apply to self.
 	 */
-	err = func(spa, thisobj, name, arg);
+	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+	if (err != 0)
+		return (err);
+	err = func(dp, ds, arg);
+	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
-/* ARGSUSED */
-int
-dmu_objset_prefetch(const char *name, void *arg)
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
+ */
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+    int func(const char *, void *), void *arg, int flags)
 {
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = spa_get_dsl(spa);
 	dsl_dataset_t *ds;
-
-	if (dsl_dataset_hold(name, FTAG, &ds))
-		return (0);
+	zap_cursor_t zc;
+	zap_attribute_t *attr;
+	char *child;
+	uint64_t thisobj;
+	int err;
 
-	if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
-		mutex_enter(&ds->ds_opening_lock);
-		if (ds->ds_objset == NULL) {
-			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-			zbookmark_t zb;
+	dsl_pool_config_enter(dp, FTAG);
 
-			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
-			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+	if (err != 0) {
+		dsl_pool_config_exit(dp, FTAG);
+		return (err);
+	}
 
-			(void) arc_read(NULL, dsl_dataset_get_spa(ds),
-			    &ds->ds_phys->ds_bp, NULL, NULL,
-			    ZIO_PRIORITY_ASYNC_READ,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    &aflags, &zb);
-		}
-		mutex_exit(&ds->ds_opening_lock);
+	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+	if (dd->dd_myname[0] == '$') {
+		dsl_dir_rele(dd, FTAG);
+		dsl_pool_config_exit(dp, FTAG);
+		return (0);
 	}
 
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
+	thisobj = dd->dd_phys->dd_head_dataset_obj;
+	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	/*
+	 * Iterate over all children.
+	 */
+	if (flags & DS_FIND_CHILDREN) {
+		for (zap_cursor_init(&zc, dp->dp_meta_objset,
+		    dd->dd_phys->dd_child_dir_zapobj);
+		    zap_cursor_retrieve(&zc, attr) == 0;
+		    (void) zap_cursor_advance(&zc)) {
+			ASSERT3U(attr->za_integer_length, ==,
+			    sizeof (uint64_t));
+			ASSERT3U(attr->za_num_integers, ==, 1);
+
+			child = kmem_asprintf("%s/%s", name, attr->za_name);
+			dsl_pool_config_exit(dp, FTAG);
+			err = dmu_objset_find_impl(spa, child,
+			    func, arg, flags);
+			dsl_pool_config_enter(dp, FTAG);
+			strfree(child);
+			if (err != 0)
+				break;
+		}
+		zap_cursor_fini(&zc);
+
+		if (err != 0) {
+			dsl_dir_rele(dd, FTAG);
+			dsl_pool_config_exit(dp, FTAG);
+			kmem_free(attr, sizeof (zap_attribute_t));
+			return (err);
+		}
+	}
+
+	/*
+	 * Iterate over all snapshots.
+	 */
+	if (flags & DS_FIND_SNAPSHOTS) {
+		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+		if (err == 0) {
+			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			dsl_dataset_rele(ds, FTAG);
+
+			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+			    zap_cursor_retrieve(&zc, attr) == 0;
+			    (void) zap_cursor_advance(&zc)) {
+				ASSERT3U(attr->za_integer_length, ==,
+				    sizeof (uint64_t));
+				ASSERT3U(attr->za_num_integers, ==, 1);
+
+				child = kmem_asprintf("%s@%s",
+				    name, attr->za_name);
+				dsl_pool_config_exit(dp, FTAG);
+				err = func(child, arg);
+				dsl_pool_config_enter(dp, FTAG);
+				strfree(child);
+				if (err != 0)
+					break;
+			}
+			zap_cursor_fini(&zc);
+		}
+	}
+
+	dsl_dir_rele(dd, FTAG);
+	kmem_free(attr, sizeof (zap_attribute_t));
+	dsl_pool_config_exit(dp, FTAG);
+
+	if (err != 0)
+		return (err);
+
+	/* Apply to self. */
+	return (func(name, arg));
+}
+
+/*
+ * See comment above dmu_objset_find_impl().
+ */
+int
+dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+    int flags)
+{
+	spa_t *spa;
+	int error;
+
+	error = spa_open(name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	error = dmu_objset_find_impl(spa, name, func, arg, flags);
+	spa_close(spa, FTAG);
+	return (error);
 }
 
 void
@@ -1834,3 +1723,19 @@
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	return (os->os_user_ptr);
 }
+
+/*
+ * Determine name of filesystem, given name of snapshot.
+ * buf must be at least MAXNAMELEN bytes
+ */
+int
+dmu_fsname(const char *snapname, char *buf)
+{
+	char *atp = strchr(snapname, '@');
+	if (atp == NULL)
+		return (EINVAL);
+	if (atp - snapname >= MAXNAMELEN)
+		return (ENAMETOOLONG);
+	(void) strlcpy(buf, snapname, atp - snapname + 1);
+	return (0);
+}
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Thu Feb 28 12:44:05 2013 -0800
@@ -46,11 +46,14 @@
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
 
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
 
 static char *dmu_recv_tag = "dmu_recv_tag";
+static const char *recv_clone_name = "%recv";
 
 static int
 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
@@ -290,7 +293,7 @@
 	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
 		return (EINTR);
-	if (dsp->dsa_err)
+	if (dsp->dsa_err != 0)
 		return (EINTR);
 	return (0);
 }
@@ -340,7 +343,7 @@
 			uint64_t dnobj = (zb->zb_blkid <<
 			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
 			err = dump_dnode(dsp, dnobj, blk+i);
-			if (err)
+			if (err != 0)
 				break;
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
@@ -388,65 +391,33 @@
 }
 
 /*
- * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
- * For example, they could both be snapshots of the same filesystem, and
- * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
- * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
- * filesystem.  Or 'earlier' could be the origin's origin.
+ * Releases dp, ds, and fromds, using the specified tag.
  */
-static boolean_t
-is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
+static int
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
+    dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
 {
-	dsl_pool_t *dp = later->ds_dir->dd_pool;
-	int error;
-	boolean_t ret;
-	dsl_dataset_t *origin;
-
-	if (earlier->ds_phys->ds_creation_txg >=
-	    later->ds_phys->ds_creation_txg)
-		return (B_FALSE);
-
-	if (later->ds_dir == earlier->ds_dir)
-		return (B_TRUE);
-	if (!dsl_dir_is_clone(later->ds_dir))
-		return (B_FALSE);
-
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) {
-		rw_exit(&dp->dp_config_rwlock);
-		return (B_TRUE);
-	}
-	error = dsl_dataset_hold_obj(dp,
-	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
-	rw_exit(&dp->dp_config_rwlock);
-	if (error != 0)
-		return (B_FALSE);
-	ret = is_before(origin, earlier);
-	dsl_dataset_rele(origin, FTAG);
-	return (ret);
-}
-
-int
-dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
-    offset_t *off)
-{
-	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
+	objset_t *os;
 	dmu_replay_record_t *drr;
 	dmu_sendarg_t *dsp;
 	int err;
 	uint64_t fromtxg = 0;
 
-	/* tosnap must be a snapshot */
-	if (ds->ds_phys->ds_next_snap_obj == 0)
-		return (EINVAL);
+	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
+		dsl_dataset_rele(fromds, tag);
+		dsl_dataset_rele(ds, tag);
+		dsl_pool_rele(dp, tag);
+		return (EXDEV);
+	}
 
-	/*
-	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
-	 * or the origin's fs.
-	 */
-	if (fromds != NULL && !is_before(ds, fromds))
-		return (EXDEV);
+	err = dmu_objset_from_ds(ds, &os);
+	if (err != 0) {
+		if (fromds != NULL)
+			dsl_dataset_rele(fromds, tag);
+		dsl_dataset_rele(ds, tag);
+		dsl_pool_rele(dp, tag);
+		return (err);
+	}
 
 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 	drr->drr_type = DRR_BEGIN;
@@ -455,13 +426,17 @@
 	    DMU_SUBSTREAM);
 
 #ifdef _KERNEL
-	if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
+	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		uint64_t version;
-		if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
+		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
 			kmem_free(drr, sizeof (dmu_replay_record_t));
+			if (fromds != NULL)
+				dsl_dataset_rele(fromds, tag);
+			dsl_dataset_rele(ds, tag);
+			dsl_pool_rele(dp, tag);
 			return (EINVAL);
 		}
-		if (version == ZPL_VERSION_SA) {
+		if (version >= ZPL_VERSION_SA) {
 			DMU_SET_FEATUREFLAGS(
 			    drr->drr_u.drr_begin.drr_versioninfo,
 			    DMU_BACKUP_FEATURE_SA_SPILL);
@@ -471,19 +446,22 @@
 
 	drr->drr_u.drr_begin.drr_creation_time =
 	    ds->ds_phys->ds_creation_time;
-	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
+	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
 	if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
 	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
 	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 
-	if (fromds)
+	if (fromds != NULL)
 		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
 	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
 
-	if (fromds)
+	if (fromds != NULL) {
 		fromtxg = fromds->ds_phys->ds_creation_txg;
+		dsl_dataset_rele(fromds, tag);
+		fromds = NULL;
+	}
 
 	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
 
@@ -491,7 +469,7 @@
 	dsp->dsa_vp = vp;
 	dsp->dsa_outfd = outfd;
 	dsp->dsa_proc = curproc;
-	dsp->dsa_os = tosnap;
+	dsp->dsa_os = os;
 	dsp->dsa_off = off;
 	dsp->dsa_toguid = ds->ds_phys->ds_guid;
 	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
@@ -506,6 +484,9 @@
 		goto out;
 	}
 
+	dsl_dataset_long_hold(ds, FTAG);
+	dsl_pool_rele(dp, tag);
+
 	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
 	    backup_cb, dsp);
 
@@ -513,8 +494,8 @@
 		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
 			err = EINTR;
 
-	if (err) {
-		if (err == EINTR && dsp->dsa_err)
+	if (err != 0) {
+		if (err == EINTR && dsp->dsa_err != 0)
 			err = dsp->dsa_err;
 		goto out;
 	}
@@ -537,27 +518,96 @@
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
+	dsl_dataset_long_rele(ds, FTAG);
+	dsl_dataset_rele(ds, tag);
+
 	return (err);
 }
 
 int
-dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    int outfd, vnode_t *vp, offset_t *off)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	dsl_dataset_t *fromds = NULL;
+	int err;
+
+	err = dsl_pool_hold(pool, FTAG, &dp);
+	if (err != 0)
+		return (err);
+
+	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+
+	if (fromsnap != 0) {
+		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
+		if (err != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (err);
+		}
+	}
+
+	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
+}
+
+int
+dmu_send(const char *tosnap, const char *fromsnap,
+    int outfd, vnode_t *vp, offset_t *off)
 {
-	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	dsl_dataset_t *fromds = NULL;
+	int err;
+
+	if (strchr(tosnap, '@') == NULL)
+		return (EINVAL);
+	if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
+		return (EINVAL);
+
+	err = dsl_pool_hold(tosnap, FTAG, &dp);
+	if (err != 0)
+		return (err);
+
+	err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+
+	if (fromsnap != NULL) {
+		err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+		if (err != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (err);
+		}
+	}
+	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
+}
+
+int
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
+{
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	int err;
 	uint64_t size;
 
+	ASSERT(dsl_pool_config_held(dp));
+
 	/* tosnap must be a snapshot */
-	if (ds->ds_phys->ds_next_snap_obj == 0)
+	if (!dsl_dataset_is_snapshot(ds))
 		return (EINVAL);
 
 	/*
 	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
 	 * or the origin's fs.
 	 */
-	if (fromds != NULL && !is_before(ds, fromds))
+	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
 		return (EXDEV);
 
 	/* Get uncompressed size estimate of changed data. */
@@ -567,7 +617,7 @@
 		uint64_t used, comp;
 		err = dsl_dataset_space_written(fromds, ds,
 		    &used, &comp, &size);
-		if (err)
+		if (err != 0)
 			return (err);
 	}
 
@@ -587,11 +637,8 @@
 	 * block, which we observe in practice.
 	 */
 	uint64_t recordsize;
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	err = dsl_prop_get_ds(ds, "recordsize",
-	    sizeof (recordsize), 1, &recordsize, NULL);
-	rw_exit(&dp->dp_config_rwlock);
-	if (err)
+	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+	if (err != 0)
 		return (err);
 	size -= size / recordsize * sizeof (blkptr_t);
 
@@ -603,93 +650,40 @@
 	return (0);
 }
 
-struct recvbeginsyncarg {
-	const char *tofs;
-	const char *tosnap;
-	dsl_dataset_t *origin;
-	uint64_t fromguid;
-	dmu_objset_type_t type;
-	void *tag;
-	boolean_t force;
-	uint64_t dsflags;
-	char clonelastname[MAXNAMELEN];
-	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
-	cred_t *cr;
-};
-
-/* ARGSUSED */
-static int
-recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dir_t *dd = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	uint64_t val;
-	int err;
-
-	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
-
-	if (err != ENOENT)
-		return (err ? err : EEXIST);
+typedef struct dmu_recv_begin_arg {
+	const char *drba_origin;
+	dmu_recv_cookie_t *drba_cookie;
+	cred_t *drba_cred;
+} dmu_recv_begin_arg_t;
 
-	if (rbsa->origin) {
-		/* make sure it's a snap in the same pool */
-		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
-			return (EXDEV);
-		if (!dsl_dataset_is_snapshot(rbsa->origin))
-			return (EINVAL);
-		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
-			return (ENODEV);
-	}
-
-	return (0);
-}
-
-static void
-recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+static int
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+    uint64_t fromguid)
 {
-	dsl_dir_t *dd = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
-	uint64_t dsobj;
-
-	/* Create and open new dataset. */
-	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
-	    rbsa->origin, flags, rbsa->cr, tx);
-	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
-	    B_TRUE, dmu_recv_tag, &rbsa->ds));
-
-	if (rbsa->origin == NULL) {
-		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
-		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
-	}
-
-	spa_history_log_internal_ds(rbsa->ds, "receive new", tx, "");
-}
-
-/* ARGSUSED */
-static int
-recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	int err;
 	uint64_t val;
+	int error;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* must not have any changes since most recent snapshot */
-	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
+	if (!drba->drba_cookie->drc_force &&
+	    dsl_dataset_modified_since_lastsnap(ds))
 		return (ETXTBSY);
 
+	/* temporary clone name must not exist */
+	error = zap_lookup(dp->dp_meta_objset,
+	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
+	    8, 1, &val);
+	if (error != ENOENT)
+		return (error == 0 ? EBUSY : error);
+
 	/* new snapshot name must not exist */
-	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
-	if (err == 0)
-		return (EEXIST);
-	if (err != ENOENT)
-		return (err);
+	error = zap_lookup(dp->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
+	    8, 1, &val);
+	if (error != ENOENT)
+		return (error == 0 ? EEXIST : error);
 
-	if (rbsa->fromguid) {
+	if (fromguid != 0) {
 		/* if incremental, most recent snapshot must match fromguid */
 		if (ds->ds_prev == NULL)
 			return (ENODEV);
@@ -698,20 +692,20 @@
 		 * most recent snapshot must match fromguid, or there are no
 		 * changes since the fromguid one
 		 */
-		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
+		if (ds->ds_prev->ds_phys->ds_guid != fromguid) {
 			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
 			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
 			while (obj != 0) {
 				dsl_dataset_t *snap;
-				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-				    obj, FTAG, &snap);
-				if (err)
+				error = dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap);
+				if (error != 0)
 					return (ENODEV);
 				if (snap->ds_phys->ds_creation_txg < birth) {
 					dsl_dataset_rele(snap, FTAG);
 					return (ENODEV);
 				}
-				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
+				if (snap->ds_phys->ds_guid == fromguid) {
 					dsl_dataset_rele(snap, FTAG);
 					break; /* it's ok */
 				}
@@ -727,58 +721,153 @@
 			return (ENODEV);
 	}
 
-	/* temporary clone name must not exist */
-	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
-	    rbsa->clonelastname, 8, 1, &val);
-	if (err == 0)
-		return (EEXIST);
-	if (err != ENOENT)
-		return (err);
+	return (0);
 
-	return (0);
 }
 
-/* ARGSUSED */
-static void
-recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ohds = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
-	dsl_dataset_t *cds;
-	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	uint64_t fromguid = drrb->drr_fromguid;
+	int flags = drrb->drr_flags;
+	int error;
+	dsl_dataset_t *ds;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+
+	/* already checked */
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+	    DMU_COMPOUNDSTREAM ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES ||
+	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+		return (EINVAL);
+
+	/* Verify pool version supports SA if SA_SPILL feature set */
+	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_SA_SPILL) &&
+	    spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+		return (ENOTSUP);
+	}
+
+	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+	if (error == 0) {
+		/* target fs already exists; recv into temp clone */
+
+		/* Can't recv a clone into an existing fs */
+		if (flags & DRR_FLAG_CLONE) {
+			dsl_dataset_rele(ds, FTAG);
+			return (EINVAL);
+		}
+
+		error = recv_begin_check_existing_impl(drba, ds, fromguid);
+		dsl_dataset_rele(ds, FTAG);
+	} else if (error == ENOENT) {
+		/* target fs does not exist; must be a full backup or clone */
+		char buf[MAXNAMELEN];
+
+		/*
+		 * If it's a non-clone incremental, we are missing the
+		 * target fs, so fail the recv.
+		 */
+		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+			return (ENOENT);
+
+		/* Open the parent of tofs */
+		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
+		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+		if (error != 0)
+			return (error);
+
+		if (drba->drba_origin != NULL) {
+			dsl_dataset_t *origin;
+			error = dsl_dataset_hold(dp, drba->drba_origin,
+			    FTAG, &origin);
+			if (error != 0) {
+				dsl_dataset_rele(ds, FTAG);
+				return (error);
+			}
+			if (!dsl_dataset_is_snapshot(origin)) {
+				dsl_dataset_rele(origin, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				return (EINVAL);
+			}
+			if (origin->ds_phys->ds_guid != fromguid) {
+				dsl_dataset_rele(origin, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				return (ENODEV);
+			}
+			dsl_dataset_rele(origin, FTAG);
+		}
+		dsl_dataset_rele(ds, FTAG);
+		error = 0;
+	}
+	return (error);
+}
+
+static void
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+	dsl_dataset_t *ds, *newds;
 	uint64_t dsobj;
+	int error;
+	uint64_t crflags;
 
-	/* create and open the temporary clone */
-	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
-	    ohds->ds_prev, flags, rbsa->cr, tx);
-	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
+	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
+	    DS_FLAG_CI_DATASET : 0;
+
+	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+	if (error == 0) {
+		/* create temporary clone */
+		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+		    ds->ds_prev, crflags, drba->drba_cred, tx);
+		dsl_dataset_rele(ds, FTAG);
+	} else {
+		dsl_dir_t *dd;
+		const char *tail;
+		dsl_dataset_t *origin = NULL;
+
+		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+		if (drba->drba_origin != NULL) {
+			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+			    FTAG, &origin));
+		}
+
+		/* Create new dataset. */
+		dsobj = dsl_dataset_create_sync(dd,
+		    strrchr(tofs, '/') + 1,
+		    origin, crflags, drba->drba_cred, tx);
+		if (origin != NULL)
+			dsl_dataset_rele(origin, FTAG);
+		dsl_dir_rele(dd, FTAG);
+		drba->drba_cookie->drc_newfs = B_TRUE;
+	}
+	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+
+	dmu_buf_will_dirty(newds->ds_dbuf, tx);
+	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * If we actually created a non-clone, we need to create the
 	 * objset in our new dataset.
 	 */
-	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
+	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
-		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
+		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
 
-	rbsa->ds = cds;
-
-	spa_history_log_internal_ds(cds, "receive over existing", tx, "");
-}
+	drba->drba_cookie->drc_ds = newds;
 
-static boolean_t
-dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
-{
-	int featureflags;
-
-	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
-
-	/* Verify pool version supports SA if SA_SPILL feature set */
-	return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
-	    (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
+	spa_history_log_internal_ds(newds, "receive", tx, "");
 }
 
 /*
@@ -786,132 +875,55 @@
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
-dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
-    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
 {
-	int err = 0;
-	boolean_t byteswap;
-	struct recvbeginsyncarg rbsa = { 0 };
-	uint64_t versioninfo;
-	int flags;
-	dsl_dataset_t *ds;
-
-	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
-		byteswap = FALSE;
-	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
-		byteswap = TRUE;
-	else
-		return (EINVAL);
-
-	rbsa.tofs = tofs;
-	rbsa.tosnap = tosnap;
-	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
-	rbsa.fromguid = drrb->drr_fromguid;
-	rbsa.type = drrb->drr_type;
-	rbsa.tag = FTAG;
-	rbsa.dsflags = 0;
-	rbsa.cr = CRED();
-	versioninfo = drrb->drr_versioninfo;
-	flags = drrb->drr_flags;
-
-	if (byteswap) {
-		rbsa.type = BSWAP_32(rbsa.type);
-		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
-		versioninfo = BSWAP_64(versioninfo);
-		flags = BSWAP_32(flags);
-	}
-
-	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
-	    rbsa.type >= DMU_OST_NUMTYPES ||
-	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
-		return (EINVAL);
-
-	if (flags & DRR_FLAG_CI_DATA)
-		rbsa.dsflags = DS_FLAG_CI_DATASET;
+	dmu_recv_begin_arg_t drba = { 0 };
+	dmu_replay_record_t *drr;
 
 	bzero(drc, sizeof (dmu_recv_cookie_t));
 	drc->drc_drrb = drrb;
 	drc->drc_tosnap = tosnap;
-	drc->drc_top_ds = top_ds;
+	drc->drc_tofs = tofs;
 	drc->drc_force = force;
 
-	/*
-	 * Process the begin in syncing context.
-	 */
-
-	/* open the dataset we are logically receiving into */
-	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
-	if (err == 0) {
-		if (dmu_recv_verify_features(ds, drrb)) {
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (ENOTSUP);
-		}
-		/* target fs already exists; recv into temp clone */
-
-		/* Can't recv a clone into an existing fs */
-		if (flags & DRR_FLAG_CLONE) {
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (EINVAL);
-		}
-
-		/* must not have an incremental recv already in progress */
-		if (!mutex_tryenter(&ds->ds_recvlock)) {
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (EBUSY);
-		}
+	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+		drc->drc_byteswap = B_TRUE;
+	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
+		return (EINVAL);
 
-		/* tmp clone name is: tofs/%tosnap" */
-		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
-		    "%%%s", tosnap);
-		rbsa.force = force;
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
-		if (err) {
-			mutex_exit(&ds->ds_recvlock);
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (err);
-		}
-		drc->drc_logical_ds = ds;
-		drc->drc_real_ds = rbsa.ds;
-	} else if (err == ENOENT) {
-		/* target fs does not exist; must be a full backup or clone */
-		char *cp;
+	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+	drr->drr_type = DRR_BEGIN;
+	drr->drr_u.drr_begin = *drc->drc_drrb;
+	if (drc->drc_byteswap) {
+		fletcher_4_incremental_byteswap(drr,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+	} else {
+		fletcher_4_incremental_native(drr,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+	}
+	kmem_free(drr, sizeof (dmu_replay_record_t));
 
-		/*
-		 * If it's a non-clone incremental, we are missing the
-		 * target fs, so fail the recv.
-		 */
-		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
-			return (ENOENT);
-
-		/* Open the parent of tofs */
-		cp = strrchr(tofs, '/');
-		*cp = '\0';
-		err = dsl_dataset_hold(tofs, FTAG, &ds);
-		*cp = '/';
-		if (err)
-			return (err);
-
-		if (dmu_recv_verify_features(ds, drrb)) {
-			dsl_dataset_rele(ds, FTAG);
-			return (ENOTSUP);
-		}
-
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
-		dsl_dataset_rele(ds, FTAG);
-		if (err)
-			return (err);
-		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
-		drc->drc_newfs = B_TRUE;
+	if (drc->drc_byteswap) {
+		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
+		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+		drrb->drr_type = BSWAP_32(drrb->drr_type);
+		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
 	}
 
-	return (err);
+	drba.drba_origin = origin;
+	drba.drba_cookie = drc;
+	drba.drba_cred = CRED();
+
+	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
+	    &drba, 5));
 }
 
 struct restorearg {
 	int err;
-	int byteswap;
+	boolean_t byteswap;
 	vnode_t *vp;
 	char *buf;
 	uint64_t voff;
@@ -947,7 +959,7 @@
 	guid_map_entry_t *gmep;
 
 	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
-		dsl_dataset_rele(gmep->gme_ds, ca);
+		dsl_dataset_long_rele(gmep->gme_ds, gmep);
 		kmem_free(gmep, sizeof (guid_map_entry_t));
 	}
 	avl_destroy(ca);
@@ -975,7 +987,7 @@
 			ra->err = EINVAL;
 		ra->voff += len - done - resid;
 		done = len - resid;
-		if (ra->err)
+		if (ra->err != 0)
 			return (NULL);
 	}
 
@@ -1094,7 +1106,7 @@
 
 	if (drro->drr_bonuslen) {
 		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
-		if (ra->err)
+		if (ra->err != 0)
 			return (ra->err);
 	}
 
@@ -1103,7 +1115,7 @@
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err) {
+		if (err != 0) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
@@ -1117,14 +1129,14 @@
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen);
 	}
-	if (err) {
+	if (err != 0) {
 		return (EINVAL);
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, drro->drr_object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
+	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
@@ -1172,7 +1184,7 @@
 			continue;
 
 		err = dmu_free_object(os, obj);
-		if (err)
+		if (err != 0)
 			return (err);
 	}
 	return (0);
@@ -1202,7 +1214,7 @@
 	dmu_tx_hold_write(tx, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
+	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
@@ -1264,7 +1276,7 @@
 	dmu_tx_hold_write(tx, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
+	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
@@ -1305,7 +1317,7 @@
 	dmu_tx_hold_spill(tx, db->db_object);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
+	if (err != 0) {
 		dmu_buf_rele(db, FTAG);
 		dmu_buf_rele(db_spill, FTAG);
 		dmu_tx_abort(tx);
@@ -1344,6 +1356,16 @@
 	return (err);
 }
 
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+	char name[MAXNAMELEN];
+	dsl_dataset_name(drc->drc_ds, name);
+	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	(void) dsl_destroy_head(name);
+}
+
 /*
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
@@ -1357,52 +1379,24 @@
 	zio_cksum_t pcksum;
 	int featureflags;
 
-	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
-		ra.byteswap = TRUE;
-
-	{
-		/* compute checksum of drr_begin record */
-		dmu_replay_record_t *drr;
-		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-
-		drr->drr_type = DRR_BEGIN;
-		drr->drr_u.drr_begin = *drc->drc_drrb;
-		if (ra.byteswap) {
-			fletcher_4_incremental_byteswap(drr,
-			    sizeof (dmu_replay_record_t), &ra.cksum);
-		} else {
-			fletcher_4_incremental_native(drr,
-			    sizeof (dmu_replay_record_t), &ra.cksum);
-		}
-		kmem_free(drr, sizeof (dmu_replay_record_t));
-	}
-
-	if (ra.byteswap) {
-		struct drr_begin *drrb = drc->drc_drrb;
-		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
-		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
-		drrb->drr_type = BSWAP_32(drrb->drr_type);
-		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
-		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
-	}
-
+	ra.byteswap = drc->drc_byteswap;
+	ra.cksum = drc->drc_cksum;
 	ra.vp = vp;
 	ra.voff = *voffp;
 	ra.bufsize = 1<<20;
 	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
 	/* these were verified in dmu_recv_begin */
-	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
+	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
-	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
+	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	/*
 	 * Open the objset we are modifying.
 	 */
-	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
+	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
 
-	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
 
 	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
@@ -1415,7 +1409,7 @@
 			goto out;
 		}
 		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
-		if (ra.err) {
+		if (ra.err != 0) {
 			cleanup_fd = -1;
 			goto out;
 		}
@@ -1429,12 +1423,12 @@
 			ra.err = zfs_onexit_add_cb(minor,
 			    free_guid_map_onexit, ra.guid_to_ds_map,
 			    action_handlep);
-			if (ra.err)
+			if (ra.err != 0)
 				goto out;
 		} else {
 			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
 			    (void **)&ra.guid_to_ds_map);
-			if (ra.err)
+			if (ra.err != 0)
 				goto out;
 		}
 
@@ -1528,14 +1522,7 @@
 		 * destroy what we created, so we don't leave it in the
 		 * inconsistent restoring state.
 		 */
-		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
-
-		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
-		    B_FALSE);
-		if (drc->drc_real_ds != drc->drc_logical_ds) {
-			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
-			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
-		}
+		dmu_recv_cleanup_ds(drc);
 	}
 
 	kmem_free(ra.buf, ra.bufsize);
@@ -1543,142 +1530,176 @@
 	return (ra.err);
 }
 
-struct recvendsyncarg {
-	char *tosnap;
-	uint64_t creation_time;
-	uint64_t toguid;
-};
+static int
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_cookie_t *drc = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int error;
+
+	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+	if (!drc->drc_newfs) {
+		dsl_dataset_t *origin_head;
 
-static int
-recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct recvendsyncarg *resa = arg2;
+		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+		if (error != 0)
+			return (error);
+		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+		    origin_head, drc->drc_force);
+		if (error != 0) {
+			dsl_dataset_rele(origin_head, FTAG);
+			return (error);
+		}
+		error = dsl_dataset_snapshot_check_impl(origin_head,
+		    drc->drc_tosnap, tx);
+		dsl_dataset_rele(origin_head, FTAG);
+		if (error != 0)
+			return (error);
 
-	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
+		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+	} else {
+		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+		    drc->drc_tosnap, tx);
+	}
+	return (error);
 }
 
 static void
-recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	struct recvendsyncarg *resa = arg2;
+	dmu_recv_cookie_t *drc = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+
+	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+	    tx, "snap=%s", drc->drc_tosnap);
+
+	if (!drc->drc_newfs) {
+		dsl_dataset_t *origin_head;
 
-	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
+		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+		    &origin_head));
+		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+		    origin_head, tx);
+		dsl_dataset_snapshot_sync_impl(origin_head,
+		    drc->drc_tosnap, tx);
+
+		/* set snapshot's creation time and guid */
+		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+		origin_head->ds_prev->ds_phys->ds_creation_time =
+		    drc->drc_drrb->drr_creation_time;
+		origin_head->ds_prev->ds_phys->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		origin_head->ds_prev->ds_phys->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
 
-	/* set snapshot's creation time and guid */
-	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
-	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
-	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+		dsl_dataset_rele(origin_head, FTAG);
+		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+	} else {
+		dsl_dataset_t *ds = drc->drc_ds;
+
+		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-	spa_history_log_internal_ds(ds, "finished receiving", tx, "");
+		/* set snapshot's creation time and guid */
+		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+		ds->ds_prev->ds_phys->ds_creation_time =
+		    drc->drc_drrb->drr_creation_time;
+		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
+		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+	}
+	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
+	/*
+	 * Release the hold from dmu_recv_begin.  This must be done before
+	 * we return to open context, so that when we free the dataset's dnode,
+	 * we can evict its bonus buffer.
+	 */
+	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	drc->drc_ds = NULL;
 }
 
 static int
-add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
 {
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
+	dsl_pool_t *dp;
 	dsl_dataset_t *snapds;
 	guid_map_entry_t *gmep;
 	int err;
 
 	ASSERT(guid_map != NULL);
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
+	err = dsl_pool_hold(name, FTAG, &dp);
+	if (err != 0)
+		return (err);
+	err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snapds);
 	if (err == 0) {
 		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
 		gmep->guid = snapds->ds_phys->ds_guid;
 		gmep->gme_ds = snapds;
 		avl_add(guid_map, gmep);
+		dsl_dataset_long_hold(snapds, gmep);
+		dsl_dataset_rele(snapds, FTAG);
 	}
 
-	rw_exit(&dp->dp_config_rwlock);
+	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
+static int dmu_recv_end_modified_blocks = 3;
+
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
-	struct recvendsyncarg resa;
-	dsl_dataset_t *ds = drc->drc_logical_ds;
-	int err, myerr;
-
-	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
-		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
-		    drc->drc_force);
-		if (err)
-			goto out;
-	} else {
-		mutex_exit(&ds->ds_recvlock);
-		dsl_dataset_rele(ds, dmu_recv_tag);
-		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
-		    B_FALSE);
-		return (EBUSY);
-	}
+	int error;
+	char name[MAXNAMELEN];
 
-	resa.creation_time = drc->drc_drrb->drr_creation_time;
-	resa.toguid = drc->drc_drrb->drr_toguid;
-	resa.tosnap = drc->drc_tosnap;
+#ifdef _KERNEL
+	/*
+	 * We will be destroying the ds; make sure its origin is unmounted if
+	 * necessary.
+	 */
+	dsl_dataset_name(drc->drc_ds, name);
+	zfs_destroy_unmount_origin(name);
+#endif
 
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    recv_end_check, recv_end_sync, ds, &resa, 3);
-	if (err) {
-		/* swap back */
-		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
-	}
+	error = dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks);
 
-out:
-	mutex_exit(&ds->ds_recvlock);
-	if (err == 0 && drc->drc_guid_to_ds_map != NULL)
-		(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
-	dsl_dataset_disown(ds, dmu_recv_tag);
-	myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
-	ASSERT0(myerr);
-	return (err);
+	if (error != 0)
+		dmu_recv_cleanup_ds(drc);
+	return (error);
 }
 
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
-	struct recvendsyncarg resa;
-	dsl_dataset_t *ds = drc->drc_logical_ds;
-	int err;
+	int error;
 
-	/*
-	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
-	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
-	 * can close it.
-	 */
-	txg_wait_synced(ds->ds_dir->dd_pool, 0);
+	error = dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks);
 
-	resa.creation_time = drc->drc_drrb->drr_creation_time;
-	resa.toguid = drc->drc_drrb->drr_toguid;
-	resa.tosnap = drc->drc_tosnap;
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    recv_end_check, recv_end_sync, ds, &resa, 3);
-	if (err) {
-		/* clean up the fs we just recv'd into */
-		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
-	} else {
-		if (drc->drc_guid_to_ds_map != NULL)
-			(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
-		/* release the hold from dmu_recv_begin */
-		dsl_dataset_disown(ds, dmu_recv_tag);
+	if (error != 0) {
+		dmu_recv_cleanup_ds(drc);
+	} else if (drc->drc_guid_to_ds_map != NULL) {
+		(void) add_ds_to_guidmap(drc->drc_tofs,
+		    drc->drc_guid_to_ds_map,
+		    drc->drc_newsnapobj);
 	}
-	return (err);
+	return (error);
 }
 
 int
 dmu_recv_end(dmu_recv_cookie_t *drc)
 {
-	if (drc->drc_logical_ds != drc->drc_real_ds)
-		return (dmu_recv_existing_end(drc));
+	if (drc->drc_newfs)
+		return (dmu_recv_new_end(drc));
 	else
-		return (dmu_recv_new_end(drc));
+		return (dmu_recv_existing_end(drc));
 }
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Thu Feb 28 12:44:05 2013 -0800
@@ -265,7 +265,7 @@
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err)
+		if (err != 0)
 			return (err);
 		cbp = buf->b_data;
 
@@ -282,7 +282,7 @@
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
-			if (err) {
+			if (err != 0) {
 				if (!hard)
 					break;
 				lasterr = err;
@@ -295,7 +295,7 @@
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err)
+		if (err != 0)
 			return (err);
 		dnp = buf->b_data;
 
@@ -308,7 +308,7 @@
 		for (i = 0; i < epb; i++) {
 			err = traverse_dnode(td, &dnp[i], zb->zb_objset,
 			    zb->zb_blkid * epb + i);
-			if (err) {
+			if (err != 0) {
 				if (!hard)
 					break;
 				lasterr = err;
@@ -321,7 +321,7 @@
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err)
+		if (err != 0)
 			return (err);
 
 		osp = buf->b_data;
@@ -405,7 +405,7 @@
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
-		if (err) {
+		if (err != 0) {
 			if (!hard)
 				break;
 			lasterr = err;
@@ -415,7 +415,7 @@
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
-		if (err) {
+		if (err != 0) {
 			if (!hard)
 				return (err);
 			lasterr = err;
@@ -514,14 +514,20 @@
 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
-	if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
-		objset_t *os;
+	if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
+		uint32_t flags = ARC_WAIT;
+		objset_phys_t *osp;
+		arc_buf_t *buf;
 
-		err = dmu_objset_from_ds(ds, &os);
-		if (err)
+		err = arc_read(NULL, td.td_spa, rootbp,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+		if (err != 0)
 			return (err);
 
-		traverse_zil(&td, &os->os_zil_header);
+		osp = buf->b_data;
+		traverse_zil(&td, &osp->os_zil_header);
+		(void) arc_buf_remove_ref(buf, &buf);
 	}
 
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
@@ -583,7 +589,7 @@
 	/* visit the MOS */
 	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 	    txg_start, NULL, flags, func, arg);
-	if (err)
+	if (err != 0)
 		return (err);
 
 	/* visit each dataset */
@@ -592,7 +598,7 @@
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
-		if (err) {
+		if (err != 0) {
 			if (!hard)
 				return (err);
 			lasterr = err;
@@ -603,10 +609,10 @@
 			dsl_dataset_t *ds;
 			uint64_t txg = txg_start;
 
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
+			dsl_pool_config_enter(dp, FTAG);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
-			rw_exit(&dp->dp_config_rwlock);
-			if (err) {
+			dsl_pool_config_exit(dp, FTAG);
+			if (err != 0) {
 				if (!hard)
 					return (err);
 				lasterr = err;
@@ -616,7 +622,7 @@
 				txg = ds->ds_phys->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
-			if (err) {
+			if (err != 0) {
 				if (!hard)
 					return (err);
 				lasterr = err;
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Thu Feb 28 12:44:05 2013 -0800
@@ -898,7 +898,7 @@
 #endif
 
 static int
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	dmu_tx_hold_t *txh;
 	spa_t *spa = tx->tx_pool->dp_spa;
@@ -962,13 +962,6 @@
 	}
 
 	/*
-	 * NB: This check must be after we've held the dnodes, so that
-	 * the dmu_tx_unassign() logic will work properly
-	 */
-	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
-		return (ERESTART);
-
-	/*
 	 * If a snapshot has been taken since we made our estimates,
 	 * assume that we won't be able to free or overwrite anything.
 	 */
@@ -1048,26 +1041,25 @@
  *
  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
  *	a new one.  This should be used when you're not holding locks.
- *	If will only fail if we're truly out of space (or over quota).
+ *	It will only fail if we're truly out of space (or over quota).
  *
  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
  *	blocking, returns immediately with ERESTART.  This should be used
  *	whenever you're holding locks.  On an ERESTART error, the caller
  *	should drop locks, do a dmu_tx_wait(tx), and try again.
- *
- * (3)	A specific txg.  Use this if you need to ensure that multiple
- *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
- *	returns ERESTART if it can't assign you into the requested txg.
  */
 int
-dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
-	ASSERT(txg_how != 0);
+	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
+	/* If we might wait, we must not hold the config lock. */
+	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
@@ -1088,6 +1080,7 @@
 	spa_t *spa = tx->tx_pool->dp_spa;
 
 	ASSERT(tx->tx_txg == 0);
+	ASSERT(!dsl_pool_config_held(tx->tx_pool));
 
 	/*
 	 * It's possible that the pool has become active after this thread
@@ -1214,6 +1207,14 @@
 	return (tx->tx_txg);
 }
 
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_pool != NULL);
+	return (tx->tx_pool);
+}
+
+
 void
 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {
--- a/usr/src/uts/common/fs/zfs/dnode.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Thu Feb 28 12:44:05 2013 -0800
@@ -72,7 +72,11 @@
 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 
-	refcount_create(&dn->dn_holds);
+	/*
+	 * Every dbuf has a reference, and dropping a tracked reference is
+	 * O(number of references), so don't track dn_holds.
+	 */
+	refcount_create_untracked(&dn->dn_holds);
 	refcount_create(&dn->dn_tx_holds);
 	list_link_init(&dn->dn_link);
 
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Thu Feb 28 12:44:05 2013 -0800
@@ -477,6 +477,7 @@
 	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 	dnode_evict_dbufs(dn);
 	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT3P(dn->dn_bonus, ==, NULL);
 
 	/*
 	 * XXX - It would be nice to assert this, but we may still
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Thu Feb 28 12:44:05 2013 -0800
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
@@ -45,12 +45,8 @@
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
-
-static char *dsl_reaper = "the grim reaper";
-
-static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
-static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
 
 #define	SWITCH64(x, y) \
 	{ \
@@ -63,9 +59,6 @@
 
 #define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
 
-#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)
-
-
 /*
  * Figure out how much of this delta should be propogated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
@@ -252,7 +245,7 @@
 {
 	dsl_dataset_t *ds = dsv;
 
-	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
+	ASSERT(ds->ds_owner == NULL);
 
 	unique_remove(ds->ds_fsid_guid);
 
@@ -260,32 +253,26 @@
 		dmu_objset_evict(ds->ds_objset);
 
 	if (ds->ds_prev) {
-		dsl_dataset_drop_ref(ds->ds_prev, ds);
+		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	bplist_destroy(&ds->ds_pending_deadlist);
-	if (db != NULL) {
+	if (ds->ds_phys->ds_deadlist_obj != 0)
 		dsl_deadlist_close(&ds->ds_deadlist);
-	} else {
-		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
-		ASSERT(!ds->ds_deadlist.dl_oldfmt);
-	}
 	if (ds->ds_dir)
-		dsl_dir_close(ds->ds_dir, ds);
+		dsl_dir_rele(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
 	mutex_destroy(&ds->ds_lock);
-	mutex_destroy(&ds->ds_recvlock);
 	mutex_destroy(&ds->ds_opening_lock);
-	rw_destroy(&ds->ds_rwlock);
-	cv_destroy(&ds->ds_exclusive_cv);
+	refcount_destroy(&ds->ds_longholds);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
-static int
+int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *headphys;
@@ -301,7 +288,7 @@
 
 	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
-	if (err)
+	if (err != 0)
 		return (err);
 	headphys = headdbuf->db_data;
 	err = zap_value_search(dp->dp_meta_objset,
@@ -310,7 +297,7 @@
 	return (err);
 }
 
-static int
+int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
@@ -330,8 +317,8 @@
 	return (err);
 }
 
-static int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -351,8 +338,8 @@
 	return (err);
 }
 
-static int
-dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	objset_t *mos = dp->dp_meta_objset;
@@ -361,11 +348,10 @@
 	int err;
 	dmu_object_info_t doi;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-	    dsl_pool_sync_context(dp));
+	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
-	if (err)
+	if (err != 0)
 		return (err);
 
 	/* Make sure dsobj has the correct object type. */
@@ -383,12 +369,9 @@
 		ds->ds_phys = dbuf->db_data;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
-
-		rw_init(&ds->ds_rwlock, 0, 0, 0);
-		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
+		refcount_create(&ds->ds_longholds);
 
 		bplist_create(&ds->ds_pending_deadlist);
 		dsl_deadlist_open(&ds->ds_deadlist,
@@ -398,15 +381,13 @@
 		    offsetof(dmu_sendarg_t, dsa_link));
 
 		if (err == 0) {
-			err = dsl_dir_open_obj(dp,
+			err = dsl_dir_hold_obj(dp,
 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		}
-		if (err) {
+		if (err != 0) {
 			mutex_destroy(&ds->ds_lock);
-			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
-			rw_destroy(&ds->ds_rwlock);
-			cv_destroy(&ds->ds_exclusive_cv);
+			refcount_destroy(&ds->ds_longholds);
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
@@ -416,8 +397,8 @@
 
 		if (!dsl_dataset_is_snapshot(ds)) {
 			ds->ds_snapname[0] = '\0';
-			if (ds->ds_phys->ds_prev_snap_obj) {
-				err = dsl_dataset_get_ref(dp,
+			if (ds->ds_phys->ds_prev_snap_obj != 0) {
+				err = dsl_dataset_hold_obj(dp,
 				    ds->ds_phys->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
@@ -433,29 +414,14 @@
 		}
 
 		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
-			/*
-			 * In sync context, we're called with either no lock
-			 * or with the write lock.  If we're not syncing,
-			 * we're always called with the read lock held.
-			 */
-			boolean_t need_lock =
-			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
-			    dsl_pool_sync_context(dp);
-
-			if (need_lock)
-				rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-			err = dsl_prop_get_ds(ds,
-			    "refreservation", sizeof (uint64_t), 1,
-			    &ds->ds_reserved, NULL);
+			err = dsl_prop_get_int_ds(ds,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+			    &ds->ds_reserved);
 			if (err == 0) {
-				err = dsl_prop_get_ds(ds,
-				    "refquota", sizeof (uint64_t), 1,
-				    &ds->ds_quota, NULL);
+				err = dsl_prop_get_int_ds(ds,
+				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+				    &ds->ds_quota);
 			}
-
-			if (need_lock)
-				rw_exit(&dp->dp_config_rwlock);
 		} else {
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
@@ -465,15 +431,13 @@
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
-				dsl_dataset_drop_ref(ds->ds_prev, ds);
-			dsl_dir_close(ds->ds_dir, ds);
+				dsl_dataset_rele(ds->ds_prev, ds);
+			dsl_dir_rele(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
-			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
-			rw_destroy(&ds->ds_rwlock);
-			cv_destroy(&ds->ds_exclusive_cv);
+			refcount_destroy(&ds->ds_longholds);
 			kmem_free(ds, sizeof (dsl_dataset_t));
-			if (err) {
+			if (err != 0) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
@@ -488,90 +452,68 @@
 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
-	mutex_enter(&ds->ds_lock);
-	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
-		mutex_exit(&ds->ds_lock);
-		dmu_buf_rele(ds->ds_dbuf, tag);
-		return (ENOENT);
-	}
-	mutex_exit(&ds->ds_lock);
 	*dsp = ds;
 	return (0);
 }
 
-static int
-dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
-{
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	/*
-	 * In syncing context we don't want the rwlock lock: there
-	 * may be an existing writer waiting for sync phase to
-	 * finish.  We don't need to worry about such writers, since
-	 * sync phase is single-threaded, so the writer can't be
-	 * doing anything while we are active.
-	 */
-	if (dsl_pool_sync_context(dp)) {
-		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
-		return (0);
-	}
-
-	/*
-	 * Normal users will hold the ds_rwlock as a READER until they
-	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
-	 * drop their READER lock after they set the ds_owner field.
-	 *
-	 * If the dataset is being destroyed, the destroy thread will
-	 * obtain a WRITER lock for exclusive access after it's done its
-	 * open-context work and then change the ds_owner to
-	 * dsl_reaper once destruction is assured.  So threads
-	 * may block here temporarily, until the "destructability" of
-	 * the dataset is determined.
-	 */
-	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
-	mutex_enter(&ds->ds_lock);
-	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
-		rw_exit(&dp->dp_config_rwlock);
-		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
-		if (DSL_DATASET_IS_DESTROYED(ds)) {
-			mutex_exit(&ds->ds_lock);
-			dsl_dataset_drop_ref(ds, tag);
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
-			return (ENOENT);
-		}
-		/*
-		 * The dp_config_rwlock lives above the ds_lock. And
-		 * we need to check DSL_DATASET_IS_DESTROYED() while
-		 * holding the ds_lock, so we have to drop and reacquire
-		 * the ds_lock here.
-		 */
-		mutex_exit(&ds->ds_lock);
-		rw_enter(&dp->dp_config_rwlock, RW_READER);
-		mutex_enter(&ds->ds_lock);
-	}
-	mutex_exit(&ds->ds_lock);
-	return (0);
-}
-
 int
-dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
-    dsl_dataset_t **dsp)
+dsl_dataset_hold(dsl_pool_t *dp, const char *name,
+    void *tag, dsl_dataset_t **dsp)
 {
-	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
-
-	if (err)
+	dsl_dir_t *dd;
+	const char *snapname;
+	uint64_t obj;
+	int err = 0;
+
+	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+	if (err != 0)
 		return (err);
-	return (dsl_dataset_hold_ref(*dsp, tag));
+
+	ASSERT(dsl_pool_config_held(dp));
+	obj = dd->dd_phys->dd_head_dataset_obj;
+	if (obj != 0)
+		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+	else
+		err = ENOENT;
+
+	/* we may be looking for a snapshot */
+	if (err == 0 && snapname != NULL) {
+		dsl_dataset_t *ds;
+
+		if (*snapname++ != '@') {
+			dsl_dataset_rele(*dsp, tag);
+			dsl_dir_rele(dd, FTAG);
+			return (ENOENT);
+		}
+
+		dprintf("looking for snapshot '%s'\n", snapname);
+		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+		if (err == 0)
+			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+		dsl_dataset_rele(*dsp, tag);
+
+		if (err == 0) {
+			mutex_enter(&ds->ds_lock);
+			if (ds->ds_snapname[0] == 0)
+				(void) strlcpy(ds->ds_snapname, snapname,
+				    sizeof (ds->ds_snapname));
+			mutex_exit(&ds->ds_lock);
+			*dsp = ds;
+		}
+	}
+
+	dsl_dir_rele(dd, FTAG);
+	return (err);
 }
 
 int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
-	if (err)
+	if (err != 0)
 		return (err);
-	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+	if (!dsl_dataset_tryown(*dsp, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		*dsp = NULL;
 		return (EBUSY);
@@ -580,78 +522,48 @@
 }
 
 int
-dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
-{
-	dsl_dir_t *dd;
-	dsl_pool_t *dp;
-	const char *snapname;
-	uint64_t obj;
-	int err = 0;
-
-	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
-	if (err)
-		return (err);
-
-	dp = dd->dd_pool;
-	obj = dd->dd_phys->dd_head_dataset_obj;
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	if (obj)
-		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
-	else
-		err = ENOENT;
-	if (err)
-		goto out;
-
-	err = dsl_dataset_hold_ref(*dsp, tag);
-
-	/* we may be looking for a snapshot */
-	if (err == 0 && snapname != NULL) {
-		dsl_dataset_t *ds = NULL;
-
-		if (*snapname++ != '@') {
-			dsl_dataset_rele(*dsp, tag);
-			err = ENOENT;
-			goto out;
-		}
-
-		dprintf("looking for snapshot '%s'\n", snapname);
-		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
-		if (err == 0)
-			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
-		dsl_dataset_rele(*dsp, tag);
-
-		ASSERT3U((err == 0), ==, (ds != NULL));
-
-		if (ds) {
-			mutex_enter(&ds->ds_lock);
-			if (ds->ds_snapname[0] == 0)
-				(void) strlcpy(ds->ds_snapname, snapname,
-				    sizeof (ds->ds_snapname));
-			mutex_exit(&ds->ds_lock);
-			err = dsl_dataset_hold_ref(ds, tag);
-			*dsp = err ? NULL : ds;
-		}
-	}
-out:
-	rw_exit(&dp->dp_config_rwlock);
-	dsl_dir_close(dd, FTAG);
-	return (err);
-}
-
-int
-dsl_dataset_own(const char *name, boolean_t inconsistentok,
+dsl_dataset_own(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
-	int err = dsl_dataset_hold(name, tag, dsp);
-	if (err)
+	int err = dsl_dataset_hold(dp, name, tag, dsp);
+	if (err != 0)
 		return (err);
-	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+	if (!dsl_dataset_tryown(*dsp, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		return (EBUSY);
 	}
 	return (0);
 }
 
+/*
+ * See the comment above dsl_pool_hold() for details.  In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called.  After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+	(void) refcount_add(&ds->ds_longholds, tag);
+}
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+	(void) refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+	return (!refcount_is_zero(&ds->ds_longholds));
+}
+
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
@@ -659,7 +571,7 @@
 		(void) strcpy(name, "mos");
 	} else {
 		dsl_dir_name(ds->ds_dir, name);
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			(void) strcat(name, "@");
 			/*
@@ -686,7 +598,7 @@
 		result = 3;	/* "mos" */
 	} else {
 		result = dsl_dir_namelen(ds->ds_dir);
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			++result;	/* adding one for the @-sign */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
@@ -703,64 +615,41 @@
 }
 
 void
-dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 void
-dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
-{
-	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
-		rw_exit(&ds->ds_rwlock);
-	}
-	dsl_dataset_drop_ref(ds, tag);
-}
-
-void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
-	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
-	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
+	ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
-	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
-		rw_exit(&ds->ds_rwlock);
-		cv_broadcast(&ds->ds_exclusive_cv);
-	}
 	mutex_exit(&ds->ds_lock);
-	if (ds->ds_dbuf)
-		dsl_dataset_drop_ref(ds, tag);
+	dsl_dataset_long_rele(ds, tag);
+	if (ds->ds_dbuf != NULL)
+		dsl_dataset_rele(ds, tag);
 	else
 		dsl_dataset_evict(NULL, ds);
 }
 
 boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
 {
 	boolean_t gotit = FALSE;
 
 	mutex_enter(&ds->ds_lock);
-	if (ds->ds_owner == NULL &&
-	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
+	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
 		ds->ds_owner = tag;
-		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
-			rw_exit(&ds->ds_rwlock);
+		dsl_dataset_long_hold(ds, tag);
 		gotit = TRUE;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (gotit);
 }
 
-void
-dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
-{
-	ASSERT3P(owner, ==, ds->ds_owner);
-	if (!RW_WRITE_HELD(&ds->ds_rwlock))
-		rw_enter(&ds->ds_rwlock, RW_WRITER);
-}
-
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
@@ -781,7 +670,7 @@
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
@@ -799,7 +688,7 @@
 	if (origin == NULL) {
 		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 	} else {
-		dsl_dataset_t *ohds;
+		dsl_dataset_t *ohds; /* head of the origin snapshot */
 
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
@@ -816,7 +705,7 @@
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		origin->ds_phys->ds_num_children++;
 
-		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+		VERIFY0(dsl_dataset_hold_obj(dp,
 		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
@@ -828,9 +717,8 @@
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
-			VERIFY(0 == zap_add_int(mos,
-			    origin->ds_phys->ds_next_clones_obj,
-			    dsobj, tx));
+			VERIFY0(zap_add_int(mos,
+			    origin->ds_phys->ds_next_clones_obj, dsobj, tx));
 		}
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -842,7 +730,7 @@
 				    zap_create(mos,
 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 			}
-			VERIFY3U(0, ==, zap_add_int(mos,
+			VERIFY0(zap_add_int(mos,
 			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 		}
 	}
@@ -858,6 +746,16 @@
 	return (dsobj);
 }
 
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *os;
+
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+	dsl_dataset_dirty(ds, tx);
+}
+
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
@@ -866,29 +764,28 @@
 	uint64_t dsobj, ddobj;
 	dsl_dir_t *dd;
 
+	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(lastname[0] != '@');
 
 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
-	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
-
-	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
+
+	dsobj = dsl_dataset_create_sync_dd(dd, origin,
+	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
-	dsl_dir_close(dd, FTAG);
+	dsl_dir_rele(dd, FTAG);
 
 	/*
 	 * If we are creating a clone, make sure we zero out any stale
 	 * data from the origin snapshots zil header.
 	 */
-	if (origin != NULL) {
+	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
 		dsl_dataset_t *ds;
-		objset_t *os;
-
-		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
-		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
-		dsl_dataset_dirty(ds, tx);
+
+		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		dsl_dataset_zero_zil(ds, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
@@ -896,271 +793,65 @@
 }
 
 /*
- * The snapshots must all be in the same pool.
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot, that is still being used
+ * in this file system, from the space currently in use.  To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
  */
-int
-dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
-    nvlist_t *errlist)
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
-	int err;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	nvpair_t *pair;
-	dsl_sync_task_group_t *dstg;
-
-	pair = nvlist_next_nvpair(snaps, NULL);
-	if (pair == NULL)
-		return (0);
-
-	err = spa_open(nvpair_name(pair), &spa, FTAG);
-	if (err)
-		return (err);
-	dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-
-	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
-	    pair = nvlist_next_nvpair(snaps, pair)) {
-		dsl_dataset_t *ds;
-
-		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
-		if (err == 0) {
-			struct dsl_ds_destroyarg *dsda;
-
-			dsl_dataset_make_exclusive(ds, dstg);
-			dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
-			    KM_SLEEP);
-			dsda->ds = ds;
-			dsda->defer = defer;
-			dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-			    dsl_dataset_destroy_sync, dsda, dstg, 0);
-		} else if (err == ENOENT) {
-			err = 0;
-		} else {
-			fnvlist_add_int32(errlist, nvpair_name(pair), err);
-			break;
-		}
-	}
-
-	if (err == 0)
-		err = dsl_sync_task_group_wait(dstg);
-
-	for (dst = list_head(&dstg->dstg_tasks); dst;
-	    dst = list_next(&dstg->dstg_tasks, dst)) {
-		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
-		dsl_dataset_t *ds = dsda->ds;
-
-		/*
-		 * Return the snapshots that triggered the error.
-		 */
-		if (dst->dst_err != 0) {
-			char name[ZFS_MAXNAMELEN];
-			dsl_dataset_name(ds, name);
-			fnvlist_add_int32(errlist, name, dst->dst_err);
-		}
-		ASSERT3P(dsda->rm_origin, ==, NULL);
-		dsl_dataset_disown(ds, dstg);
-		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
-	}
-
-	dsl_sync_task_group_destroy(dstg);
-	spa_close(spa, FTAG);
-	return (err);
-
+	uint64_t mrs_used;
+	uint64_t dlused, dlcomp, dluncomp;
+
+	ASSERT(!dsl_dataset_is_snapshot(ds));
+
+	if (ds->ds_phys->ds_prev_snap_obj != 0)
+		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
+	else
+		mrs_used = 0;
+
+	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
+
+	ASSERT3U(dlused, <=, mrs_used);
+	ds->ds_phys->ds_unique_bytes =
+	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+	    SPA_VERSION_UNIQUE_ACCURATE)
+		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
-static boolean_t
-dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
-{
-	boolean_t might_destroy = B_FALSE;
-
-	mutex_enter(&ds->ds_lock);
-	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
-	    DS_IS_DEFER_DESTROY(ds))
-		might_destroy = B_TRUE;
-	mutex_exit(&ds->ds_lock);
-
-	return (might_destroy);
-}
-
-/*
- * If we're removing a clone, and these three conditions are true:
- *	1) the clone's origin has no other children
- *	2) the clone's origin has no user references
- *	3) the clone's origin has been marked for deferred destruction
- * Then, prepare to remove the origin as part of this sync task group.
- */
-static int
-dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = dsda->ds;
-	dsl_dataset_t *origin = ds->ds_prev;
-
-	if (dsl_dataset_might_destroy_origin(origin)) {
-		char *name;
-		int namelen;
-		int error;
-
-		namelen = dsl_dataset_namelen(origin) + 1;
-		name = kmem_alloc(namelen, KM_SLEEP);
-		dsl_dataset_name(origin, name);
-#ifdef _KERNEL
-		error = zfs_unmount_snap(name, NULL);
-		if (error) {
-			kmem_free(name, namelen);
-			return (error);
-		}
-#endif
-		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
-		kmem_free(name, namelen);
-		if (error)
-			return (error);
-		dsda->rm_origin = origin;
-		dsl_dataset_make_exclusive(origin, tag);
-	}
-
-	return (0);
-}
-
-/*
- * ds must be opened as OWNER.  On return (whether successful or not),
- * ds will be closed and caller can no longer dereference it.
- */
-int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
-{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t count;
 	int err;
-	dsl_sync_task_group_t *dstg;
-	objset_t *os;
-	dsl_dir_t *dd;
-	uint64_t obj;
-	struct dsl_ds_destroyarg dsda = { 0 };
-
-	dsda.ds = ds;
-
-	if (dsl_dataset_is_snapshot(ds)) {
-		/* Destroying a snapshot is simpler */
-		dsl_dataset_make_exclusive(ds, tag);
-
-		dsda.defer = defer;
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
-		    &dsda, tag, 0);
-		ASSERT3P(dsda.rm_origin, ==, NULL);
-		goto out;
-	} else if (defer) {
-		err = EINVAL;
-		goto out;
-	}
-
-	dd = ds->ds_dir;
-
-	if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-		/*
-		 * Check for errors and mark this ds as inconsistent, in
-		 * case we crash while freeing the objects.
-		 */
-		err = dsl_sync_task_do(dd->dd_pool,
-		    dsl_dataset_destroy_begin_check,
-		    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
-		if (err)
-			goto out;
-
-		err = dmu_objset_from_ds(ds, &os);
-		if (err)
-			goto out;
-
-		/*
-		 * Remove all objects while in the open context so that
-		 * there is less work to do in the syncing context.
-		 */
-		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
-		    ds->ds_phys->ds_prev_snap_txg)) {
-			/*
-			 * Ignore errors, if there is not enough disk space
-			 * we will deal with it in dsl_dataset_destroy_sync().
-			 */
-			(void) dmu_free_object(os, obj);
-		}
-		if (err != ESRCH)
-			goto out;
-
-		/*
-		 * Sync out all in-flight IO.
-		 */
-		txg_wait_synced(dd->dd_pool, 0);
-
-		/*
-		 * If we managed to free all the objects in open
-		 * context, the user space accounting should be zero.
-		 */
-		if (ds->ds_phys->ds_bp.blk_fill == 0 &&
-		    dmu_objset_userused_enabled(os)) {
-			uint64_t count;
-
-			ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
-			    &count) != 0 || count == 0);
-			ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
-			    &count) != 0 || count == 0);
-		}
-	}
-
-	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
-	rw_exit(&dd->dd_pool->dp_config_rwlock);
-
-	if (err)
-		goto out;
-
+
+	ASSERT(ds->ds_phys->ds_num_children >= 2);
+	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
 	/*
-	 * Blow away the dsl_dir + head dataset.
-	 */
-	dsl_dataset_make_exclusive(ds, tag);
-	/*
-	 * If we're removing a clone, we might also need to remove its
-	 * origin.
+	 * The err should not be ENOENT, but a bug in a previous version
+	 * of the code could cause upgrade_clones_cb() to not set
+	 * ds_next_snap_obj when it should, leading to a missing entry.
+	 * If we knew that the pool was created after
+	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+	 * ENOENT.  However, at least we can check that we don't have
+	 * too many entries in the next_clones_obj even after failing to
+	 * remove this one.
 	 */
-	do {
-		dsda.need_prep = B_FALSE;
-		if (dsl_dir_is_clone(dd)) {
-			err = dsl_dataset_origin_rm_prep(&dsda, tag);
-			if (err) {
-				dsl_dir_close(dd, FTAG);
-				goto out;
-			}
-		}
-
-		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
-		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-		    dsl_dataset_destroy_sync, &dsda, tag, 0);
-		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
-		    dsl_dir_destroy_sync, dd, FTAG, 0);
-		err = dsl_sync_task_group_wait(dstg);
-		dsl_sync_task_group_destroy(dstg);
-
-		/*
-		 * We could be racing against 'zfs release' or 'zfs destroy -d'
-		 * on the origin snap, in which case we can get EBUSY if we
-		 * needed to destroy the origin snap but were not ready to
-		 * do so.
-		 */
-		if (dsda.need_prep) {
-			ASSERT(err == EBUSY);
-			ASSERT(dsl_dir_is_clone(dd));
-			ASSERT(dsda.rm_origin == NULL);
-		}
-	} while (dsda.need_prep);
-
-	if (dsda.rm_origin != NULL)
-		dsl_dataset_disown(dsda.rm_origin, tag);
-
-	/* if it is successful, dsl_dir_destroy_sync will close the dd */
-	if (err)
-		dsl_dir_close(dd, FTAG);
-out:
-	dsl_dataset_disown(ds, tag);
-	return (err);
+	if (err != ENOENT)
+		VERIFY0(err);
+	ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+	    &count));
+	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
 }
 
+
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
@@ -1201,7 +892,7 @@
 
 	dp = ds->ds_dir->dd_pool;
 
-	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
 	}
@@ -1218,769 +909,6 @@
 	return (B_FALSE);
 }
 
-/*
- * The unique space in the head dataset can be calculated by subtracting
- * the space used in the most recent snapshot, that is still being used
- * in this file system, from the space currently in use.  To figure out
- * the space in the most recent snapshot still in use, we need to take
- * the total space used in the snapshot and subtract out the space that
- * has been freed up since the snapshot was taken.
- */
-static void
-dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
-{
-	uint64_t mrs_used;
-	uint64_t dlused, dlcomp, dluncomp;
-
-	ASSERT(!dsl_dataset_is_snapshot(ds));
-
-	if (ds->ds_phys->ds_prev_snap_obj != 0)
-		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
-	else
-		mrs_used = 0;
-
-	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
-
-	ASSERT3U(dlused, <=, mrs_used);
-	ds->ds_phys->ds_unique_bytes =
-	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
-	    SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-}
-
-struct killarg {
-	dsl_dataset_t *ds;
-	dmu_tx_t *tx;
-};
-
-/* ARGSUSED */
-static int
-kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
-{
-	struct killarg *ka = arg;
-	dmu_tx_t *tx = ka->tx;
-
-	if (bp == NULL)
-		return (0);
-
-	if (zb->zb_level == ZB_ZIL_LEVEL) {
-		ASSERT(zilog != NULL);
-		/*
-		 * It's a block in the intent log.  It has no
-		 * accounting, so just free it.
-		 */
-		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
-	} else {
-		ASSERT(zilog == NULL);
-		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
-		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
-	}
-
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t count;
-	int err;
-
-	/*
-	 * Can't delete a head dataset if there are snapshots of it.
-	 * (Except if the only snapshots are from the branch we cloned
-	 * from.)
-	 */
-	if (ds->ds_prev != NULL &&
-	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-		return (EBUSY);
-
-	/*
-	 * This is really a dsl_dir thing, but check it here so that
-	 * we'll be less likely to leave this dataset inconsistent &
-	 * nearly destroyed.
-	 */
-	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
-	if (err)
-		return (err);
-	if (count != 0)
-		return (EEXIST);
-
-	return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-
-	/* Mark it as inconsistent on-disk, in case we crash */
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
-	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
-}
-
-static int
-dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
-    dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = dsda->ds;
-	dsl_dataset_t *ds_prev = ds->ds_prev;
-
-	if (dsl_dataset_might_destroy_origin(ds_prev)) {
-		struct dsl_ds_destroyarg ndsda = {0};
-
-		/*
-		 * If we're not prepared to remove the origin, don't remove
-		 * the clone either.
-		 */
-		if (dsda->rm_origin == NULL) {
-			dsda->need_prep = B_TRUE;
-			return (EBUSY);
-		}
-
-		ndsda.ds = ds_prev;
-		ndsda.is_origin_rm = B_TRUE;
-		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
-	}
-
-	/*
-	 * If we're not going to remove the origin after all,
-	 * undo the open context setup.
-	 */
-	if (dsda->rm_origin != NULL) {
-		dsl_dataset_disown(dsda->rm_origin, tag);
-		dsda->rm_origin = NULL;
-	}
-
-	return (0);
-}
-
-/*
- * If you add new checks here, you may need to add
- * additional checks to the "temporary" case in
- * snapshot_check() in dmu_objset.c.
- */
-/* ARGSUSED */
-int
-dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	struct dsl_ds_destroyarg *dsda = arg1;
-	dsl_dataset_t *ds = dsda->ds;
-
-	/* we have an owner hold, so noone else can destroy us */
-	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
-
-	/*
-	 * Only allow deferred destroy on pools that support it.
-	 * NOTE: deferred destroy is only supported on snapshots.
-	 */
-	if (dsda->defer) {
-		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
-		    SPA_VERSION_USERREFS)
-			return (ENOTSUP);
-		ASSERT(dsl_dataset_is_snapshot(ds));
-		return (0);
-	}
-
-	/*
-	 * Can't delete a head dataset if there are snapshots of it.
-	 * (Except if the only snapshots are from the branch we cloned
-	 * from.)
-	 */
-	if (ds->ds_prev != NULL &&
-	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-		return (EBUSY);
-
-	/*
-	 * If we made changes this txg, traverse_dsl_dataset won't find
-	 * them.  Try again.
-	 */
-	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
-		return (EAGAIN);
-
-	if (dsl_dataset_is_snapshot(ds)) {
-		/*
-		 * If this snapshot has an elevated user reference count,
-		 * we can't destroy it yet.
-		 */
-		if (ds->ds_userrefs > 0 && !dsda->releasing)
-			return (EBUSY);
-
-		mutex_enter(&ds->ds_lock);
-		/*
-		 * Can't delete a branch point. However, if we're destroying
-		 * a clone and removing its origin due to it having a user
-		 * hold count of 0 and having been marked for deferred destroy,
-		 * it's OK for the origin to have a single clone.
-		 */
-		if (ds->ds_phys->ds_num_children >
-		    (dsda->is_origin_rm ? 2 : 1)) {
-			mutex_exit(&ds->ds_lock);
-			return (EEXIST);
-		}
-		mutex_exit(&ds->ds_lock);
-	} else if (dsl_dir_is_clone(ds->ds_dir)) {
-		return (dsl_dataset_origin_check(dsda, arg2, tx));
-	}
-
-	/* XXX we should do some i/o error checking... */
-	return (0);
-}
-
-struct refsarg {
-	kmutex_t lock;
-	boolean_t gone;
-	kcondvar_t cv;
-};
-
-/* ARGSUSED */
-static void
-dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
-{
-	struct refsarg *arg = argv;
-
-	mutex_enter(&arg->lock);
-	arg->gone = TRUE;
-	cv_signal(&arg->cv);
-	mutex_exit(&arg->lock);
-}
-
-static void
-dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
-{
-	struct refsarg arg;
-
-	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
-	arg.gone = FALSE;
-	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
-	    dsl_dataset_refs_gone);
-	dmu_buf_rele(ds->ds_dbuf, tag);
-	mutex_enter(&arg.lock);
-	while (!arg.gone)
-		cv_wait(&arg.cv, &arg.lock);
-	ASSERT(arg.gone);
-	mutex_exit(&arg.lock);
-	ds->ds_dbuf = NULL;
-	ds->ds_phys = NULL;
-	mutex_destroy(&arg.lock);
-	cv_destroy(&arg.cv);
-}
-
-static void
-remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
-{
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t count;
-	int err;
-
-	ASSERT(ds->ds_phys->ds_num_children >= 2);
-	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
-	/*
-	 * The err should not be ENOENT, but a bug in a previous version
-	 * of the code could cause upgrade_clones_cb() to not set
-	 * ds_next_snap_obj when it should, leading to a missing entry.
-	 * If we knew that the pool was created after
-	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
-	 * ENOENT.  However, at least we can check that we don't have
-	 * too many entries in the next_clones_obj even after failing to
-	 * remove this one.
-	 */
-	if (err != ENOENT) {
-		VERIFY0(err);
-	}
-	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
-	    &count));
-	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
-}
-
-static void
-dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
-{
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	zap_cursor_t zc;
-	zap_attribute_t za;
-
-	/*
-	 * If it is the old version, dd_clones doesn't exist so we can't
-	 * find the clones, but deadlist_remove_key() is a no-op so it
-	 * doesn't matter.
-	 */
-	if (ds->ds_dir->dd_phys->dd_clones == 0)
-		return;
-
-	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
-	    zap_cursor_retrieve(&zc, &za) == 0;
-	    zap_cursor_advance(&zc)) {
-		dsl_dataset_t *clone;
-
-		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-		    za.za_first_integer, FTAG, &clone));
-		if (clone->ds_dir->dd_origin_txg > mintxg) {
-			dsl_deadlist_remove_key(&clone->ds_deadlist,
-			    mintxg, tx);
-			dsl_dataset_remove_clones_key(clone, mintxg, tx);
-		}
-		dsl_dataset_rele(clone, FTAG);
-	}
-	zap_cursor_fini(&zc);
-}
-
-struct process_old_arg {
-	dsl_dataset_t *ds;
-	dsl_dataset_t *ds_prev;
-	boolean_t after_branch_point;
-	zio_t *pio;
-	uint64_t used, comp, uncomp;
-};
-
-static int
-process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
-	struct process_old_arg *poa = arg;
-	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
-
-	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
-		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
-		if (poa->ds_prev && !poa->after_branch_point &&
-		    bp->blk_birth >
-		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
-			poa->ds_prev->ds_phys->ds_unique_bytes +=
-			    bp_get_dsize_sync(dp->dp_spa, bp);
-		}
-	} else {
-		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
-		poa->comp += BP_GET_PSIZE(bp);
-		poa->uncomp += BP_GET_UCSIZE(bp);
-		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
-	}
-	return (0);
-}
-
-static void
-process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
-    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
-{
-	struct process_old_arg poa = { 0 };
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-
-	ASSERT(ds->ds_deadlist.dl_oldfmt);
-	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
-
-	poa.ds = ds;
-	poa.ds_prev = ds_prev;
-	poa.after_branch_point = after_branch_point;
-	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
-	    process_old_cb, &poa, tx));
-	VERIFY0(zio_wait(poa.pio));
-	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
-
-	/* change snapused */
-	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-	    -poa.used, -poa.comp, -poa.uncomp, tx);
-
-	/* swap next's deadlist to our deadlist */
-	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_close(&ds_next->ds_deadlist);
-	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
-	    ds->ds_phys->ds_deadlist_obj);
-	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
-	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
-	    ds_next->ds_phys->ds_deadlist_obj);
-}
-
-static int
-old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-	int err;
-	struct killarg ka;
-
-	/*
-	 * Free everything that we point to (that's born after
-	 * the previous snapshot, if we are a clone)
-	 *
-	 * NB: this should be very quick, because we already
-	 * freed all the objects in open context.
-	 */
-	ka.ds = ds;
-	ka.tx = tx;
-	err = traverse_dataset(ds,
-	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
-	    kill_blkptr, &ka);
-	ASSERT0(err);
-	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
-
-	return (err);
-}
-
-void
-dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
-	struct dsl_ds_destroyarg *dsda = arg1;
-	dsl_dataset_t *ds = dsda->ds;
-	int err;
-	int after_branch_point = FALSE;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	dsl_dataset_t *ds_prev = NULL;
-	boolean_t wont_destroy;
-	uint64_t obj;
-
-	wont_destroy = (dsda->defer &&
-	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
-
-	ASSERT(ds->ds_owner || wont_destroy);
-	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
-	ASSERT(ds->ds_prev == NULL ||
-	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
-	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
-
-	if (wont_destroy) {
-		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
-		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
-		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
-		return;
-	}
-
-	/* We need to log before removing it from the namespace. */
-	spa_history_log_internal_ds(ds, "destroy", tx, "");
-
-	/* signal any waiters that this dataset is going away */
-	mutex_enter(&ds->ds_lock);
-	ds->ds_owner = dsl_reaper;
-	cv_broadcast(&ds->ds_exclusive_cv);
-	mutex_exit(&ds->ds_lock);
-
-	/* Remove our reservation */
-	if (ds->ds_reserved != 0) {
-		dsl_prop_setarg_t psa;
-		uint64_t value = 0;
-
-		dsl_prop_setarg_init_uint64(&psa, "refreservation",
-		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
-		    &value);
-		psa.psa_effective_value = 0;	/* predict default value */
-
-		dsl_dataset_set_reservation_sync(ds, &psa, tx);
-		ASSERT0(ds->ds_reserved);
-	}
-
-	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
-	dsl_scan_ds_destroyed(ds, tx);
-
-	obj = ds->ds_object;
-
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
-		if (ds->ds_prev) {
-			ds_prev = ds->ds_prev;
-		} else {
-			VERIFY(0 == dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
-		}
-		after_branch_point =
-		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
-
-		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
-		if (after_branch_point &&
-		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
-			remove_from_next_clones(ds_prev, obj, tx);
-			if (ds->ds_phys->ds_next_snap_obj != 0) {
-				VERIFY(0 == zap_add_int(mos,
-				    ds_prev->ds_phys->ds_next_clones_obj,
-				    ds->ds_phys->ds_next_snap_obj, tx));
-			}
-		}
-		if (after_branch_point &&
-		    ds->ds_phys->ds_next_snap_obj == 0) {
-			/* This clone is toast. */
-			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
-			ds_prev->ds_phys->ds_num_children--;
-
-			/*
-			 * If the clone's origin has no other clones, no
-			 * user holds, and has been marked for deferred
-			 * deletion, then we should have done the necessary
-			 * destroy setup for it.
-			 */
-			if (ds_prev->ds_phys->ds_num_children == 1 &&
-			    ds_prev->ds_userrefs == 0 &&
-			    DS_IS_DEFER_DESTROY(ds_prev)) {
-				ASSERT3P(dsda->rm_origin, !=, NULL);
-			} else {
-				ASSERT3P(dsda->rm_origin, ==, NULL);
-			}
-		} else if (!after_branch_point) {
-			ds_prev->ds_phys->ds_next_snap_obj =
-			    ds->ds_phys->ds_next_snap_obj;
-		}
-	}
-
-	if (dsl_dataset_is_snapshot(ds)) {
-		dsl_dataset_t *ds_next;
-		uint64_t old_unique;
-		uint64_t used = 0, comp = 0, uncomp = 0;
-
-		VERIFY(0 == dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
-		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
-
-		old_unique = ds_next->ds_phys->ds_unique_bytes;
-
-		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
-		ds_next->ds_phys->ds_prev_snap_obj =
-		    ds->ds_phys->ds_prev_snap_obj;
-		ds_next->ds_phys->ds_prev_snap_txg =
-		    ds->ds_phys->ds_prev_snap_txg;
-		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
-
-
-		if (ds_next->ds_deadlist.dl_oldfmt) {
-			process_old_deadlist(ds, ds_prev, ds_next,
-			    after_branch_point, tx);
-		} else {
-			/* Adjust prev's unique space. */
-			if (ds_prev && !after_branch_point) {
-				dsl_deadlist_space_range(&ds_next->ds_deadlist,
-				    ds_prev->ds_phys->ds_prev_snap_txg,
-				    ds->ds_phys->ds_prev_snap_txg,
-				    &used, &comp, &uncomp);
-				ds_prev->ds_phys->ds_unique_bytes += used;
-			}
-
-			/* Adjust snapused. */
-			dsl_deadlist_space_range(&ds_next->ds_deadlist,
-			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-			    &used, &comp, &uncomp);
-			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-			    -used, -comp, -uncomp, tx);
-
-			/* Move blocks to be freed to pool's free list. */
-			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
-			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
-			    tx);
-			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
-			    DD_USED_HEAD, used, comp, uncomp, tx);
-
-			/* Merge our deadlist into next's and free it. */
-			dsl_deadlist_merge(&ds_next->ds_deadlist,
-			    ds->ds_phys->ds_deadlist_obj, tx);
-		}
-		dsl_deadlist_close(&ds->ds_deadlist);
-		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
-
-		/* Collapse range in clone heads */
-		dsl_dataset_remove_clones_key(ds,
-		    ds->ds_phys->ds_creation_txg, tx);
-
-		if (dsl_dataset_is_snapshot(ds_next)) {
-			dsl_dataset_t *ds_nextnext;
-
-			/*
-			 * Update next's unique to include blocks which
-			 * were previously shared by only this snapshot
-			 * and it.  Those blocks will be born after the
-			 * prev snap and before this snap, and will have
-			 * died after the next snap and before the one
-			 * after that (ie. be on the snap after next's
-			 * deadlist).
-			 */
-			VERIFY(0 == dsl_dataset_hold_obj(dp,
-			    ds_next->ds_phys->ds_next_snap_obj,
-			    FTAG, &ds_nextnext));
-			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
-			    ds->ds_phys->ds_prev_snap_txg,
-			    ds->ds_phys->ds_creation_txg,
-			    &used, &comp, &uncomp);
-			ds_next->ds_phys->ds_unique_bytes += used;
-			dsl_dataset_rele(ds_nextnext, FTAG);
-			ASSERT3P(ds_next->ds_prev, ==, NULL);
-
-			/* Collapse range in this head. */
-			dsl_dataset_t *hds;
-			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
-			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
-			    FTAG, &hds));
-			dsl_deadlist_remove_key(&hds->ds_deadlist,
-			    ds->ds_phys->ds_creation_txg, tx);
-			dsl_dataset_rele(hds, FTAG);
-
-		} else {
-			ASSERT3P(ds_next->ds_prev, ==, ds);
-			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
-			ds_next->ds_prev = NULL;
-			if (ds_prev) {
-				VERIFY(0 == dsl_dataset_get_ref(dp,
-				    ds->ds_phys->ds_prev_snap_obj,
-				    ds_next, &ds_next->ds_prev));
-			}
-
-			dsl_dataset_recalc_head_uniq(ds_next);
-
-			/*
-			 * Reduce the amount of our unconsmed refreservation
-			 * being charged to our parent by the amount of
-			 * new unique data we have gained.
-			 */
-			if (old_unique < ds_next->ds_reserved) {
-				int64_t mrsdelta;
-				uint64_t new_unique =
-				    ds_next->ds_phys->ds_unique_bytes;
-
-				ASSERT(old_unique <= new_unique);
-				mrsdelta = MIN(new_unique - old_unique,
-				    ds_next->ds_reserved - old_unique);
-				dsl_dir_diduse_space(ds->ds_dir,
-				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
-			}
-		}
-		dsl_dataset_rele(ds_next, FTAG);
-	} else {
-		zfeature_info_t *async_destroy =
-		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
-		objset_t *os;
-
-		/*
-		 * There's no next snapshot, so this is a head dataset.
-		 * Destroy the deadlist.  Unless it's a clone, the
-		 * deadlist should be empty.  (If it's a clone, it's
-		 * safe to ignore the deadlist contents.)
-		 */
-		dsl_deadlist_close(&ds->ds_deadlist);
-		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
-		ds->ds_phys->ds_deadlist_obj = 0;
-
-		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
-
-		if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
-			err = old_synchronous_dataset_destroy(ds, tx);
-		} else {
-			/*
-			 * Move the bptree into the pool's list of trees to
-			 * clean up and update space accounting information.
-			 */
-			uint64_t used, comp, uncomp;
-
-			zil_destroy_sync(dmu_objset_zil(os), tx);
-
-			if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
-				spa_feature_incr(dp->dp_spa, async_destroy, tx);
-				dp->dp_bptree_obj = bptree_alloc(mos, tx);
-				VERIFY(zap_add(mos,
-				    DMU_POOL_DIRECTORY_OBJECT,
-				    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
-				    &dp->dp_bptree_obj, tx) == 0);
-			}
-
-			used = ds->ds_dir->dd_phys->dd_used_bytes;
-			comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
-			uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
-
-			ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-			    ds->ds_phys->ds_unique_bytes == used);
-
-			bptree_add(mos, dp->dp_bptree_obj,
-			    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
-			    used, comp, uncomp, tx);
-			dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-			    -used, -comp, -uncomp, tx);
-			dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-			    used, comp, uncomp, tx);
-		}
-
-		if (ds->ds_prev != NULL) {
-			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-				VERIFY3U(0, ==, zap_remove_int(mos,
-				    ds->ds_prev->ds_dir->dd_phys->dd_clones,
-				    ds->ds_object, tx));
-			}
-			dsl_dataset_rele(ds->ds_prev, ds);
-			ds->ds_prev = ds_prev = NULL;
-		}
-	}
-
-	/*
-	 * This must be done after the dsl_traverse(), because it will
-	 * re-open the objset.
-	 */
-	if (ds->ds_objset) {
-		dmu_objset_evict(ds->ds_objset);
-		ds->ds_objset = NULL;
-	}
-
-	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
-		/* Erase the link in the dir */
-		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
-		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
-		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
-		ASSERT(err == 0);
-	} else {
-		/* remove from snapshot namespace */
-		dsl_dataset_t *ds_head;
-		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
-		VERIFY(0 == dsl_dataset_hold_obj(dp,
-		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
-#ifdef ZFS_DEBUG
-		{
-			uint64_t val;
-
-			err = dsl_dataset_snap_lookup(ds_head,
-			    ds->ds_snapname, &val);
-			ASSERT0(err);
-			ASSERT3U(val, ==, obj);
-		}
-#endif
-		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
-		ASSERT(err == 0);
-		dsl_dataset_rele(ds_head, FTAG);
-	}
-
-	if (ds_prev && ds->ds_prev != ds_prev)
-		dsl_dataset_rele(ds_prev, FTAG);
-
-	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-
-	if (ds->ds_phys->ds_next_clones_obj != 0) {
-		uint64_t count;
-		ASSERT(0 == zap_count(mos,
-		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
-		VERIFY(0 == dmu_object_free(mos,
-		    ds->ds_phys->ds_next_clones_obj, tx));
-	}
-	if (ds->ds_phys->ds_props_obj != 0)
-		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
-	if (ds->ds_phys->ds_userrefs_obj != 0)
-		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
-	dsl_dir_close(ds->ds_dir, ds);
-	ds->ds_dir = NULL;
-	dsl_dataset_drain_refs(ds, tag);
-	VERIFY(0 == dmu_object_free(mos, obj, tx));
-
-	if (dsda->rm_origin) {
-		/*
-		 * Remove the origin of the clone we just destroyed.
-		 */
-		struct dsl_ds_destroyarg ndsda = {0};
-
-		ndsda.ds = dsda->rm_origin;
-		dsl_dataset_destroy_sync(&ndsda, tag, tx);
-	}
-}
-
 static int
 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
@@ -2009,13 +937,24 @@
 	return (0);
 }
 
+typedef struct dsl_dataset_snapshot_arg {
+	nvlist_t *ddsa_snaps;
+	nvlist_t *ddsa_props;
+	nvlist_t *ddsa_errors;
+} dsl_dataset_snapshot_arg_t;
+
 int
-dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx)
 {
-	int err;
+	int error;
 	uint64_t value;
 
+	ds->ds_trysnap_txg = tx->tx_txg;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
 	/*
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
@@ -2026,39 +965,87 @@
 	/*
 	 * Check for conflicting snapshot name.
 	 */
-	err = dsl_dataset_snap_lookup(ds, snapname, &value);
-	if (err == 0)
+	error = dsl_dataset_snap_lookup(ds, snapname, &value);
+	if (error == 0)
 		return (EEXIST);
-	if (err != ENOENT)
-		return (err);
-
-	/*
-	 * Check that the dataset's name is not too long.  Name consists
-	 * of the dataset's length + 1 for the @-sign + snapshot name's length
-	 */
-	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
-		return (ENAMETOOLONG);
-
-	err = dsl_dataset_snapshot_reserve_space(ds, tx);
-	if (err)
-		return (err);
-
-	ds->ds_trysnap_txg = tx->tx_txg;
+	if (error != ENOENT)
+		return (error);
+
+	error = dsl_dataset_snapshot_reserve_space(ds, tx);
+	if (error != 0)
+		return (error);
+
 	return (0);
 }
 
+static int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int rv = 0;
+
+	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+		int error = 0;
+		dsl_dataset_t *ds;
+		char *name, *atp;
+		char dsname[MAXNAMELEN];
+
+		name = nvpair_name(pair);
+		if (strlen(name) >= MAXNAMELEN)
+			error = ENAMETOOLONG;
+		if (error == 0) {
+			atp = strchr(name, '@');
+			if (atp == NULL)
+				error = EINVAL;
+			if (error == 0)
+				(void) strlcpy(dsname, name, atp - name + 1);
+		}
+		if (error == 0)
+			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+		if (error == 0) {
+			error = dsl_dataset_snapshot_check_impl(ds,
+			    atp + 1, tx);
+			dsl_dataset_rele(ds, FTAG);
+		}
+
+		if (error != 0) {
+			if (ddsa->ddsa_errors != NULL) {
+				fnvlist_add_int32(ddsa->ddsa_errors,
+				    name, error);
+			}
+			rv = error;
+		}
+	}
+	return (rv);
+}
+
 void
-dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx)
 {
+	static zil_header_t zero_zil;
+
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
-	int err;
-
-	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+	objset_t *os;
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	/*
+	 * If we are on an old pool, the zil must not be active, in which
+	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
+	 */
+	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+	    dmu_objset_from_ds(ds, &os) != 0 ||
+	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
+	    sizeof (zero_zil)) == 0);
+
 
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
@@ -2070,7 +1057,7 @@
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
@@ -2105,9 +1092,9 @@
 			    ds->ds_prev->ds_phys->ds_creation_txg);
 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
-			remove_from_next_clones(ds->ds_prev,
+			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
-			VERIFY3U(0, ==, zap_add_int(mos,
+			VERIFY0(zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
 	}
@@ -2126,9 +1113,6 @@
 	}
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
-	    ds->ds_dir->dd_myname, snapname, dsobj,
-	    ds->ds_phys->ds_prev_snap_txg);
 	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
 	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
 	dsl_deadlist_close(&ds->ds_deadlist);
@@ -2143,13 +1127,12 @@
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
-	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
-	    snapname, 8, 1, &dsobj, tx);
-	ASSERT(err == 0);
+	VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+	    snapname, 8, 1, &dsobj, tx));
 
 	if (ds->ds_prev)
-		dsl_dataset_drop_ref(ds->ds_prev, ds);
-	VERIFY(0 == dsl_dataset_get_ref(dp,
+		dsl_dataset_rele(ds->ds_prev, ds);
+	VERIFY0(dsl_dataset_hold_obj(dp,
 	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 
 	dsl_scan_ds_snapshotted(ds, tx);
@@ -2159,6 +1142,198 @@
 	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
 }
 
+static void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+		dsl_dataset_t *ds;
+		char *name, *atp;
+		char dsname[MAXNAMELEN];
+
+		name = nvpair_name(pair);
+		atp = strchr(name, '@');
+		(void) strlcpy(dsname, name, atp - name + 1);
+		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+		if (ddsa->ddsa_props != NULL) {
+			dsl_props_set_sync_impl(ds->ds_prev,
+			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+		}
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+{
+	dsl_dataset_snapshot_arg_t ddsa;
+	nvpair_t *pair;
+	boolean_t needsuspend;
+	int error;
+	spa_t *spa;
+	char *firstname;
+	nvlist_t *suspended = NULL;
+
+	pair = nvlist_next_nvpair(snaps, NULL);
+	if (pair == NULL)
+		return (0);
+	firstname = nvpair_name(pair);
+
+	error = spa_open(firstname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	spa_close(spa, FTAG);
+
+	if (needsuspend) {
+		suspended = fnvlist_alloc();
+		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(snaps, pair)) {
+			char fsname[MAXNAMELEN];
+			char *snapname = nvpair_name(pair);
+			char *atp;
+			void *cookie;
+
+			atp = strchr(snapname, '@');
+			if (atp == NULL) {
+				error = EINVAL;
+				break;
+			}
+			(void) strlcpy(fsname, snapname, atp - snapname + 1);
+
+			error = zil_suspend(fsname, &cookie);
+			if (error != 0)
+				break;
+			fnvlist_add_uint64(suspended, fsname,
+			    (uintptr_t)cookie);
+		}
+	}
+
+	ddsa.ddsa_snaps = snaps;
+	ddsa.ddsa_props = props;
+	ddsa.ddsa_errors = errors;
+
+	if (error == 0) {
+		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+		    dsl_dataset_snapshot_sync, &ddsa,
+		    fnvlist_num_pairs(snaps) * 3);
+	}
+
+	if (suspended != NULL) {
+		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(suspended, pair)) {
+			zil_resume((void *)(uintptr_t)
+			    fnvpair_value_uint64(pair));
+		}
+		fnvlist_free(suspended);
+	}
+
+	return (error);
+}
+
+typedef struct dsl_dataset_snapshot_tmp_arg {
+	const char *ddsta_fsname;
+	const char *ddsta_snapname;
+	minor_t ddsta_cleanup_minor;
+	const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+		dsl_dataset_rele(ds, FTAG);
+		return (ENOTSUP);
+	}
+	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+	    B_TRUE, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
+	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+    minor_t cleanup_minor, const char *htag)
+{
+	dsl_dataset_snapshot_tmp_arg_t ddsta;
+	int error;
+	spa_t *spa;
+	boolean_t needsuspend;
+	void *cookie;
+
+	ddsta.ddsta_fsname = fsname;
+	ddsta.ddsta_snapname = snapname;
+	ddsta.ddsta_cleanup_minor = cleanup_minor;
+	ddsta.ddsta_htag = htag;
+
+	error = spa_open(fsname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	spa_close(spa, FTAG);
+
+	if (needsuspend) {
+		error = zil_suspend(fsname, &cookie);
+		if (error != 0)
+			return (error);
+	}
+
+	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
+
+	if (needsuspend)
+		zil_resume(cookie);
+	return (error);
+}
+
+
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
@@ -2183,65 +1358,49 @@
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
-	nvlist_t *propval;
-	nvlist_t *val;
-
-	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	nvlist_t *propval = fnvlist_alloc();
+	nvlist_t *val = fnvlist_alloc();
+
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/*
-	 * There may me missing entries in ds_next_clones_obj
+	 * There may be missing entries in ds_next_clones_obj
 	 * due to a bug in a previous version of the code.
 	 * Only trust it if it has the right number of entries.
 	 */
 	if (ds->ds_phys->ds_next_clones_obj != 0) {
-		ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+		ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
 		    &count));
 	}
-	if (count != ds->ds_phys->ds_num_children - 1) {
+	if (count != ds->ds_phys->ds_num_children - 1)
 		goto fail;
-	}
 	for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
 		char buf[ZFS_MAXNAMELEN];
-		/*
-		 * Even though we hold the dp_config_rwlock, the dataset
-		 * may fail to open, returning ENOENT.  If there is a
-		 * thread concurrently attempting to destroy this
-		 * dataset, it will have the ds_rwlock held for
-		 * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
-		 * dsl_dataset_hold_ref() will fail its
-		 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
-		 * dp_config_rwlock, and wait for the destroy progress
-		 * and signal ds_exclusive_cv.  If the destroy was
-		 * successful, we will see that
-		 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
-		 */
-		if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-		    za.za_first_integer, FTAG, &clone) != 0)
-			continue;
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
 		dsl_dir_name(clone->ds_dir, buf);
-		VERIFY(nvlist_add_boolean(val, buf) == 0);
+		fnvlist_add_boolean(val, buf);
 		dsl_dataset_rele(clone, FTAG);
 	}
 	zap_cursor_fini(&zc);
-	VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
-	VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
-	    propval) == 0);
+	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
 fail:
 	nvlist_free(val);
 	nvlist_free(propval);
-	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 }
 
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	uint64_t refd, avail, uobjs, aobjs, ratio;
 
+	ASSERT(dsl_pool_config_held(dp));
+
 	ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
 	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
 	    ds->ds_phys->ds_compressed_bytes);
@@ -2287,10 +1446,8 @@
 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
 		dsl_dataset_t *prev;
 
-		rw_enter(&dp->dp_config_rwlock, RW_READER);
 		int err = dsl_dataset_hold_obj(dp,
 		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-		rw_exit(&dp->dp_config_rwlock);
 		if (err == 0) {
 			err = dsl_dataset_space_written(prev, ds, &written,
 			    &comp, &uncomp);
@@ -2306,6 +1463,9 @@
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	ASSERT(dsl_pool_config_held(dp));
+
 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
 	stat->dds_guid = ds->ds_phys->ds_guid;
@@ -2317,16 +1477,14 @@
 		stat->dds_is_snapshot = B_FALSE;
 		stat->dds_num_clones = 0;
 
-		rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
 		if (dsl_dir_is_clone(ds->ds_dir)) {
 			dsl_dataset_t *ods;
 
-			VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
+			VERIFY0(dsl_dataset_hold_obj(dp,
 			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
 			dsl_dataset_name(ods, stat->dds_origin);
-			dsl_dataset_drop_ref(ods, FTAG);
+			dsl_dataset_rele(ods, FTAG);
 		}
-		rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 	}
 }
 
@@ -2364,8 +1522,7 @@
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-	    dsl_pool_sync_context(dp));
+	ASSERT(dsl_pool_config_held(dp));
 	if (ds->ds_prev == NULL)
 		return (B_FALSE);
 	if (ds->ds_phys->ds_bp.blk_birth >
@@ -2387,237 +1544,225 @@
 	return (B_FALSE);
 }
 
+typedef struct dsl_dataset_rename_snapshot_arg {
+	const char *ddrsa_fsname;
+	const char *ddrsa_oldsnapname;
+	const char *ddrsa_newsnapname;
+	boolean_t ddrsa_recursive;
+	dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
 /* ARGSUSED */
 static int
-dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
 {
-	dsl_dataset_t *ds = arg1;
-	char *newsnapname = arg2;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_dataset_t *hds;
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	int error;
 	uint64_t val;
-	int err;
-
-	err = dsl_dataset_hold_obj(dd->dd_pool,
-	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
-	if (err)
-		return (err);
-
-	/* new name better not be in use */
-	err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
-	dsl_dataset_rele(hds, FTAG);
-
-	if (err == 0)
-		err = EEXIST;
-	else if (err == ENOENT)
-		err = 0;
+
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+	if (error != 0) {
+		/* ignore nonexistent snapshots */
+		return (error == ENOENT ? 0 : error);
+	}
+
+	/* new name should not exist */
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+	if (error == 0)
+		error = EEXIST;
+	else if (error == ENOENT)
+		error = 0;
 
 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
-	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
-		err = ENAMETOOLONG;
-
-	return (err);
+	if (dsl_dir_namelen(hds->ds_dir) + 1 +
+	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
+		error = ENAMETOOLONG;
+
+	return (error);
 }
 
-static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	const char *newsnapname = arg2;
-	dsl_dir_t *dd = ds->ds_dir;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	dsl_dataset_t *hds;
-	int err;
-
-	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
-
-	VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
-	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
-
-	VERIFY(0 == dsl_dataset_get_snapname(ds));
-	err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
-	ASSERT0(err);
-	mutex_enter(&ds->ds_lock);
-	(void) strcpy(ds->ds_snapname, newsnapname);
-	mutex_exit(&ds->ds_lock);
-	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
-	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
-	ASSERT0(err);
-
-	spa_history_log_internal_ds(ds, "rename", tx,
-	    "-> @%s", newsnapname);
-	dsl_dataset_rele(hds, FTAG);
-}
-
-struct renamesnaparg {
-	dsl_sync_task_group_t *dstg;
-	char failed[MAXPATHLEN];
-	char *oldsnap;
-	char *newsnap;
-};
-
 static int
-dsl_snapshot_rename_one(const char *name, void *arg)
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
 {
-	struct renamesnaparg *ra = arg;
-	dsl_dataset_t *ds = NULL;
-	char *snapname;
-	int err;
-
-	snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
-	(void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
-
-	/*
-	 * For recursive snapshot renames the parent won't be changing
-	 * so we just pass name for both the to/from argument.
-	 */
-	err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
-	if (err != 0) {
-		strfree(snapname);
-		return (err == ENOENT ? 0 : err);
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+	if (error != 0)
+		return (error);
+
+	if (ddrsa->ddrsa_recursive) {
+		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
+		    DS_FIND_CHILDREN);
+	} else {
+		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
 	}
-
-#ifdef _KERNEL
-	/*
-	 * For all filesystems undergoing rename, we'll need to unmount it.
-	 */
-	(void) zfs_unmount_snap(snapname, NULL);
-#endif
-	err = dsl_dataset_hold(snapname, ra->dstg, &ds);
-	strfree(snapname);
-	if (err != 0)
-		return (err == ENOENT ? 0 : err);
-
-	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
-	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
-
-	return (0);
+	dsl_dataset_rele(hds, FTAG);
+	return (error);
 }
 
 static int
-dsl_recursive_rename(char *oldname, const char *newname)
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
 {
-	int err;
-	struct renamesnaparg *ra;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	char *cp, *fsname = spa_strdup(oldname);
-	int len = strlen(oldname) + 1;
-
-	/* truncate the snapshot name to get the fsname */
-	cp = strchr(fsname, '@');
-	*cp = '\0';
-
-	err = spa_open(fsname, &spa, FTAG);
-	if (err) {
-		kmem_free(fsname, len);
-		return (err);
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_dataset_t *ds;
+	uint64_t val;
+	dmu_tx_t *tx = ddrsa->ddrsa_tx;
+	int error;
+
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+	ASSERT(error == 0 || error == ENOENT);
+	if (error == ENOENT) {
+		/* ignore nonexistent snapshots */
+		return (0);
 	}
-	ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
-	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-
-	ra->oldsnap = strchr(oldname, '@') + 1;
-	ra->newsnap = strchr(newname, '@') + 1;
-	*ra->failed = '\0';
-
-	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
-	    DS_FIND_CHILDREN);
-	kmem_free(fsname, len);
-
-	if (err == 0) {
-		err = dsl_sync_task_group_wait(ra->dstg);
+
+	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+	/* log before we change the name */
+	spa_history_log_internal_ds(ds, "rename", tx,
+	    "-> @%s", ddrsa->ddrsa_newsnapname);
+
+	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
+	mutex_enter(&ds->ds_lock);
+	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
+	mutex_exit(&ds->ds_lock);
+	VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
+	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+	ddrsa->ddrsa_tx = tx;
+	if (ddrsa->ddrsa_recursive) {
+		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+		    DS_FIND_CHILDREN));
+	} else {
+		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
 	}
-
-	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
-	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
-		dsl_dataset_t *ds = dst->dst_arg1;
-		if (dst->dst_err) {
-			dsl_dir_name(ds->ds_dir, ra->failed);
-			(void) strlcat(ra->failed, "@", sizeof (ra->failed));
-			(void) strlcat(ra->failed, ra->newsnap,
-			    sizeof (ra->failed));
-		}
-		dsl_dataset_rele(ds, ra->dstg);
-	}
-
-	if (err)
-		(void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
-
-	dsl_sync_task_group_destroy(ra->dstg);
-	kmem_free(ra, sizeof (struct renamesnaparg));
-	spa_close(spa, FTAG);
-	return (err);
+	dsl_dataset_rele(hds, FTAG);
+}
+
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+{
+	dsl_dataset_rename_snapshot_arg_t ddrsa;
+
+	ddrsa.ddrsa_fsname = fsname;
+	ddrsa.ddrsa_oldsnapname = oldsnapname;
+	ddrsa.ddrsa_newsnapname = newsnapname;
+	ddrsa.ddrsa_recursive = recursive;
+
+	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+	    dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
 }
 
 static int
-dsl_valid_rename(const char *oldname, void *arg)
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
-	int delta = *(int *)arg;
-
-	if (strlen(oldname) + delta >= MAXNAMELEN)
-		return (ENAMETOOLONG);
-
+	const char *fsname = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int64_t unused_refres_delta;
+	int error;
+
+	error = dsl_dataset_hold(dp, fsname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	/* must not be a snapshot */
+	if (dsl_dataset_is_snapshot(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EINVAL);
+	}
+
+	/* must have a most recent snapshot */
+	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EINVAL);
+	}
+
+	if (dsl_dataset_long_held(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EBUSY);
+	}
+
+	/*
+	 * Check if the snap we are rolling back to uses more than
+	 * the refquota.
+	 */
+	if (ds->ds_quota != 0 &&
+	    ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EDQUOT);
+	}
+
+	/*
+	 * When we do the clone swap, we will temporarily use more space
+	 * due to the refreservation (the head will no longer have any
+	 * unique space, so the entire amount of the refreservation will need
+	 * to be free).  We will immediately destroy the clone, freeing
+	 * this space, but the freeing happens over many txg's.
+	 */
+	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+	    ds->ds_phys->ds_unique_bytes);
+
+	if (unused_refres_delta > 0 &&
+	    unused_refres_delta >
+	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (ENOSPC);
+	}
+
+	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
-#pragma weak dmu_objset_rename = dsl_dataset_rename
-int
-dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
+static void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd;
-	dsl_dataset_t *ds;
-	const char *tail;
-	int err;
-
-	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
-	if (err)
-		return (err);
-
-	if (tail == NULL) {
-		int delta = strlen(newname) - strlen(oldname);
-
-		/* if we're growing, validate child name lengths */
-		if (delta > 0)
-			err = dmu_objset_find(oldname, dsl_valid_rename,
-			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
-
-		if (err == 0)
-			err = dsl_dir_rename(dd, newname);
-		dsl_dir_close(dd, FTAG);
-		return (err);
-	}
-
-	if (tail[0] != '@') {
-		/* the name ended in a nonexistent component */
-		dsl_dir_close(dd, FTAG);
-		return (ENOENT);
-	}
-
-	dsl_dir_close(dd, FTAG);
-
-	/* new name must be snapshot in same filesystem */
-	tail = strchr(newname, '@');
-	if (tail == NULL)
-		return (EINVAL);
-	tail++;
-	if (strncmp(oldname, newname, tail - newname) != 0)
-		return (EXDEV);
-
-	if (recursive) {
-		err = dsl_recursive_rename(oldname, newname);
-	} else {
-		err = dsl_dataset_hold(oldname, FTAG, &ds);
-		if (err)
-			return (err);
-
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    dsl_dataset_snapshot_rename_check,
-		    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
-
-		dsl_dataset_rele(ds, FTAG);
-	}
-
-	return (err);
+	const char *fsname = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds, *clone;
+	uint64_t cloneobj;
+
+	VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
+
+	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
+
+	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+	dsl_dataset_zero_zil(ds, tx);
+
+	dsl_destroy_head_sync_impl(clone, tx);
+
+	dsl_dataset_rele(clone, FTAG);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_rollback(const char *fsname)
+{
+	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+	    dsl_dataset_rollback_sync, (void *)fsname, 1));
 }
 
 struct promotenode {
@@ -2625,49 +1770,66 @@
 	dsl_dataset_t *ds;
 };
 
-struct promotearg {
+typedef struct dsl_dataset_promote_arg {
+	const char *ddpa_clonename;
+	dsl_dataset_t *ddpa_clone;
 	list_t shared_snaps, origin_snaps, clone_snaps;
-	dsl_dataset_t *origin_origin;
+	dsl_dataset_t *origin_origin; /* origin of the origin */
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	char *err_ds;
-};
+} dsl_dataset_promote_arg_t;
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
-static boolean_t snaplist_unstable(list_t *l);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+    void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
 
 static int
-dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *hds = arg1;
-	struct promotearg *pa = arg2;
-	struct promotenode *snap = list_head(&pa->shared_snaps);
-	dsl_dataset_t *origin_ds = snap->ds;
+	dsl_dataset_promote_arg_t *ddpa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	struct promotenode *snap;
+	dsl_dataset_t *origin_ds;
 	int err;
 	uint64_t unused;
 
-	/* Check that it is a real clone */
-	if (!dsl_dir_is_clone(hds->ds_dir))
-		return (EINVAL);
-
-	/* Since this is so expensive, don't do the preliminary check */
-	if (!dmu_tx_is_syncing(tx))
+	err = promote_hold(ddpa, dp, FTAG);
+	if (err != 0)
+		return (err);
+
+	hds = ddpa->ddpa_clone;
+
+	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+		promote_rele(ddpa, FTAG);
+		return (EXDEV);
+	}
+
+	/*
+	 * Compute and check the amount of space to transfer.  Since this is
+	 * so expensive, don't do the preliminary check.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		promote_rele(ddpa, FTAG);
 		return (0);
-
-	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
-		return (EXDEV);
+	}
+
+	snap = list_head(&ddpa->shared_snaps);
+	origin_ds = snap->ds;
 
 	/* compute origin's new unique space */
-	snap = list_tail(&pa->clone_snaps);
+	snap = list_tail(&ddpa->clone_snaps);
 	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
 	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-	    &pa->unique, &unused, &unused);
+	    &ddpa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
 	 *
 	 * Compute space to transfer.  Consider the incremental changes
-	 * to used for each snapshot:
+	 * to used by each snapshot:
 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
 	 * So each snapshot gave birth to:
 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
@@ -2678,18 +1840,28 @@
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
-	pa->used = origin_ds->ds_phys->ds_referenced_bytes;
-	pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
-	pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
-	for (snap = list_head(&pa->shared_snaps); snap;
-	    snap = list_next(&pa->shared_snaps, snap)) {
+	ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
+	ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
+	ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+	for (snap = list_head(&ddpa->shared_snaps); snap;
+	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
+		/*
+		 * If there are long holds, we won't be able to evict
+		 * the objset.
+		 */
+		if (dsl_dataset_long_held(ds)) {
+			err = EBUSY;
+			goto out;
+		}
+
 		/* Check that the snapshot name does not conflict */
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_get_snapname(ds));
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
+			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
 			err = EEXIST;
 			goto out;
 		}
@@ -2702,26 +1874,27 @@
 
 		dsl_deadlist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp);
-		pa->used += dlused;
-		pa->comp += dlcomp;
-		pa->uncomp += dluncomp;
+		ddpa->used += dlused;
+		ddpa->comp += dlcomp;
+		ddpa->uncomp += dluncomp;
 	}
 
 	/*
 	 * If we are a clone of a clone then we never reached ORIGIN,
 	 * so we need to subtract out the clone origin's used space.
 	 */
-	if (pa->origin_origin) {
-		pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
-		pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
-		pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
+	if (ddpa->origin_origin) {
+		ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
+		ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
+		ddpa->uncomp -=
+		    ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
 	}
 
 	/* Check that there is enough space here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
-	    pa->used);
-	if (err)
-		return (err);
+	    ddpa->used);
+	if (err != 0)
+		goto out;
 
 	/*
 	 * Compute the amounts of space that will be used by snapshots
@@ -2739,68 +1912,75 @@
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
-		snap = list_head(&pa->origin_snaps);
-		err = snaplist_space(&pa->shared_snaps,
-		    snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
-		if (err)
-			return (err);
-
-		err = snaplist_space(&pa->clone_snaps,
+		snap = list_head(&ddpa->origin_snaps);
+		err = snaplist_space(&ddpa->shared_snaps,
+		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+		if (err != 0)
+			goto out;
+
+		err = snaplist_space(&ddpa->clone_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &space);
-		if (err)
-			return (err);
-		pa->cloneusedsnap += space;
+		if (err != 0)
+			goto out;
+		ddpa->cloneusedsnap += space;
 	}
 	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
-		err = snaplist_space(&pa->origin_snaps,
-		    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
-		if (err)
-			return (err);
+		err = snaplist_space(&ddpa->origin_snaps,
+		    origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
+		if (err != 0)
+			goto out;
 	}
 
-	return (0);
 out:
-	pa->err_ds =  snap->ds->ds_snapname;
+	promote_rele(ddpa, FTAG);
 	return (err);
 }
 
 static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *hds = arg1;
-	struct promotearg *pa = arg2;
-	struct promotenode *snap = list_head(&pa->shared_snaps);
-	dsl_dataset_t *origin_ds = snap->ds;
+	dsl_dataset_promote_arg_t *ddpa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	struct promotenode *snap;
+	dsl_dataset_t *origin_ds;
 	dsl_dataset_t *origin_head;
-	dsl_dir_t *dd = hds->ds_dir;
-	dsl_pool_t *dp = hds->ds_dir->dd_pool;
+	dsl_dir_t *dd;
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
 
-	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
-
-	snap = list_head(&pa->origin_snaps);
+	VERIFY0(promote_hold(ddpa, dp, FTAG));
+	hds = ddpa->ddpa_clone;
+
+	ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
+
+	snap = list_head(&ddpa->shared_snaps);
+	origin_ds = snap->ds;
+	dd = hds->ds_dir;
+
+	snap = list_head(&ddpa->origin_snaps);
 	origin_head = snap->ds;
 
 	/*
 	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
-	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
+	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
 	    NULL, FTAG, &odd));
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
 	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
-	snap = list_tail(&pa->clone_snaps);
+	snap = list_tail(&ddpa->clone_snaps);
 	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
 	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
 	if (origin_ds->ds_phys->ds_next_clones_obj) {
-		remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
-		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		dsl_dataset_remove_from_next_clones(origin_ds,
+		    snap->ds->ds_object, tx);
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    origin_ds->ds_phys->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
@@ -2817,39 +1997,43 @@
 
 	/* change dd_clone entries */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		VERIFY0(zap_remove_int(dp->dp_meta_objset,
 		    odd->dd_phys->dd_clones, hds->ds_object, tx));
-		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
-		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
 		    hds->ds_object, tx));
 
-		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
-		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
+		VERIFY0(zap_remove_int(dp->dp_meta_objset,
+		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
 		    origin_head->ds_object, tx));
 		if (dd->dd_phys->dd_clones == 0) {
 			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
 			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 		}
-		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
-
 	}
 
 	/* move snapshots to this dir */
-	for (snap = list_head(&pa->shared_snaps); snap;
-	    snap = list_next(&pa->shared_snaps, snap)) {
+	for (snap = list_head(&ddpa->shared_snaps); snap;
+	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		dsl_dataset_t *ds = snap->ds;
 
-		/* unregister props as dsl_dir is changing */
+		/*
+		 * Property callbacks are registered to a particular
+		 * dsl_dir.  Since ours is changing, evict the objset
+		 * so that they will be unregistered from the old dsl_dir.
+		 */
 		if (ds->ds_objset) {
 			dmu_objset_evict(ds->ds_objset);
 			ds->ds_objset = NULL;
 		}
+
 		/* move snap name entry */
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
-		VERIFY(0 == dsl_dataset_snap_remove(origin_head,
+		VERIFY0(dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_snap_remove(origin_head,
 		    ds->ds_snapname, tx));
-		VERIFY(0 == zap_add(dp->dp_meta_objset,
+		VERIFY0(zap_add(dp->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 
@@ -2858,8 +2042,8 @@
 		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
 		ds->ds_phys->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
-		dsl_dir_close(ds->ds_dir, ds);
-		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
+		dsl_dir_rele(ds->ds_dir, ds);
+		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 		/* move any clone references */
@@ -2883,20 +2067,20 @@
 					continue;
 				}
 
-				VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+				VERIFY0(dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &cnds));
 				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
 
-				VERIFY3U(zap_remove_int(dp->dp_meta_objset,
-				    odd->dd_phys->dd_clones, o, tx), ==, 0);
-				VERIFY3U(zap_add_int(dp->dp_meta_objset,
-				    dd->dd_phys->dd_clones, o, tx), ==, 0);
+				VERIFY0(zap_remove_int(dp->dp_meta_objset,
+				    odd->dd_phys->dd_clones, o, tx));
+				VERIFY0(zap_add_int(dp->dp_meta_objset,
+				    dd->dd_phys->dd_clones, o, tx));
 				dsl_dataset_rele(cnds, FTAG);
 			}
 			zap_cursor_fini(&zc);
 		}
 
-		ASSERT0(dsl_prop_numcb(ds));
+		ASSERT(!dsl_prop_hascb(ds));
 	}
 
 	/*
@@ -2906,31 +2090,31 @@
 	 * is true for each of {clone,origin} independently.
 	 */
 
-	delta = pa->cloneusedsnap -
+	delta = ddpa->cloneusedsnap -
 	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
-	ASSERT3U(pa->used, >=, delta);
+	ASSERT3U(ddpa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
-	    pa->used - delta, pa->comp, pa->uncomp, tx);
-
-	delta = pa->originusedsnap -
+	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
+
+	delta = ddpa->originusedsnap -
 	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
-	ASSERT3U(pa->used, >=, -delta);
+	ASSERT3U(ddpa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
-	    -pa->used - delta, -pa->comp, -pa->uncomp, tx);
-
-	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
+	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
+
+	origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
 
 	/* log history record */
 	spa_history_log_internal_ds(hds, "promote", tx, "");
 
-	dsl_dir_close(odd, FTAG);
+	dsl_dir_rele(odd, FTAG);
+	promote_rele(ddpa, FTAG);
 }
 
-static char *snaplist_tag = "snaplist";
 /*
  * Make a list of dsl_dataset_t's for the snapshots between first_obj
  * (exclusive) and last_obj (inclusive).  The list will be in reverse
@@ -2938,13 +2122,11 @@
  * snapshots back to this dataset's origin.
  */
 static int
-snaplist_make(dsl_pool_t *dp, boolean_t own,
-    uint64_t first_obj, uint64_t last_obj, list_t *l)
+snaplist_make(dsl_pool_t *dp,
+    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
 {
 	uint64_t obj = last_obj;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
-
 	list_create(l, sizeof (struct promotenode),
 	    offsetof(struct promotenode, link));
 
@@ -2953,28 +2135,15 @@
 		struct promotenode *snap;
 		int err;
 
-		if (own) {
-			err = dsl_dataset_own_obj(dp, obj,
-			    0, snaplist_tag, &ds);
-			if (err == 0)
-				dsl_dataset_make_exclusive(ds, snaplist_tag);
-		} else {
-			err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
-		}
-		if (err == ENOENT) {
-			/* lost race with snapshot destroy */
-			struct promotenode *last = list_tail(l);
-			ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
-			obj = last->ds->ds_phys->ds_prev_snap_obj;
-			continue;
-		} else if (err) {
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+		ASSERT(err != ENOENT);
+		if (err != 0)
 			return (err);
-		}
 
 		if (first_obj == 0)
 			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
 
-		snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
+		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
 		snap->ds = ds;
 		list_insert_tail(l, snap);
 		obj = ds->ds_phys->ds_prev_snap_obj;
@@ -2999,208 +2168,209 @@
 }
 
 static void
-snaplist_destroy(list_t *l, boolean_t own)
+snaplist_destroy(list_t *l, void *tag)
 {
 	struct promotenode *snap;
 
-	if (!l || !list_link_active(&l->list_head))
+	if (l == NULL || !list_link_active(&l->list_head))
 		return;
 
 	while ((snap = list_tail(l)) != NULL) {
 		list_remove(l, snap);
-		if (own)
-			dsl_dataset_disown(snap->ds, snaplist_tag);
-		else
-			dsl_dataset_rele(snap->ds, snaplist_tag);
-		kmem_free(snap, sizeof (struct promotenode));
+		dsl_dataset_rele(snap->ds, tag);
+		kmem_free(snap, sizeof (*snap));
 	}
 	list_destroy(l);
 }
 
+static int
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
+{
+	int error;
+	dsl_dir_t *dd;
+	struct promotenode *snap;
+
+	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+	    &ddpa->ddpa_clone);
+	if (error != 0)
+		return (error);
+	dd = ddpa->ddpa_clone->ds_dir;
+
+	if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
+	    !dsl_dir_is_clone(dd)) {
+		dsl_dataset_rele(ddpa->ddpa_clone, tag);
+		return (EINVAL);
+	}
+
+	error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
+	    &ddpa->shared_snaps, tag);
+	if (error != 0)
+		goto out;
+
+	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+	    &ddpa->clone_snaps, tag);
+	if (error != 0)
+		goto out;
+
+	snap = list_head(&ddpa->shared_snaps);
+	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
+	error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
+	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
+	    &ddpa->origin_snaps, tag);
+	if (error != 0)
+		goto out;
+
+	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+		error = dsl_dataset_hold_obj(dp,
+		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
+		    tag, &ddpa->origin_origin);
+		if (error != 0)
+			goto out;
+	}
+out:
+	if (error != 0)
+		promote_rele(ddpa, tag);
+	return (error);
+}
+
+static void
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
+{
+	snaplist_destroy(&ddpa->shared_snaps, tag);
+	snaplist_destroy(&ddpa->clone_snaps, tag);
+	snaplist_destroy(&ddpa->origin_snaps, tag);
+	if (ddpa->origin_origin != NULL)
+		dsl_dataset_rele(ddpa->origin_origin, tag);
+	dsl_dataset_rele(ddpa->ddpa_clone, tag);
+}
+
 /*
- * Promote a clone.  Nomenclature note:
- * "clone" or "cds": the original clone which is being promoted
- * "origin" or "ods": the snapshot which is originally clone's origin
- * "origin head" or "ohds": the dataset which is the head
- * (filesystem/volume) for the origin
- * "origin origin": the origin of the origin's filesystem (typically
- * NULL, indicating that the clone is not a clone of a clone).
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name.  (It must be at least MAXNAMELEN bytes long.)
  */
 int
 dsl_dataset_promote(const char *name, char *conflsnap)
 {
-	dsl_dataset_t *ds;
-	dsl_dir_t *dd;
-	dsl_pool_t *dp;
-	dmu_object_info_t doi;
-	struct promotearg pa = { 0 };
-	struct promotenode *snap;
-	int err;
-
-	err = dsl_dataset_hold(name, FTAG, &ds);
-	if (err)
-		return (err);
-	dd = ds->ds_dir;
-	dp = dd->dd_pool;
-
-	err = dmu_object_info(dp->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, &doi);
-	if (err) {
-		dsl_dataset_rele(ds, FTAG);
-		return (err);
-	}
-
-	if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
-		dsl_dataset_rele(ds, FTAG);
-		return (EINVAL);
-	}
+	dsl_dataset_promote_arg_t ddpa = { 0 };
+	uint64_t numsnaps;
+	int error;
+	objset_t *os;
 
 	/*
-	 * We are going to inherit all the snapshots taken before our
-	 * origin (i.e., our new origin will be our parent's origin).
-	 * Take ownership of them so that we can rename them into our
-	 * namespace.
+	 * We will modify space proportional to the number of
+	 * snapshots.  Compute numsnaps.
 	 */
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-	err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
-	    &pa.shared_snaps);
-	if (err != 0)
-		goto out;
-
-	err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
-	if (err != 0)
-		goto out;
-
-	snap = list_head(&pa.shared_snaps);
-	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
-	err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
-	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
-	if (err != 0)
-		goto out;
-
-	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
-		err = dsl_dataset_hold_obj(dp,
-		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
-		    FTAG, &pa.origin_origin);
-		if (err != 0)
-			goto out;
-	}
-
-out:
-	rw_exit(&dp->dp_config_rwlock);
-
-	/*
-	 * Add in 128x the snapnames zapobj size, since we will be moving
-	 * a bunch of snapnames to the promoted ds, and dirtying their
-	 * bonus buffers.
-	 */
-	if (err == 0) {
-		err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
-		    dsl_dataset_promote_sync, ds, &pa,
-		    2 + 2 * doi.doi_physical_blocks_512);
-		if (err && pa.err_ds && conflsnap)
-			(void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
-	}
-
-	snaplist_destroy(&pa.shared_snaps, B_TRUE);
-	snaplist_destroy(&pa.clone_snaps, B_FALSE);
-	snaplist_destroy(&pa.origin_snaps, B_FALSE);
-	if (pa.origin_origin)
-		dsl_dataset_rele(pa.origin_origin, FTAG);
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
+	error = dmu_objset_hold(name, FTAG, &os);
+	if (error != 0)
+		return (error);
+	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+	    dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
+	dmu_objset_rele(os, FTAG);
+	if (error != 0)
+		return (error);
+
+	ddpa.ddpa_clonename = name;
+	ddpa.err_ds = conflsnap;
+
+	return (dsl_sync_task(name, dsl_dataset_promote_check,
+	    dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
 }
 
-struct cloneswaparg {
-	dsl_dataset_t *cds; /* clone dataset */
-	dsl_dataset_t *ohds; /* origin's head dataset */
-	boolean_t force;
-	int64_t unused_refres_delta; /* change in unconsumed refreservation */
-};
-
-/* ARGSUSED */
-static int
-dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
+int
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, boolean_t force)
 {
-	struct cloneswaparg *csa = arg1;
+	int64_t unused_refres_delta;
 
 	/* they should both be heads */
-	if (dsl_dataset_is_snapshot(csa->cds) ||
-	    dsl_dataset_is_snapshot(csa->ohds))
+	if (dsl_dataset_is_snapshot(clone) ||
+	    dsl_dataset_is_snapshot(origin_head))
 		return (EINVAL);
 
 	/* the branch point should be just before them */
-	if (csa->cds->ds_prev != csa->ohds->ds_prev)
+	if (clone->ds_prev != origin_head->ds_prev)
 		return (EINVAL);
 
-	/* cds should be the clone (unless they are unrelated) */
-	if (csa->cds->ds_prev != NULL &&
-	    csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
-	    csa->ohds->ds_object !=
-	    csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
+	/* clone should be the clone (unless they are unrelated) */
+	if (clone->ds_prev != NULL &&
+	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+	    origin_head->ds_object !=
+	    clone->ds_prev->ds_phys->ds_next_snap_obj)
 		return (EINVAL);
 
 	/* the clone should be a child of the origin */
-	if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
+	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
 		return (EINVAL);
 
-	/* ohds shouldn't be modified unless 'force' */
-	if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
+	/* origin_head shouldn't be modified unless 'force' */
+	if (!force && dsl_dataset_modified_since_lastsnap(origin_head))
 		return (ETXTBSY);
 
-	/* adjust amount of any unconsumed refreservation */
-	csa->unused_refres_delta =
-	    (int64_t)MIN(csa->ohds->ds_reserved,
-	    csa->ohds->ds_phys->ds_unique_bytes) -
-	    (int64_t)MIN(csa->ohds->ds_reserved,
-	    csa->cds->ds_phys->ds_unique_bytes);
-
-	if (csa->unused_refres_delta > 0 &&
-	    csa->unused_refres_delta >
-	    dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
+	/* origin_head should have no long holds (e.g. is not mounted) */
+	if (dsl_dataset_long_held(origin_head))
+		return (EBUSY);
+
+	/* check amount of any unconsumed refreservation */
+	unused_refres_delta =
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    origin_head->ds_phys->ds_unique_bytes) -
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    clone->ds_phys->ds_unique_bytes);
+
+	if (unused_refres_delta > 0 &&
+	    unused_refres_delta >
+	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
 		return (ENOSPC);
 
-	if (csa->ohds->ds_quota != 0 &&
-	    csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
+	/* clone can't be over the head's refquota */
+	if (origin_head->ds_quota != 0 &&
+	    clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
 		return (EDQUOT);
 
 	return (0);
 }
 
-/* ARGSUSED */
-static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+void
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, dmu_tx_t *tx)
 {
-	struct cloneswaparg *csa = arg1;
-	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
-
-	ASSERT(csa->cds->ds_reserved == 0);
-	ASSERT(csa->ohds->ds_quota == 0 ||
-	    csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
-
-	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
-	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
-
-	if (csa->cds->ds_objset != NULL) {
-		dmu_objset_evict(csa->cds->ds_objset);
-		csa->cds->ds_objset = NULL;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int64_t unused_refres_delta;
+
+	ASSERT(clone->ds_reserved == 0);
+	ASSERT(origin_head->ds_quota == 0 ||
+	    clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
+
+	dmu_buf_will_dirty(clone->ds_dbuf, tx);
+	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+
+	if (clone->ds_objset != NULL) {
+		dmu_objset_evict(clone->ds_objset);
+		clone->ds_objset = NULL;
 	}
 
-	if (csa->ohds->ds_objset != NULL) {
-		dmu_objset_evict(csa->ohds->ds_objset);
-		csa->ohds->ds_objset = NULL;
+	if (origin_head->ds_objset != NULL) {
+		dmu_objset_evict(origin_head->ds_objset);
+		origin_head->ds_objset = NULL;
 	}
 
+	unused_refres_delta =
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    origin_head->ds_phys->ds_unique_bytes) -
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    clone->ds_phys->ds_unique_bytes);
+
 	/*
 	 * Reset origin's unique bytes, if it exists.
 	 */
-	if (csa->cds->ds_prev) {
-		dsl_dataset_t *origin = csa->cds->ds_prev;
+	if (clone->ds_prev) {
+		dsl_dataset_t *origin = clone->ds_prev;
 		uint64_t comp, uncomp;
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
-		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+		dsl_deadlist_space_range(&clone->ds_deadlist,
 		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
 		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
 	}
@@ -3208,9 +2378,9 @@
 	/* swap blkptrs */
 	{
 		blkptr_t tmp;
-		tmp = csa->ohds->ds_phys->ds_bp;
-		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
-		csa->cds->ds_phys->ds_bp = tmp;
+		tmp = origin_head->ds_phys->ds_bp;
+		origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
+		clone->ds_phys->ds_bp = tmp;
 	}
 
 	/* set dd_*_bytes */
@@ -3219,25 +2389,25 @@
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
-		ASSERT3U(csa->cds->ds_dir->dd_phys->
+		ASSERT3U(clone->ds_dir->dd_phys->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
-		dsl_deadlist_space(&csa->cds->ds_deadlist,
+		dsl_deadlist_space(&clone->ds_deadlist,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
-		dsl_deadlist_space(&csa->ohds->ds_deadlist,
+		dsl_deadlist_space(&origin_head->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
-		dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
-		    (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
-		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
-		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
-		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
+		dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
+		    (origin_head->ds_phys->ds_referenced_bytes + odl_used);
+		dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
+		    (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
+		duncomp = clone->ds_phys->ds_uncompressed_bytes +
 		    cdl_uncomp -
-		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
-
-		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
+		    (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+
+		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
 		    dused, dcomp, duncomp, tx);
-		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
+		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
 		    -dused, -dcomp, -duncomp, tx);
 
 		/*
@@ -3246,86 +2416,46 @@
 		 * deadlist (since that's the only thing that's
 		 * changing that affects the snapused).
 		 */
-		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
-		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+		dsl_deadlist_space_range(&clone->ds_deadlist,
+		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
-		dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
-		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+		dsl_deadlist_space_range(&origin_head->ds_deadlist,
+		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &odl_used, &odl_comp, &odl_uncomp);
-		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
+		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, tx);
 	}
 
 	/* swap ds_*_bytes */
-	SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
-	    csa->cds->ds_phys->ds_referenced_bytes);
-	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
-	    csa->cds->ds_phys->ds_compressed_bytes);
-	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
-	    csa->cds->ds_phys->ds_uncompressed_bytes);
-	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
-	    csa->cds->ds_phys->ds_unique_bytes);
+	SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
+	    clone->ds_phys->ds_referenced_bytes);
+	SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
+	    clone->ds_phys->ds_compressed_bytes);
+	SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
+	    clone->ds_phys->ds_uncompressed_bytes);
+	SWITCH64(origin_head->ds_phys->ds_unique_bytes,
+	    clone->ds_phys->ds_unique_bytes);
 
 	/* apply any parent delta for change in unconsumed refreservation */
-	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
-	    csa->unused_refres_delta, 0, 0, tx);
+	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+	    unused_refres_delta, 0, 0, tx);
 
 	/*
 	 * Swap deadlists.
 	 */
-	dsl_deadlist_close(&csa->cds->ds_deadlist);
-	dsl_deadlist_close(&csa->ohds->ds_deadlist);
-	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
-	    csa->cds->ds_phys->ds_deadlist_obj);
-	dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
-	    csa->cds->ds_phys->ds_deadlist_obj);
-	dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
-	    csa->ohds->ds_phys->ds_deadlist_obj);
-
-	dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
-
-	spa_history_log_internal_ds(csa->cds, "clone swap", tx,
-	    "parent=%s", csa->ohds->ds_dir->dd_myname);
-}
-
-/*
- * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
- * recv" into an existing fs to swizzle the file system to the new
- * version, and by "zfs rollback".  Can also be used to swap two
- * independent head datasets if neither has any snapshots.
- */
-int
-dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
-    boolean_t force)
-{
-	struct cloneswaparg csa;
-	int error;
-
-	ASSERT(clone->ds_owner);
-	ASSERT(origin_head->ds_owner);
-retry:
-	/*
-	 * Need exclusive access for the swap. If we're swapping these
-	 * datasets back after an error, we already hold the locks.
-	 */
-	if (!RW_WRITE_HELD(&clone->ds_rwlock))
-		rw_enter(&clone->ds_rwlock, RW_WRITER);
-	if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
-	    !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
-		rw_exit(&clone->ds_rwlock);
-		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
-		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
-			rw_exit(&origin_head->ds_rwlock);
-			goto retry;
-		}
-	}
-	csa.cds = clone;
-	csa.ohds = origin_head;
-	csa.force = force;
-	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
-	    dsl_dataset_clone_swap_check,
-	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
-	return (error);
+	dsl_deadlist_close(&clone->ds_deadlist);
+	dsl_deadlist_close(&origin_head->ds_deadlist);
+	SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
+	    clone->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+	    clone->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+	    origin_head->ds_phys->ds_deadlist_obj);
+
+	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+
+	spa_history_log_internal_ds(clone, "clone swap", tx,
+	    "parent=%s", origin_head->ds_dir->dd_myname);
 }
 
 /*
@@ -3335,21 +2465,20 @@
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
-	spa_t *spa;
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
-	if ((error = spa_open(pname, &spa, FTAG)) != 0)
+	error = dsl_pool_hold(pname, FTAG, &dp);
+	if (error != 0)
 		return (error);
-	dp = spa_get_dsl(spa);
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
+
+	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+	if (error == 0) {
 		dsl_dataset_name(ds, buf);
 		dsl_dataset_rele(ds, FTAG);
 	}
-	rw_exit(&dp->dp_config_rwlock);
-	spa_close(spa, FTAG);
+	dsl_pool_rele(dp, FTAG);
 
 	return (error);
 }
@@ -3402,102 +2531,134 @@
 	return (error);
 }
 
+typedef struct dsl_dataset_set_qr_arg {
+	const char *ddsqra_name;
+	zprop_source_t ddsqra_source;
+	uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
+
+
 /* ARGSUSED */
 static int
-dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	int err;
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+	uint64_t newval;
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
 		return (ENOTSUP);
 
-	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-		return (err);
-
-	if (psa->psa_effective_value == 0)
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (dsl_dataset_is_snapshot(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EINVAL);
+	}
+
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	if (newval == 0) {
+		dsl_dataset_rele(ds, FTAG);
 		return (0);
-
-	if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
-	    psa->psa_effective_value < ds->ds_reserved)
+	}
+
+	if (newval < ds->ds_phys->ds_referenced_bytes ||
+	    newval < ds->ds_reserved) {
+		dsl_dataset_rele(ds, FTAG);
 		return (ENOSPC);
-
+	}
+
+	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
-extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
-
-void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value = psa->psa_effective_value;
-
-	dsl_prop_set_sync(ds, psa, tx);
-	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
-
-	if (ds->ds_quota != effective_value) {
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	uint64_t newval;
+
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+	dsl_prop_set_sync_impl(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+	    &ddsqra->ddsqra_value, tx);
+
+	VERIFY0(dsl_prop_get_int_ds(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
+
+	if (ds->ds_quota != newval) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_quota = effective_value;
+		ds->ds_quota = newval;
 	}
+	dsl_dataset_rele(ds, FTAG);
 }
 
 int
-dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+    uint64_t refquota)
 {
-	dsl_dataset_t *ds;
-	dsl_prop_setarg_t psa;
-	int err;
-
-	dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
-
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
-
-	/*
-	 * If someone removes a file, then tries to set the quota, we
-	 * want to make sure the file freeing takes effect.
-	 */
-	txg_wait_open(ds->ds_dir->dd_pool, 0);
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
-	    ds, &psa, 0);
-
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
+	dsl_dataset_set_qr_arg_t ddsqra;
+
+	ddsqra.ddsqra_name = dsname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = refquota;
+
+	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+	    dsl_dataset_set_refquota_sync, &ddsqra, 0));
 }
 
 static int
-dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value;
-	uint64_t unique;
-	int err;
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
-	    SPA_VERSION_REFRESERVATION)
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+	uint64_t newval, unique;
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
 		return (ENOTSUP);
 
-	if (dsl_dataset_is_snapshot(ds))
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (dsl_dataset_is_snapshot(ds)) {
+		dsl_dataset_rele(ds, FTAG);
 		return (EINVAL);
-
-	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-		return (err);
-
-	effective_value = psa->psa_effective_value;
+	}
+
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
-	if (!dmu_tx_is_syncing(tx))
+	if (!dmu_tx_is_syncing(tx)) {
+		dsl_dataset_rele(ds, FTAG);
 		return (0);
+	}
 
 	mutex_enter(&ds->ds_lock);
 	if (!DS_UNIQUE_IS_ACCURATE(ds))
@@ -3505,637 +2666,75 @@
 	unique = ds->ds_phys->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
-	if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
-		uint64_t delta = MAX(unique, effective_value) -
+	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+		uint64_t delta = MAX(unique, newval) -
 		    MAX(unique, ds->ds_reserved);
 
-		if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+		if (delta >
+		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+			dsl_dataset_rele(ds, FTAG);
 			return (ENOSPC);
-		if (ds->ds_quota > 0 &&
-		    effective_value > ds->ds_quota)
-			return (ENOSPC);
+		}
 	}
 
+	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
-static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value = psa->psa_effective_value;
+	uint64_t newval;
 	uint64_t unique;
 	int64_t delta;
 
-	dsl_prop_set_sync(ds, psa, tx);
-	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
+	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+	    source, sizeof (value), 1, &value, tx);
+
+	VERIFY0(dsl_prop_get_int_ds(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 	unique = ds->ds_phys->ds_unique_bytes;
-	delta = MAX(0, (int64_t)(effective_value - unique)) -
+	delta = MAX(0, (int64_t)(newval - unique)) -
 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
-	ds->ds_reserved = effective_value;
+	ds->ds_reserved = newval;
 	mutex_exit(&ds->ds_lock);
 
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 }
 
-int
-dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
-    uint64_t reservation)
-{
-	dsl_dataset_t *ds;
-	dsl_prop_setarg_t psa;
-	int err;
-
-	dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
-	    &reservation);
-
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_set_reservation_check,
-	    dsl_dataset_set_reservation_sync, ds, &psa, 0);
-
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
-}
-
-typedef struct zfs_hold_cleanup_arg {
-	dsl_pool_t *dp;
-	uint64_t dsobj;
-	char htag[MAXNAMELEN];
-} zfs_hold_cleanup_arg_t;
-
 static void
-dsl_dataset_user_release_onexit(void *arg)
-{
-	zfs_hold_cleanup_arg_t *ca = arg;
-
-	(void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
-	    B_TRUE);
-	kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
-}
-
-void
-dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
-    minor_t minor)
-{
-	zfs_hold_cleanup_arg_t *ca;
-
-	ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
-	ca->dp = ds->ds_dir->dd_pool;
-	ca->dsobj = ds->ds_object;
-	(void) strlcpy(ca->htag, htag, sizeof (ca->htag));
-	VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
-	    dsl_dataset_user_release_onexit, ca, NULL));
-}
-
-/*
- * If you add new checks here, you may need to add
- * additional checks to the "temporary" case in
- * snapshot_check() in dmu_objset.c.
- */
-static int
-dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	struct dsl_ds_holdarg *ha = arg2;
-	const char *htag = ha->htag;
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	int error = 0;
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
-		return (ENOTSUP);
-
-	if (!dsl_dataset_is_snapshot(ds))
-		return (EINVAL);
-
-	/* tags must be unique */
-	mutex_enter(&ds->ds_lock);
-	if (ds->ds_phys->ds_userrefs_obj) {
-		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
-		    8, 1, tx);
-		if (error == 0)
-			error = EEXIST;
-		else if (error == ENOENT)
-			error = 0;
-	}
-	mutex_exit(&ds->ds_lock);
-
-	if (error == 0 && ha->temphold &&
-	    strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
-		error = E2BIG;
-
-	return (error);
-}
-
-void
-dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct dsl_ds_holdarg *ha = arg2;
-	const char *htag = ha->htag;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	uint64_t now = gethrestime_sec();
-	uint64_t zapobj;
-
-	mutex_enter(&ds->ds_lock);
-	if (ds->ds_phys->ds_userrefs_obj == 0) {
-		/*
-		 * This is the first user hold for this dataset.  Create
-		 * the userrefs zap object.
-		 */
-		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		zapobj = ds->ds_phys->ds_userrefs_obj =
-		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
-	} else {
-		zapobj = ds->ds_phys->ds_userrefs_obj;
-	}
-	ds->ds_userrefs++;
-	mutex_exit(&ds->ds_lock);
-
-	VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
-
-	if (ha->temphold) {
-		VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
-		    htag, &now, tx));
-	}
-
-	spa_history_log_internal_ds(ds, "hold", tx,
-	    "tag = %s temp = %d holds now = %llu",
-	    htag, (int)ha->temphold, ds->ds_userrefs);
-}
-
-static int
-dsl_dataset_user_hold_one(const char *dsname, void *arg)
-{
-	struct dsl_ds_holdarg *ha = arg;
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
-	int error;
-	char *name;
-
-	/* alloc a buffer to hold dsname@snapname plus terminating NULL */
-	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-	error = dsl_dataset_hold(name, ha->dstg, &ds);
-	strfree(name);
-	if (error == 0) {
-		ha->gotone = B_TRUE;
-		dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
-		    dsl_dataset_user_hold_sync, ds, ha, 0);
-	} else if (error == ENOENT && ha->recursive) {
-		error = 0;
-	} else {
-		(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-	}
-	return (error);
-}
-
-int
-dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
-    boolean_t temphold)
-{
-	struct dsl_ds_holdarg *ha;
-	int error;
-
-	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
-	ha->htag = htag;
-	ha->temphold = temphold;
-	error = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
-	    ds, ha, 0);
-	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-
-	return (error);
+
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+	dsl_dataset_set_refreservation_sync_impl(ds,
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
+	dsl_dataset_rele(ds, FTAG);
 }
 
 int
-dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold, int cleanup_fd)
-{
-	struct dsl_ds_holdarg *ha;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	int error;
-	minor_t minor = 0;
-
-	if (cleanup_fd != -1) {
-		/* Currently we only support cleanup-on-exit of tempholds. */
-		if (!temphold)
-			return (EINVAL);
-		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
-		if (error)
-			return (error);
-	}
-
-	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
-
-	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-
-	error = spa_open(dsname, &spa, FTAG);
-	if (error) {
-		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-		if (cleanup_fd != -1)
-			zfs_onexit_fd_rele(cleanup_fd);
-		return (error);
-	}
-
-	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	ha->htag = htag;
-	ha->snapname = snapname;
-	ha->recursive = recursive;
-	ha->temphold = temphold;
-
-	if (recursive) {
-		error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
-		    ha, DS_FIND_CHILDREN);
-	} else {
-		error = dsl_dataset_user_hold_one(dsname, ha);
-	}
-	if (error == 0)
-		error = dsl_sync_task_group_wait(ha->dstg);
-
-	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
-	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
-		dsl_dataset_t *ds = dst->dst_arg1;
-
-		if (dst->dst_err) {
-			dsl_dataset_name(ds, ha->failed);
-			*strchr(ha->failed, '@') = '\0';
-		} else if (error == 0 && minor != 0 && temphold) {
-			/*
-			 * If this hold is to be released upon process exit,
-			 * register that action now.
-			 */
-			dsl_register_onexit_hold_cleanup(ds, htag, minor);
-		}
-		dsl_dataset_rele(ds, ha->dstg);
-	}
-
-	if (error == 0 && recursive && !ha->gotone)
-		error = ENOENT;
-
-	if (error)
-		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
-
-	dsl_sync_task_group_destroy(ha->dstg);
-
-	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-	spa_close(spa, FTAG);
-	if (cleanup_fd != -1)
-		zfs_onexit_fd_rele(cleanup_fd);
-	return (error);
-}
-
-struct dsl_ds_releasearg {
-	dsl_dataset_t *ds;
-	const char *htag;
-	boolean_t own;		/* do we own or just hold ds? */
-};
-
-static int
-dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
-    boolean_t *might_destroy)
-{
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t zapobj;
-	uint64_t tmp;
-	int error;
-
-	*might_destroy = B_FALSE;
-
-	mutex_enter(&ds->ds_lock);
-	zapobj = ds->ds_phys->ds_userrefs_obj;
-	if (zapobj == 0) {
-		/* The tag can't possibly exist */
-		mutex_exit(&ds->ds_lock);
-		return (ESRCH);
-	}
-
-	/* Make sure the tag exists */
-	error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
-	if (error) {
-		mutex_exit(&ds->ds_lock);
-		if (error == ENOENT)
-			error = ESRCH;
-		return (error);
-	}
-
-	if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
-	    DS_IS_DEFER_DESTROY(ds))
-		*might_destroy = B_TRUE;
-
-	mutex_exit(&ds->ds_lock);
-	return (0);
-}
-
-static int
-dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
-{
-	struct dsl_ds_releasearg *ra = arg1;
-	dsl_dataset_t *ds = ra->ds;
-	boolean_t might_destroy;
-	int error;
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
-		return (ENOTSUP);
-
-	error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
-	if (error)
-		return (error);
-
-	if (might_destroy) {
-		struct dsl_ds_destroyarg dsda = {0};
-
-		if (dmu_tx_is_syncing(tx)) {
-			/*
-			 * If we're not prepared to remove the snapshot,
-			 * we can't allow the release to happen right now.
-			 */
-			if (!ra->own)
-				return (EBUSY);
-		}
-		dsda.ds = ds;
-		dsda.releasing = B_TRUE;
-		return (dsl_dataset_destroy_check(&dsda, tag, tx));
-	}
-
-	return (0);
-}
-
-static void
-dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
-	struct dsl_ds_releasearg *ra = arg1;
-	dsl_dataset_t *ds = ra->ds;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	uint64_t zapobj;
-	uint64_t refs;
-	int error;
-
-	mutex_enter(&ds->ds_lock);
-	ds->ds_userrefs--;
-	refs = ds->ds_userrefs;
-	mutex_exit(&ds->ds_lock);
-	error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
-	VERIFY(error == 0 || error == ENOENT);
-	zapobj = ds->ds_phys->ds_userrefs_obj;
-	VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
-
-	spa_history_log_internal_ds(ds, "release", tx,
-	    "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
-
-	if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
-	    DS_IS_DEFER_DESTROY(ds)) {
-		struct dsl_ds_destroyarg dsda = {0};
-
-		ASSERT(ra->own);
-		dsda.ds = ds;
-		dsda.releasing = B_TRUE;
-		/* We already did the destroy_check */
-		dsl_dataset_destroy_sync(&dsda, tag, tx);
-	}
-}
-
-static int
-dsl_dataset_user_release_one(const char *dsname, void *arg)
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+    uint64_t refreservation)
 {
-	struct dsl_ds_holdarg *ha = arg;
-	struct dsl_ds_releasearg *ra;
-	dsl_dataset_t *ds;
-	int error;
-	void *dtag = ha->dstg;
-	char *name;
-	boolean_t own = B_FALSE;
-	boolean_t might_destroy;
-
-	/* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
-	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-	error = dsl_dataset_hold(name, dtag, &ds);
-	strfree(name);
-	if (error == ENOENT && ha->recursive)
-		return (0);
-	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-	if (error)
-		return (error);
-
-	ha->gotone = B_TRUE;
-
-	ASSERT(dsl_dataset_is_snapshot(ds));
-
-	error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
-	if (error) {
-		dsl_dataset_rele(ds, dtag);
-		return (error);
-	}
-
-	if (might_destroy) {
-#ifdef _KERNEL
-		name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-		error = zfs_unmount_snap(name, NULL);
-		strfree(name);
-		if (error) {
-			dsl_dataset_rele(ds, dtag);
-			return (error);
-		}
-#endif
-		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
-			dsl_dataset_rele(ds, dtag);
-			return (EBUSY);
-		} else {
-			own = B_TRUE;
-			dsl_dataset_make_exclusive(ds, dtag);
-		}
-	}
-
-	ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
-	ra->ds = ds;
-	ra->htag = ha->htag;
-	ra->own = own;
-	dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
-	    dsl_dataset_user_release_sync, ra, dtag, 0);
-
-	return (0);
-}
-
-int
-dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
-    boolean_t recursive)
-{
-	struct dsl_ds_holdarg *ha;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	int error;
-
-top:
-	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
-
-	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-
-	error = spa_open(dsname, &spa, FTAG);
-	if (error) {
-		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-		return (error);
-	}
-
-	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	ha->htag = htag;
-	ha->snapname = snapname;
-	ha->recursive = recursive;
-	if (recursive) {
-		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
-		    ha, DS_FIND_CHILDREN);
-	} else {
-		error = dsl_dataset_user_release_one(dsname, ha);
-	}
-	if (error == 0)
-		error = dsl_sync_task_group_wait(ha->dstg);
-
-	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
-	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
-		struct dsl_ds_releasearg *ra = dst->dst_arg1;
-		dsl_dataset_t *ds = ra->ds;
-
-		if (dst->dst_err)
-			dsl_dataset_name(ds, ha->failed);
-
-		if (ra->own)
-			dsl_dataset_disown(ds, ha->dstg);
-		else
-			dsl_dataset_rele(ds, ha->dstg);
-
-		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
-	}
-
-	if (error == 0 && recursive && !ha->gotone)
-		error = ENOENT;
-
-	if (error && error != EBUSY)
-		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
-
-	dsl_sync_task_group_destroy(ha->dstg);
-	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-	spa_close(spa, FTAG);
-
-	/*
-	 * We can get EBUSY if we were racing with deferred destroy and
-	 * dsl_dataset_user_release_check() hadn't done the necessary
-	 * open context setup.  We can also get EBUSY if we're racing
-	 * with destroy and that thread is the ds_owner.  Either way
-	 * the busy condition should be transient, and we should retry
-	 * the release operation.
-	 */
-	if (error == EBUSY)
-		goto top;
-
-	return (error);
-}
-
-/*
- * Called at spa_load time (with retry == B_FALSE) to release a stale
- * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
- */
-int
-dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
-    boolean_t retry)
-{
-	dsl_dataset_t *ds;
-	char *snap;
-	char *name;
-	int namelen;
-	int error;
-
-	do {
-		rw_enter(&dp->dp_config_rwlock, RW_READER);
-		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
-		rw_exit(&dp->dp_config_rwlock);
-		if (error)
-			return (error);
-		namelen = dsl_dataset_namelen(ds)+1;
-		name = kmem_alloc(namelen, KM_SLEEP);
-		dsl_dataset_name(ds, name);
-		dsl_dataset_rele(ds, FTAG);
-
-		snap = strchr(name, '@');
-		*snap = '\0';
-		++snap;
-		error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
-		kmem_free(name, namelen);
-
-		/*
-		 * The object can't have been destroyed because we have a hold,
-		 * but it might have been renamed, resulting in ENOENT.  Retry
-		 * if we've been requested to do so.
-		 *
-		 * It would be nice if we could use the dsobj all the way
-		 * through and avoid ENOENT entirely.  But we might need to
-		 * unmount the snapshot, and there's currently no way to lookup
-		 * a vfsp using a ZFS object id.
-		 */
-	} while ((error == ENOENT) && retry);
-
-	return (error);
-}
-
-int
-dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
-{
-	dsl_dataset_t *ds;
-	int err;
-
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
-
-	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
-	if (ds->ds_phys->ds_userrefs_obj != 0) {
-		zap_attribute_t *za;
-		zap_cursor_t zc;
-
-		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
-		    ds->ds_phys->ds_userrefs_obj);
-		    zap_cursor_retrieve(&zc, za) == 0;
-		    zap_cursor_advance(&zc)) {
-			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
-			    za->za_first_integer));
-		}
-		zap_cursor_fini(&zc);
-		kmem_free(za, sizeof (zap_attribute_t));
-	}
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
-}
-
-/*
- * Note, this function is used as the callback for dmu_objset_find().  We
- * always return 0 so that we will continue to find and process
- * inconsistent datasets, even if we encounter an error trying to
- * process one of them.
- */
-/* ARGSUSED */
-int
-dsl_destroy_inconsistent(const char *dsname, void *arg)
-{
-	dsl_dataset_t *ds;
-
-	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
-		if (DS_IS_INCONSISTENT(ds))
-			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
-		else
-			dsl_dataset_disown(ds, FTAG);
-	}
-	return (0);
+	dsl_dataset_set_qr_arg_t ddsqra;
+
+	ddsqra.ddsqra_name = dsname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = refreservation;
+
+	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+	    dsl_dataset_set_refreservation_sync, &ddsqra, 0));
 }
 
 /*
@@ -4163,6 +2762,8 @@
 	uint64_t snapobj;
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
+	ASSERT(dsl_pool_config_held(dp));
+
 	*usedp = 0;
 	*usedp += new->ds_phys->ds_referenced_bytes;
 	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
@@ -4175,7 +2776,6 @@
 	*uncompp += new->ds_phys->ds_uncompressed_bytes;
 	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	snapobj = new->ds_object;
 	while (snapobj != oldsnap->ds_object) {
 		dsl_dataset_t *snap;
@@ -4224,7 +2824,6 @@
 		}
 
 	}
-	rw_exit(&dp->dp_config_rwlock);
 	return (err);
 }
 
@@ -4266,7 +2865,6 @@
 
 	*usedp = *compp = *uncompp = 0;
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
 	while (snapobj != firstsnap->ds_object) {
 		dsl_dataset_t *ds;
@@ -4287,6 +2885,42 @@
 		ASSERT3U(snapobj, !=, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
-	rw_exit(&dp->dp_config_rwlock);
 	return (err);
 }
+
+/*
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
+ * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
+ * filesystem.  Or 'earlier' could be the origin's origin.
+ */
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
+{
+	dsl_pool_t *dp = later->ds_dir->dd_pool;
+	int error;
+	boolean_t ret;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	if (earlier->ds_phys->ds_creation_txg >=
+	    later->ds_phys->ds_creation_txg)
+		return (B_FALSE);
+
+	if (later->ds_dir == earlier->ds_dir)
+		return (B_TRUE);
+	if (!dsl_dir_is_clone(later->ds_dir))
+		return (B_FALSE);
+
+	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
+		return (B_TRUE);
+	dsl_dataset_t *origin;
+	error = dsl_dataset_hold_obj(dp,
+	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
+	if (error != 0)
+		return (B_FALSE);
+	ret = dsl_dataset_is_before(origin, earlier);
+	dsl_dataset_rele(origin, FTAG);
+	return (ret);
+}
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c	Thu Feb 28 12:44:05 2013 -0800
@@ -147,28 +147,37 @@
 	return (0);
 }
 
+typedef struct dsl_deleg_arg {
+	const char *dda_name;
+	nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
 static void
-dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	nvlist_t *nvp = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
 	nvpair_t *whopair = NULL;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj;
 
+	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+
+	zapobj = dd->dd_phys->dd_deleg_zapobj;
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
-	while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+	while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
 		const char *whokey = nvpair_name(whopair);
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
 		uint64_t jumpobj;
 
-		VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+		perms = fnvpair_value_nvlist(whopair);
 
 		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
 			jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
@@ -185,21 +194,27 @@
 			    "%s %s", whokey, perm);
 		}
 	}
+	dsl_dir_rele(dd, FTAG);
 }
 
 static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	nvlist_t *nvp = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
 	nvpair_t *whopair = NULL;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj;
 
-	if (zapobj == 0)
+	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+	zapobj = dd->dd_phys->dd_deleg_zapobj;
+	if (zapobj == 0) {
+		dsl_dir_rele(dd, FTAG);
 		return;
+	}
 
-	while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+	while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
 		const char *whokey = nvpair_name(whopair);
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
@@ -234,35 +249,40 @@
 			    "%s %s", whokey, perm);
 		}
 	}
+	dsl_dir_rele(dd, FTAG);
+}
+
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	int error;
+
+	if (spa_version(dmu_tx_pool(tx)->dp_spa) <
+	    SPA_VERSION_DELEGATED_PERMS) {
+		return (ENOTSUP);
+	}
+
+	error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+	if (error == 0)
+		dsl_dir_rele(dd, FTAG);
+	return (error);
 }
 
 int
 dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
 {
-	dsl_dir_t *dd;
-	int error;
-	nvpair_t *whopair = NULL;
-	int blocks_modified = 0;
+	dsl_deleg_arg_t dda;
 
-	error = dsl_dir_open(ddname, FTAG, &dd, NULL);
-	if (error)
-		return (error);
+	/* nvp must already have been verified to be valid */
 
-	if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) <
-	    SPA_VERSION_DELEGATED_PERMS) {
-		dsl_dir_close(dd, FTAG);
-		return (ENOTSUP);
-	}
+	dda.dda_name = ddname;
+	dda.dda_nvlist = nvp;
 
-	while (whopair = nvlist_next_nvpair(nvp, whopair))
-		blocks_modified++;
-
-	error = dsl_sync_task_do(dd->dd_pool, NULL,
+	return (dsl_sync_task(ddname, dsl_deleg_check,
 	    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
-	    dd, nvp, blocks_modified);
-	dsl_dir_close(dd, FTAG);
-
-	return (error);
+	    &dda, fnvlist_num_pairs(nvp)));
 }
 
 /*
@@ -290,16 +310,21 @@
 	int error;
 	objset_t *mos;
 
-	error = dsl_dir_open(ddname, FTAG, &startdd, NULL);
-	if (error)
+	error = dsl_pool_hold(ddname, FTAG, &dp);
+	if (error != 0)
 		return (error);
 
+	error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
 	dp = startdd->dd_pool;
 	mos = dp->dp_meta_objset;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
 		zap_cursor_t basezc;
 		zap_attribute_t baseza;
@@ -307,15 +332,12 @@
 		uint64_t n;
 		char source[MAXNAMELEN];
 
-		if (dd->dd_phys->dd_deleg_zapobj &&
-		    (zap_count(mos, dd->dd_phys->dd_deleg_zapobj,
-		    &n) == 0) && n) {
-			VERIFY(nvlist_alloc(&sp_nvp,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		} else {
+		if (dd->dd_phys->dd_deleg_zapobj == 0 ||
+		    zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 ||
+		    n == 0)
 			continue;
-		}
 
+		sp_nvp = fnvlist_alloc();
 		for (zap_cursor_init(&basezc, mos,
 		    dd->dd_phys->dd_deleg_zapobj);
 		    zap_cursor_retrieve(&basezc, &baseza) == 0;
@@ -327,29 +349,26 @@
 			ASSERT(baseza.za_integer_length == 8);
 			ASSERT(baseza.za_num_integers == 1);
 
-			VERIFY(nvlist_alloc(&perms_nvp,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+			perms_nvp = fnvlist_alloc();
 			for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
-				VERIFY(nvlist_add_boolean(perms_nvp,
-				    za.za_name) == 0);
+				fnvlist_add_boolean(perms_nvp, za.za_name);
 			}
 			zap_cursor_fini(&zc);
-			VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name,
-			    perms_nvp) == 0);
-			nvlist_free(perms_nvp);
+			fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp);
+			fnvlist_free(perms_nvp);
 		}
 
 		zap_cursor_fini(&basezc);
 
 		dsl_dir_name(dd, source);
-		VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0);
+		fnvlist_add_nvlist(*nvp, source, sp_nvp);
 		nvlist_free(sp_nvp);
 	}
-	rw_exit(&dp->dp_config_rwlock);
 
-	dsl_dir_close(startdd, FTAG);
+	dsl_dir_rele(startdd, FTAG);
+	dsl_pool_rele(dp, FTAG);
 	return (0);
 }
 
@@ -555,7 +574,7 @@
 	avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
 	    offsetof(perm_set_t, p_node));
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	ASSERT(dsl_pool_config_held(dp));
 	for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
 	    checkflag = ZFS_DELEG_DESCENDENT) {
 		uint64_t zapobj;
@@ -616,7 +635,6 @@
 	}
 	error = EPERM;
 success:
-	rw_exit(&dp->dp_config_rwlock);
 
 	cookie = NULL;
 	while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -628,15 +646,19 @@
 int
 dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
 {
+	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
-	error = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (error)
+	error = dsl_pool_hold(dsname, FTAG, &dp);
+	if (error != 0)
 		return (error);
-
-	error = dsl_deleg_access_impl(ds, perm, cr);
-	dsl_dataset_rele(ds, FTAG);
+	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+	if (error == 0) {
+		error = dsl_deleg_access_impl(ds, perm, cr);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_pool_rele(dp, FTAG);
 
 	return (error);
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c	Thu Feb 28 12:44:05 2013 -0800
@@ -0,0 +1,926 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+
+typedef struct dmu_snapshots_destroy_arg {
+	nvlist_t *dsda_snaps;
+	nvlist_t *dsda_successful_snaps;
+	boolean_t dsda_defer;
+	nvlist_t *dsda_errlist;
+} dmu_snapshots_destroy_arg_t;
+
+/*
+ * ds must be owned.
+ */
+static int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+	if (!dsl_dataset_is_snapshot(ds))
+		return (EINVAL);
+
+	if (dsl_dataset_long_held(ds))
+		return (EBUSY);
+
+	/*
+	 * Only allow deferred destroy on pools that support it.
+	 * NOTE: deferred destroy is only supported on snapshots.
+	 */
+	if (defer) {
+		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+		    SPA_VERSION_USERREFS)
+			return (ENOTSUP);
+		return (0);
+	}
+
+	/*
+	 * If this snapshot has an elevated user reference count,
+	 * we can't destroy it yet.
+	 */
+	if (ds->ds_userrefs > 0)
+		return (EBUSY);
+
+	/*
+	 * Can't delete a branch point.
+	 */
+	if (ds->ds_phys->ds_num_children > 1)
+		return (EEXIST);
+
+	return (0);
+}
+
+static int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_snapshots_destroy_arg_t *dsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int error = 0;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
+		dsl_dataset_t *ds;
+
+		error = dsl_dataset_hold(dp, nvpair_name(pair),
+		    FTAG, &ds);
+
+		/*
+		 * If the snapshot does not exist, silently ignore it
+		 * (it's "already destroyed").
+		 */
+		if (error == ENOENT)
+			continue;
+
+		if (error == 0) {
+			error = dsl_destroy_snapshot_check_impl(ds,
+			    dsda->dsda_defer);
+			dsl_dataset_rele(ds, FTAG);
+		}
+
+		if (error == 0) {
+			fnvlist_add_boolean(dsda->dsda_successful_snaps,
+			    nvpair_name(pair));
+		} else {
+			fnvlist_add_int32(dsda->dsda_errlist,
+			    nvpair_name(pair), error);
+		}
+	}
+
+	pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
+	if (pair != NULL)
+		return (fnvpair_value_int32(pair));
+	return (0);
+}
+
+struct process_old_arg {
+	dsl_dataset_t *ds;
+	dsl_dataset_t *ds_prev;
+	boolean_t after_branch_point;
+	zio_t *pio;
+	uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct process_old_arg *poa = arg;
+	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+		if (poa->ds_prev && !poa->after_branch_point &&
+		    bp->blk_birth >
+		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
+			poa->ds_prev->ds_phys->ds_unique_bytes +=
+			    bp_get_dsize_sync(dp->dp_spa, bp);
+		}
+	} else {
+		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+		poa->comp += BP_GET_PSIZE(bp);
+		poa->uncomp += BP_GET_UCSIZE(bp);
+		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+	}
+	return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+	struct process_old_arg poa = { 0 };
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t deadlist_obj;
+
+	ASSERT(ds->ds_deadlist.dl_oldfmt);
+	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+	poa.ds = ds;
+	poa.ds_prev = ds_prev;
+	poa.after_branch_point = after_branch_point;
+	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+	    process_old_cb, &poa, tx));
+	VERIFY0(zio_wait(poa.pio));
+	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+	/* change snapused */
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+	    -poa.used, -poa.comp, -poa.uncomp, tx);
+
+	/* swap next's deadlist to our deadlist */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_close(&ds_next->ds_deadlist);
+	deadlist_obj = ds->ds_phys->ds_deadlist_obj;
+	ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
+	ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
+	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+	    ds_next->ds_phys->ds_deadlist_obj);
+}
+
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	/*
+	 * If it is the old version, dd_clones doesn't exist so we can't
+	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+	 * doesn't matter.
+	 */
+	if (ds->ds_dir->dd_phys->dd_clones == 0)
+		return;
+
+	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_dataset_t *clone;
+
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
+		if (clone->ds_dir->dd_origin_txg > mintxg) {
+			dsl_deadlist_remove_key(&clone->ds_deadlist,
+			    mintxg, tx);
+			dsl_dataset_remove_clones_key(clone, mintxg, tx);
+		}
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(&zc);
+}
+
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+	int err;
+	int after_branch_point = FALSE;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_t *ds_prev = NULL;
+	uint64_t obj;
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT(refcount_is_zero(&ds->ds_longholds));
+
+	if (defer &&
+	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
+		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
+		return;
+	}
+
+	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+
+	/* We need to log before removing it from the namespace. */
+	spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+	dsl_scan_ds_destroyed(ds, tx);
+
+	obj = ds->ds_object;
+
+	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+		ASSERT3P(ds->ds_prev, ==, NULL);
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
+		after_branch_point =
+		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+		if (after_branch_point &&
+		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+			if (ds->ds_phys->ds_next_snap_obj != 0) {
+				VERIFY0(zap_add_int(mos,
+				    ds_prev->ds_phys->ds_next_clones_obj,
+				    ds->ds_phys->ds_next_snap_obj, tx));
+			}
+		}
+		if (!after_branch_point) {
+			ds_prev->ds_phys->ds_next_snap_obj =
+			    ds->ds_phys->ds_next_snap_obj;
+		}
+	}
+
+	dsl_dataset_t *ds_next;
+	uint64_t old_unique;
+	uint64_t used = 0, comp = 0, uncomp = 0;
+
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
+	ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+	old_unique = ds_next->ds_phys->ds_unique_bytes;
+
+	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+	ds_next->ds_phys->ds_prev_snap_obj =
+	    ds->ds_phys->ds_prev_snap_obj;
+	ds_next->ds_phys->ds_prev_snap_txg =
+	    ds->ds_phys->ds_prev_snap_txg;
+	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+	    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+	if (ds_next->ds_deadlist.dl_oldfmt) {
+		process_old_deadlist(ds, ds_prev, ds_next,
+		    after_branch_point, tx);
+	} else {
+		/* Adjust prev's unique space. */
+		if (ds_prev && !after_branch_point) {
+			dsl_deadlist_space_range(&ds_next->ds_deadlist,
+			    ds_prev->ds_phys->ds_prev_snap_txg,
+			    ds->ds_phys->ds_prev_snap_txg,
+			    &used, &comp, &uncomp);
+			ds_prev->ds_phys->ds_unique_bytes += used;
+		}
+
+		/* Adjust snapused. */
+		dsl_deadlist_space_range(&ds_next->ds_deadlist,
+		    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+		    &used, &comp, &uncomp);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+		    -used, -comp, -uncomp, tx);
+
+		/* Move blocks to be freed to pool's free list. */
+		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+		    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+		    tx);
+		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+		    DD_USED_HEAD, used, comp, uncomp, tx);
+
+		/* Merge our deadlist into next's and free it. */
+		dsl_deadlist_merge(&ds_next->ds_deadlist,
+		    ds->ds_phys->ds_deadlist_obj, tx);
+	}
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_deadlist_obj = 0;
+
+	/* Collapse range in clone heads */
+	dsl_dataset_remove_clones_key(ds,
+	    ds->ds_phys->ds_creation_txg, tx);
+
+	if (dsl_dataset_is_snapshot(ds_next)) {
+		dsl_dataset_t *ds_nextnext;
+
+		/*
+		 * Update next's unique to include blocks which
+		 * were previously shared by only this snapshot
+		 * and it.  Those blocks will be born after the
+		 * prev snap and before this snap, and will have
+		 * died after the next snap and before the one
+		 * after that (ie. be on the snap after next's
+		 * deadlist).
+		 */
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
+		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+		    ds->ds_phys->ds_prev_snap_txg,
+		    ds->ds_phys->ds_creation_txg,
+		    &used, &comp, &uncomp);
+		ds_next->ds_phys->ds_unique_bytes += used;
+		dsl_dataset_rele(ds_nextnext, FTAG);
+		ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+		/* Collapse range in this head. */
+		dsl_dataset_t *hds;
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+		dsl_deadlist_remove_key(&hds->ds_deadlist,
+		    ds->ds_phys->ds_creation_txg, tx);
+		dsl_dataset_rele(hds, FTAG);
+
+	} else {
+		ASSERT3P(ds_next->ds_prev, ==, ds);
+		dsl_dataset_rele(ds_next->ds_prev, ds_next);
+		ds_next->ds_prev = NULL;
+		if (ds_prev) {
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    ds->ds_phys->ds_prev_snap_obj,
+			    ds_next, &ds_next->ds_prev));
+		}
+
+		dsl_dataset_recalc_head_uniq(ds_next);
+
+		/*
+		 * Reduce the amount of our unconsumed refreservation
+		 * being charged to our parent by the amount of
+		 * new unique data we have gained.
+		 */
+		if (old_unique < ds_next->ds_reserved) {
+			int64_t mrsdelta;
+			uint64_t new_unique =
+			    ds_next->ds_phys->ds_unique_bytes;
+
+			ASSERT(old_unique <= new_unique);
+			mrsdelta = MIN(new_unique - old_unique,
+			    ds_next->ds_reserved - old_unique);
+			dsl_dir_diduse_space(ds->ds_dir,
+			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+		}
+	}
+	dsl_dataset_rele(ds_next, FTAG);
+
+	/*
+	 * This must be done after the dsl_traverse(), because it will
+	 * re-open the objset.
+	 */
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
+	/* remove from snapshot namespace */
+	dsl_dataset_t *ds_head;
+	ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
+	VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+	{
+		uint64_t val;
+
+		err = dsl_dataset_snap_lookup(ds_head,
+		    ds->ds_snapname, &val);
+		ASSERT0(err);
+		ASSERT3U(val, ==, obj);
+	}
+#endif
+	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
+	dsl_dataset_rele(ds_head, FTAG);
+
+	if (ds_prev != NULL)
+		dsl_dataset_rele(ds_prev, FTAG);
+
+	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+	if (ds->ds_phys->ds_next_clones_obj != 0) {
+		uint64_t count;
+		ASSERT0(zap_count(mos,
+		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+		VERIFY0(dmu_object_free(mos,
+		    ds->ds_phys->ds_next_clones_obj, tx));
+	}
+	if (ds->ds_phys->ds_props_obj != 0)
+		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+	if (ds->ds_phys->ds_userrefs_obj != 0)
+		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
+	dsl_dir_rele(ds->ds_dir, ds);
+	ds->ds_dir = NULL;
+	VERIFY0(dmu_object_free(mos, obj, tx));
+}
+
+static void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_snapshots_destroy_arg_t *dsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+
+		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The semantics of this function are described in the comment above
+ * lzc_destroy_snaps().  To summarize:
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Snapshots that don't exist will be silently ignored (considered to be
+ * "already deleted").
+ *
+ * On success, all snaps will be destroyed and this will return 0.
+ * On failure, no snaps will be destroyed, the errlist will be filled in,
+ * and this will return an errno.
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+    nvlist_t *errlist)
+{
+	dmu_snapshots_destroy_arg_t dsda;
+	int error;
+	nvpair_t *pair;
+
+	pair = nvlist_next_nvpair(snaps, NULL);
+	if (pair == NULL)
+		return (0);
+
+	dsda.dsda_snaps = snaps;
+	dsda.dsda_successful_snaps = fnvlist_alloc();
+	dsda.dsda_defer = defer;
+	dsda.dsda_errlist = errlist;
+
+	error = dsl_sync_task(nvpair_name(pair),
+	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
+	    &dsda, 0);
+	fnvlist_free(dsda.dsda_successful_snaps);
+
+	return (error);
+}
+
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+	int error;
+	nvlist_t *nvl = fnvlist_alloc();
+	nvlist_t *errlist = fnvlist_alloc();
+
+	fnvlist_add_boolean(nvl, name);
+	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
+	fnvlist_free(errlist);
+	fnvlist_free(nvl);
+	return (error);
+}
+
+struct killarg {
+	dsl_dataset_t *ds;
+	dmu_tx_t *tx;
+};
+
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	struct killarg *ka = arg;
+	dmu_tx_t *tx = ka->tx;
+
+	if (bp == NULL)
+		return (0);
+
+	if (zb->zb_level == ZB_ZIL_LEVEL) {
+		ASSERT(zilog != NULL);
+		/*
+		 * It's a block in the intent log.  It has no
+		 * accounting, so just free it.
+		 */
+		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
+	} else {
+		ASSERT(zilog == NULL);
+		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+	}
+
+	return (0);
+}
+
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	struct killarg ka;
+
+	/*
+	 * Free everything that we point to (that's born after
+	 * the previous snapshot, if we are a clone)
+	 *
+	 * NB: this should be very quick, because we already
+	 * freed all the objects in open context.
+	 */
+	ka.ds = ds;
+	ka.tx = tx;
+	VERIFY0(traverse_dataset(ds,
+	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+	    kill_blkptr, &ka));
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+}
+
+typedef struct dsl_destroy_head_arg {
+	const char *ddha_name;
+} dsl_destroy_head_arg_t;
+
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+	int error;
+	uint64_t count;
+	objset_t *mos;
+
+	if (dsl_dataset_is_snapshot(ds))
+		return (EINVAL);
+
+	if (refcount_count(&ds->ds_longholds) != expected_holds)
+		return (EBUSY);
+
+	mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+	/*
+	 * Can't delete a head dataset if there are snapshots of it.
+	 * (Except if the only snapshots are from the branch we cloned
+	 * from.)
+	 */
+	if (ds->ds_prev != NULL &&
+	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+		return (EBUSY);
+
+	/*
+	 * Can't delete if there are children of this fs.
+	 */
+	error = zap_count(mos,
+	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+	if (error != 0)
+		return (error);
+	if (count != 0)
+		return (EEXIST);
+
+	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
+	    ds->ds_prev->ds_userrefs == 0) {
+		/* We need to remove the origin snapshot as well. */
+		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
+			return (EBUSY);
+	}
+	return (0);
+}
+
+static int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	error = dsl_destroy_head_check_impl(ds, 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (error);
+}
+
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	dd_used_t t;
+
+	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
+
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+	ASSERT0(dd->dd_phys->dd_head_dataset_obj);
+
+	/*
+	 * Remove our reservation. The impl() routine avoids setting the
+	 * actual property, which would require the (already destroyed) ds.
+	 */
+	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+	ASSERT0(dd->dd_phys->dd_used_bytes);
+	ASSERT0(dd->dd_phys->dd_reserved);
+	for (t = 0; t < DD_USED_NUM; t++)
+		ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
+
+	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
+	VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+	VERIFY0(zap_remove(mos,
+	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+
+	dsl_dir_rele(dd, FTAG);
+	VERIFY0(dmu_object_free(mos, ddobj, tx));
+}
+
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t obj, ddobj, prevobj = 0;
+	boolean_t rmorigin;
+
+	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+	ASSERT(ds->ds_prev == NULL ||
+	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
+	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	/* We need to log before removing it from the namespace. */
+	spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
+	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
+	    ds->ds_prev->ds_userrefs == 0);
+
+	/* Remove our reservation */
+	if (ds->ds_reserved != 0) {
+		dsl_dataset_set_refreservation_sync_impl(ds,
+		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+		    0, tx);
+		ASSERT0(ds->ds_reserved);
+	}
+
+	dsl_scan_ds_destroyed(ds, tx);
+
+	obj = ds->ds_object;
+
+	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+		/* This is a clone */
+		ASSERT(ds->ds_prev != NULL);
+		ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
+		ASSERT0(ds->ds_phys->ds_next_snap_obj);
+
+		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+		if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds->ds_prev,
+			    obj, tx);
+		}
+
+		ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
+		ds->ds_prev->ds_phys->ds_num_children--;
+	}
+
+	zfeature_info_t *async_destroy =
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
+	objset_t *os;
+
+	/*
+	 * Destroy the deadlist.  Unless it's a clone, the
+	 * deadlist should be empty.  (If it's a clone, it's
+	 * safe to ignore the deadlist contents.)
+	 */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_deadlist_obj = 0;
+
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+
+	if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
+		old_synchronous_dataset_destroy(ds, tx);
+	} else {
+		/*
+		 * Move the bptree into the pool's list of trees to
+		 * clean up and update space accounting information.
+		 */
+		uint64_t used, comp, uncomp;
+
+		zil_destroy_sync(dmu_objset_zil(os), tx);
+
+		if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+			spa_feature_incr(dp->dp_spa, async_destroy, tx);
+			dp->dp_bptree_obj = bptree_alloc(mos, tx);
+			VERIFY0(zap_add(mos,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+			    &dp->dp_bptree_obj, tx));
+		}
+
+		used = ds->ds_dir->dd_phys->dd_used_bytes;
+		comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
+		uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+
+		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+		    ds->ds_phys->ds_unique_bytes == used);
+
+		bptree_add(mos, dp->dp_bptree_obj,
+		    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+		    used, comp, uncomp, tx);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+		    -used, -comp, -uncomp, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    used, comp, uncomp, tx);
+	}
+
+	if (ds->ds_prev != NULL) {
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			VERIFY0(zap_remove_int(mos,
+			    ds->ds_prev->ds_dir->dd_phys->dd_clones,
+			    ds->ds_object, tx));
+		}
+		prevobj = ds->ds_prev->ds_object;
+		dsl_dataset_rele(ds->ds_prev, ds);
+		ds->ds_prev = NULL;
+	}
+
+	/*
+	 * This must be done after the dsl_traverse(), because it will
+	 * re-open the objset.
+	 */
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
+	/* Erase the link in the dir */
+	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+	ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+	ddobj = ds->ds_dir->dd_object;
+	ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
+	VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));
+
+	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+	ASSERT0(ds->ds_phys->ds_next_clones_obj);
+	ASSERT0(ds->ds_phys->ds_props_obj);
+	ASSERT0(ds->ds_phys->ds_userrefs_obj);
+	dsl_dir_rele(ds->ds_dir, ds);
+	ds->ds_dir = NULL;
+	VERIFY0(dmu_object_free(mos, obj, tx));
+
+	dsl_dir_destroy_sync(ddobj, tx);
+
+	if (rmorigin) {
+		dsl_dataset_t *prev;
+		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
+		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+		dsl_dataset_rele(prev, FTAG);
+	}
+}
+
+static void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+	dsl_destroy_head_sync_impl(ds, tx);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+
+	/* Mark it as inconsistent on-disk, in case we crash */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_destroy_head(const char *name)
+{
+	dsl_destroy_head_arg_t ddha;
+	int error;
+	spa_t *spa;
+	boolean_t isenabled;
+
+#ifdef _KERNEL
+	zfs_destroy_unmount_origin(name);
+#endif
+
+	error = spa_open(name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	isenabled = spa_feature_is_enabled(spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+	spa_close(spa, FTAG);
+
+	ddha.ddha_name = name;
+
+	if (!isenabled) {
+		objset_t *os;
+
+		error = dsl_sync_task(name, dsl_destroy_head_check,
+		    dsl_destroy_head_begin_sync, &ddha, 0);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * Head deletion is processed in one txg on old pools;
+		 * remove the objects from open context so that the txg sync
+		 * is not too long.
+		 */
+		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
+		if (error == 0) {
+			uint64_t prev_snap_txg =
+			    dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
+			for (uint64_t obj = 0; error == 0;
+			    error = dmu_object_next(os, &obj, FALSE,
+			    prev_snap_txg))
+				(void) dmu_free_object(os, obj);
+			/* sync out all frees */
+			txg_wait_synced(dmu_objset_pool(os), 0);
+			dmu_objset_disown(os, FTAG);
+		}
+	}
+
+	return (dsl_sync_task(name, dsl_destroy_head_check,
+	    dsl_destroy_head_sync, &ddha, 0));
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find().  We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+	objset_t *os;
+
+	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
+		boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+		dmu_objset_rele(os, FTAG);
+		if (inconsistent)
+			(void) dsl_destroy_head(dsname);
+	}
+	return (0);
+}
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Wed Feb 27 21:13:42 2013 +0100
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Feb 28 12:44:05 2013 -0800
@@ -40,8 +40,6 @@
 #include "zfs_namecheck.h"
 
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
-    uint64_t value, dmu_tx_t *tx);
 
 /* ARGSUSED */
 static void
@@ -58,7 +56,7 @@
 	}
 
 	if (dd->dd_parent)
-		dsl_dir_close(dd->dd_parent, dd);
+		dsl_dir_rele(dd->dd_parent, dd);
 
 	spa_close(dd->dd_pool->dp_spa, dd);
 
@@ -72,18 +70,17 @@
 }
 
 int
-dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **ddp)
 {
 	dmu_buf_t *dbuf;
 	dsl_dir_t *dd;
 	int err;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-	    dsl_pool_sync_context(dp));
+	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
-	if (err)
+	if (err != 0)
 		return (err);
 	dd = dmu_buf_get_user(dbuf);
 #ifdef ZFS_DEBUG
@@ -110,9 +107,9 @@
 		dsl_dir_snap_cmtime_update(dd);
 
 		if (dd->dd_phys->dd_parent_obj) {
-			err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+			err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
 			    NULL, dd, &dd->dd_parent);
-			if (err)
+			if (err != 0)
 				goto errout;
 			if (tail) {
 #ifdef ZFS_DEBUG
@@ -129,7 +126,7 @@
 				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 				    ddobj, 0, dd->dd_myname);
 			}
-			if (err)
+			if (err != 0)
 				goto errout;
 		} else {
 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
@@ -146,7 +143,7 @@
 			 */
 			err = dmu_bonus_hold(dp->dp_meta_objset,
 			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
-			if (err)
+			if (err != 0)
 				goto errout;
 			origin_phys = origin_bonus->db_data;
 			dd->dd_origin_txg =
@@ -158,7 +155,7 @@
 		    dsl_dir_evict);
 		if (winner) {
 			if (dd->dd_parent)
-				dsl_dir_close(dd->dd_parent, dd);
+				dsl_dir_rele(dd->dd_parent, dd);
 			mutex_destroy(&dd->dd_lock);
 			kmem_free(dd, sizeof (dsl_dir_t));
 			dd = winner;
@@ -185,7 +182,7 @@
 
 errout:
 	if (dd->dd_parent)
-		dsl_dir_close(dd->dd_parent, dd);
+		dsl_dir_rele(dd->dd_parent, dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 	dmu_buf_rele(dbuf, tag);
@@ -193,7 +190,7 @@
 }
 
 void
-dsl_dir_close(dsl_dir_t *dd, void *tag)
+dsl_dir_rele(dsl_dir_t *dd, void *tag)
 {
 	dprintf_dd(dd, "%s\n", "");
 	spa_close(dd->dd_pool->dp_spa, tag);
@@ -250,6 +247,7 @@
 getcomponent(const char *path, char *component, const char **nextp)
 {
 	char *p;
+
 	if ((path == NULL) || (path[0] == '\0'))
 		return (ENOENT);
 	/* This would be a good place to reserve some namespace... */
@@ -272,10 +270,10 @@
 		(void) strcpy(component, path);
 		p = NULL;
 	} else if (p[0] == '/') {
-		if (p-path >= MAXNAMELEN)
+		if (p - path >= MAXNAMELEN)
 			return (ENAMETOOLONG);
 		(void) strncpy(component, path, p - path);
-		component[p-path] = '\0';
+		component[p - path] = '\0';
 		p++;
 	} else if (p[0] == '@') {
 		/*
@@ -284,65 +282,54 @@
 		 */
 		if (strchr(path, '/'))
 			return (EINVAL);
-		if (p-path >= MAXNAMELEN)
+		if (p - path >= MAXNAMELEN)
 			return (ENAMETOOLONG);
 		(void) strncpy(component, path, p - path);
-		component[p-path] = '\0';
+		component[p - path] = '\0';
 	} else {
-		ASSERT(!"invalid p");
+		panic("invalid p=%p", (void *)p);
 	}
 	*nextp = p;
 	return (0);
 }
 
 /*
- * same as dsl_open_dir, ignore the first component of name and use the
- * spa instead
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tail.  The name must be in the specified dsl_pool_t.  This
+ * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
+ * path is bogus, or if tail==NULL and we couldn't parse the whole name.
+ * (*tail)[0] == '@' means that the last component is a snapshot.
  */
 int
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **ddp, const char **tailp)
 {
 	char buf[MAXNAMELEN];
-	const char *next, *nextnext = NULL;
+	const char *spaname, *next, *nextnext = NULL;
 	int err;