diff usr/src/uts/common/fs/zfs/dsl_dataset.c @ 5367:c40abbe796be

PSARC/2007/574 zfs send -R 6358519 'zfs restore' can't restore full backup into topmost filesystem 6421958 want recursive zfs send ('zfs send -r') 6465969 zfs receive error message could be a little more friendly 6482331 assertion failed: ra.err == 0 (0x10 == 0x0) 6577548 nvlist_next_nvpair() can not iterate recursively 6579048 zfs send -i "" fs@snap can succeed 6580447 "zfs list -t filesystem" slowly iterates over all snapshots 6581508 zfs issues confusing error message when doing an incremental send 6585612 'zfs recv -d' cannot receive the top-level filesystem backups 6589317 create-time permissions not granted on filesystems created by "zfs recv" 6596160 zfs create -p -b 1092 <filesystem> should fail. 6619393 help message for ::dbufs is slightly wrong 6620906 zfs_rename() gives incorrect error message 6621295 dsl_deleg_set_sync() should be broken up
author ahrens
date Mon, 29 Oct 2007 17:12:17 -0700
parents 6752aa2bd5bc
children 111aa1baa84a
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Oct 29 16:16:37 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Oct 29 17:12:17 2007 -0700
@@ -45,8 +45,6 @@
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
 static dsl_checkfunc_t dsl_dataset_rollback_check;
 static dsl_syncfunc_t dsl_dataset_rollback_sync;
-static dsl_checkfunc_t dsl_dataset_destroy_check;
-static dsl_syncfunc_t dsl_dataset_destroy_sync;
 
 #define	DS_REF_MAX	(1ULL << 62)
 
@@ -533,6 +531,39 @@
 }
 
 void
+dsl_dataset_downgrade(dsl_dataset_t *ds, int oldmode, int newmode)
+{
+	uint64_t oldweight = ds_refcnt_weight[DS_MODE_LEVEL(oldmode)];
+	uint64_t newweight = ds_refcnt_weight[DS_MODE_LEVEL(newmode)];
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, oldweight);
+	ASSERT3U(oldweight, >=, newweight);
+	ds->ds_open_refcount -= oldweight;
+	ds->ds_open_refcount += newweight;
+	mutex_exit(&ds->ds_lock);
+}
+
+boolean_t
+dsl_dataset_tryupgrade(dsl_dataset_t *ds, int oldmode, int newmode)
+{
+	boolean_t rv;
+	uint64_t oldweight = ds_refcnt_weight[DS_MODE_LEVEL(oldmode)];
+	uint64_t newweight = ds_refcnt_weight[DS_MODE_LEVEL(newmode)];
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, oldweight);
+	ASSERT3U(newweight, >=, oldweight);
+	if (ds->ds_open_refcount - oldweight + newweight > DS_REF_MAX) {
+		rv = B_FALSE;
+	} else {
+		ds->ds_open_refcount -= oldweight;
+		ds->ds_open_refcount += newweight;
+		rv = B_TRUE;
+	}
+	mutex_exit(&ds->ds_lock);
+	return (rv);
+}
+
+void
 dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
@@ -574,24 +605,18 @@
 }
 
 uint64_t
-dsl_dataset_create_sync(dsl_dir_t *pdd,
-    const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+dsl_dataset_create_sync_impl(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_tx_t *tx)
 {
-	dsl_pool_t *dp = pdd->dd_pool;
+	dsl_pool_t *dp = dd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
-	uint64_t dsobj, ddobj;
+	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
-	dsl_dir_t *dd;
 
-	ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
-	ASSERT(clone_parent == NULL ||
-	    clone_parent->ds_phys->ds_num_children > 0);
-	ASSERT(lastname[0] != '@');
+	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
-
-	ddobj = dsl_dir_create_sync(pdd, lastname, tx);
-	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
+	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
@@ -608,28 +633,49 @@
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-	if (clone_parent) {
-		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
+	if (origin) {
+		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
-		    clone_parent->ds_phys->ds_creation_txg;
+		    origin->ds_phys->ds_creation_txg;
 		dsphys->ds_used_bytes =
-		    clone_parent->ds_phys->ds_used_bytes;
+		    origin->ds_phys->ds_used_bytes;
 		dsphys->ds_compressed_bytes =
-		    clone_parent->ds_phys->ds_compressed_bytes;
+		    origin->ds_phys->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
-		    clone_parent->ds_phys->ds_uncompressed_bytes;
-		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+		    origin->ds_phys->ds_uncompressed_bytes;
+		dsphys->ds_bp = origin->ds_phys->ds_bp;
 
-		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
-		clone_parent->ds_phys->ds_num_children++;
+		dmu_buf_will_dirty(origin->ds_dbuf, tx);
+		origin->ds_phys->ds_num_children++;
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
+		dd->dd_phys->dd_origin_obj = origin->ds_object;
 	}
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_head_dataset_obj = dsobj;
+
+	return (dsobj);
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd,
+    const char *lastname, dsl_dataset_t *origin, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = pdd->dd_pool;
+	uint64_t dsobj, ddobj;
+	dsl_dir_t *dd;
+
+	ASSERT(lastname[0] != '@');
+
+	ddobj = dsl_dir_create_sync(pdd, lastname, tx);
+	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
+
+	dsobj = dsl_dataset_create_sync_impl(dd, origin, tx);
+
+	dsl_deleg_set_create_perms(dd, tx, cr);
+
 	dsl_dir_close(dd, FTAG);
 
 	return (dsobj);
@@ -713,36 +759,36 @@
 	return (err);
 }
 
+/*
+ * ds must be opened EXCLUSIVE or PRIMARY.  on return (whether
+ * successful or not), ds will be closed and caller can no longer
+ * dereference it.
+ */
 int
-dsl_dataset_destroy(const char *name)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
 {
 	int err;
 	dsl_sync_task_group_t *dstg;
 	objset_t *os;
-	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	uint64_t obj;
 
-	if (strchr(name, '@')) {
+	if (ds->ds_open_refcount != DS_REF_MAX) {
+		if (dsl_dataset_tryupgrade(ds, DS_MODE_PRIMARY,
+		    DS_MODE_EXCLUSIVE) == 0) {
+			dsl_dataset_close(ds, DS_MODE_PRIMARY, tag);
+			return (EBUSY);
+		}
+	}
+
+	if (dsl_dataset_is_snapshot(ds)) {
 		/* Destroying a snapshot is simpler */
-		err = dsl_dataset_open(name,
-		    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-		    FTAG, &ds);
-		if (err)
-			return (err);
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
-		    ds, FTAG, 0);
-		if (err)
-			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-		return (err);
+		    ds, tag, 0);
+		goto out;
 	}
 
-	err = dmu_objset_open(name, DMU_OST_ANY,
-	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
-	if (err)
-		return (err);
-	ds = os->os->os_dsl_dataset;
 	dd = ds->ds_dir;
 
 	/*
@@ -751,10 +797,12 @@
 	 */
 	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
 	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
-	if (err) {
-		dmu_objset_close(os);
-		return (err);
-	}
+	if (err)
+		goto out;
+
+	err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+	if (err)
+		goto out;
 
 	/*
 	 * remove the objects in open context, so that we won't
@@ -783,45 +831,47 @@
 
 	dmu_objset_close(os);
 	if (err != ESRCH)
-		return (err);
+		goto out;
 
-	err = dsl_dataset_open(name,
-	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-	    FTAG, &ds);
+	if (ds->ds_user_ptr) {
+		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+		ds->ds_user_ptr = NULL;
+	}
+
+	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
+	rw_exit(&dd->dd_pool->dp_config_rwlock);
+
 	if (err)
-		return (err);
-
-	err = dsl_dir_open(name, FTAG, &dd, NULL);
-	if (err) {
-		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-		return (err);
-	}
+		goto out;
 
 	/*
 	 * Blow away the dsl_dir + head dataset.
 	 */
 	dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
 	dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-	    dsl_dataset_destroy_sync, ds, FTAG, 0);
+	    dsl_dataset_destroy_sync, ds, tag, 0);
 	dsl_sync_task_create(dstg, dsl_dir_destroy_check,
 	    dsl_dir_destroy_sync, dd, FTAG, 0);
 	err = dsl_sync_task_group_wait(dstg);
 	dsl_sync_task_group_destroy(dstg);
 	/* if it is successful, *destroy_sync will close the ds+dd */
-	if (err) {
-		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	if (err)
 		dsl_dir_close(dd, FTAG);
-	}
+out:
+	if (err)
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
 	return (err);
 }
 
 int
-dsl_dataset_rollback(dsl_dataset_t *ds)
+dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
 {
 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+
 	return (dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
-	    ds, NULL, 0));
+	    ds, &ost, 0));
 }
 
 void *
@@ -927,14 +977,12 @@
 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
+	dmu_objset_type_t *ost = arg2;
 
 	/*
-	 * There must be a previous snapshot.  I suppose we could roll
-	 * it back to being empty (and re-initialize the upper (ZPL)
-	 * layer).  But for now there's no way to do this via the user
-	 * interface.
+	 * We can only roll back to emptyness if it is a ZPL objset.
 	 */
-	if (ds->ds_phys->ds_prev_snap_txg == 0)
+	if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
 		return (EINVAL);
 
 	/*
@@ -958,17 +1006,29 @@
 dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
+	dmu_objset_type_t *ost = arg2;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	/*
 	 * Before the roll back destroy the zil.
-	 * Note, ds_user_ptr can be null if we are doing a "zfs receive -F"
 	 */
 	if (ds->ds_user_ptr != NULL) {
 		zil_rollback_destroy(
 		    ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
+
+		/*
+		 * We need to make sure that the objset_impl_t is reopened after
+		 * we do the rollback, otherwise it will have the wrong
+		 * objset_phys_t.  Normally this would happen when this
+		 * DS_MODE_EXCLUSIVE dataset-open is closed, thus causing the
+		 * dataset to be immediately evicted.  But when doing "zfs recv
+		 * -F", we reopen the objset before that, so that there is no
+		 * window where the dataset is closed and inconsistent.
+		 */
+		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+		ds->ds_user_ptr = NULL;
 	}
 
 	/* Zero out the deadlist. */
@@ -1000,20 +1060,34 @@
 		    -used, -compressed, -uncompressed, tx);
 	}
 
-	/* Change our contents to that of the prev snapshot */
-	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
-	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
-	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
-	ds->ds_phys->ds_compressed_bytes =
-	    ds->ds_prev->ds_phys->ds_compressed_bytes;
-	ds->ds_phys->ds_uncompressed_bytes =
-	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
-	ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
-	ds->ds_phys->ds_unique_bytes = 0;
+	if (ds->ds_prev) {
+		/* Change our contents to that of the prev snapshot */
+		ASSERT3U(ds->ds_prev->ds_object, ==,
+		    ds->ds_phys->ds_prev_snap_obj);
+		ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+		ds->ds_phys->ds_used_bytes =
+		    ds->ds_prev->ds_phys->ds_used_bytes;
+		ds->ds_phys->ds_compressed_bytes =
+		    ds->ds_prev->ds_phys->ds_compressed_bytes;
+		ds->ds_phys->ds_uncompressed_bytes =
+		    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+		ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
+		ds->ds_phys->ds_unique_bytes = 0;
 
-	if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
-		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-		ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+			ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+		}
+	} else {
+		/* Zero out our contents, recreate objset */
+		bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
+		ds->ds_phys->ds_used_bytes = 0;
+		ds->ds_phys->ds_compressed_bytes = 0;
+		ds->ds_phys->ds_uncompressed_bytes = 0;
+		ds->ds_phys->ds_flags = 0;
+		ds->ds_phys->ds_unique_bytes = 0;
+		(void) dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
+		    &ds->ds_phys->ds_bp, *ost, tx);
 	}
 
 	spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
@@ -1025,6 +1099,9 @@
 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t count;
+	int err;
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
@@ -1035,6 +1112,17 @@
 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EINVAL);
 
+	/*
+	 * This is really a dsl_dir thing, but check it here so that
+	 * we'll be less likely to leave this dataset inconsistent &
+	 * nearly destroyed.
+	 */
+	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+	if (err)
+		return (err);
+	if (count != 0)
+		return (EEXIST);
+
 	return (0);
 }
 
@@ -1054,7 +1142,7 @@
 }
 
 /* ARGSUSED */
-static int
+int
 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
@@ -1083,7 +1171,7 @@
 	return (0);
 }
 
-static void
+void
 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
@@ -1337,8 +1425,7 @@
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	objset_t *os = arg1;
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	dsl_dataset_t *ds = arg1;
 	const char *snapname = arg2;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	int err;
@@ -1375,8 +1462,7 @@
 void
 dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
-	objset_t *os = arg1;
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	dsl_dataset_t *ds = arg1;
 	const char *snapname = arg2;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
@@ -1499,20 +1585,21 @@
 {
 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
+	stat->dds_guid = ds->ds_phys->ds_guid;
 	if (ds->ds_phys->ds_next_snap_obj) {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
 	}
 
 	/* clone origin is really a dsl_dir thing... */
-	if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
+	if (ds->ds_dir->dd_phys->dd_origin_obj) {
 		dsl_dataset_t *ods;
 
 		rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
 		VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
-		    ds->ds_dir->dd_phys->dd_clone_parent_obj,
+		    ds->ds_dir->dd_phys->dd_origin_obj,
 		    NULL, DS_MODE_NONE, FTAG, &ods));
-		dsl_dataset_name(ods, stat->dds_clone_of);
+		dsl_dataset_name(ods, stat->dds_origin);
 		dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
 		rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 	}
@@ -1808,9 +1895,9 @@
 	struct promotearg *pa = arg2;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
-	dsl_dir_t *pdd = NULL;
+	dsl_dir_t *odd = NULL;
 	dsl_dataset_t *ds = NULL;
-	dsl_dataset_t *pivot_ds = NULL;
+	dsl_dataset_t *origin_ds = NULL;
 	dsl_dataset_t *newnext_ds = NULL;
 	int err;
 	char *name = NULL;
@@ -1820,23 +1907,22 @@
 	bzero(pa, sizeof (*pa));
 
 	/* Check that it is a clone */
-	if (dd->dd_phys->dd_clone_parent_obj == 0)
+	if (dd->dd_phys->dd_origin_obj == 0)
 		return (EINVAL);
 
 	/* Since this is so expensive, don't do the preliminary check */
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
-	if (err = dsl_dataset_open_obj(dp,
-	    dd->dd_phys->dd_clone_parent_obj,
-	    NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
+	if (err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_origin_obj,
+	    NULL, DS_MODE_EXCLUSIVE, FTAG, &origin_ds))
 		goto out;
-	pdd = pivot_ds->ds_dir;
+	odd = origin_ds->ds_dir;
 
 	{
 		dsl_dataset_t *phds;
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
-		    pdd->dd_phys->dd_head_dataset_obj,
+		    odd->dd_phys->dd_head_dataset_obj,
 		    NULL, DS_MODE_NONE, FTAG, &phds))
 			goto out;
 		pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
@@ -1848,10 +1934,10 @@
 		goto out;
 	}
 
-	/* find pivot point's new next ds */
+	/* find origin's new next ds */
 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
 	    NULL, DS_MODE_NONE, FTAG, &newnext_ds));
-	while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
+	while (newnext_ds->ds_phys->ds_prev_snap_obj != origin_ds->ds_object) {
 		dsl_dataset_t *prev;
 
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
@@ -1863,10 +1949,10 @@
 	}
 	pa->newnext_obj = newnext_ds->ds_object;
 
-	/* compute pivot point's new unique space */
+	/* compute origin's new unique space */
 	while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
 	    &itor, &bp)) == 0) {
-		if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
+		if (bp.blk_birth > origin_ds->ds_phys->ds_prev_snap_txg)
 			pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
 	}
 	if (err != ENOENT)
@@ -1874,7 +1960,7 @@
 
 	/* Walk the snapshots that we are moving */
 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-	ds = pivot_ds;
+	ds = origin_ds;
 	/* CONSTCOND */
 	while (TRUE) {
 		uint64_t val, dlused, dlcomp, dluncomp;
@@ -1922,19 +2008,19 @@
 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
 			break;
 		}
-		if (ds != pivot_ds)
+		if (ds != origin_ds)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		ds = prev;
 	}
 
 	/* Check that there is enough space here */
-	err = dsl_dir_transfer_possible(pdd, dd, pa->used);
+	err = dsl_dir_transfer_possible(odd, dd, pa->used);
 
 out:
-	if (ds && ds != pivot_ds)
+	if (ds && ds != origin_ds)
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-	if (pivot_ds)
-		dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
+	if (origin_ds)
+		dsl_dataset_close(origin_ds, DS_MODE_EXCLUSIVE, FTAG);
 	if (newnext_ds)
 		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
 	if (name)
@@ -1949,26 +2035,25 @@
 	struct promotearg *pa = arg2;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
-	dsl_dir_t *pdd = NULL;
-	dsl_dataset_t *ds, *pivot_ds;
+	dsl_dir_t *odd = NULL;
+	dsl_dataset_t *ds, *origin_ds;
 	char *name;
 
-	ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
+	ASSERT(dd->dd_phys->dd_origin_obj != 0);
 	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
 
-	VERIFY(0 == dsl_dataset_open_obj(dp,
-	    dd->dd_phys->dd_clone_parent_obj,
-	    NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
+	VERIFY(0 == dsl_dataset_open_obj(dp, dd->dd_phys->dd_origin_obj,
+	    NULL, DS_MODE_EXCLUSIVE, FTAG, &origin_ds));
 	/*
-	 * We need to explicitly open pdd, since pivot_ds's pdd will be
+	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
-	VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
-	    NULL, FTAG, &pdd));
+	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
+	    NULL, FTAG, &odd));
 
 	/* move snapshots to this dir */
 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-	ds = pivot_ds;
+	ds = origin_ds;
 	/* CONSTCOND */
 	while (TRUE) {
 		dsl_dataset_t *prev;
@@ -1983,9 +2068,9 @@
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
+		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
 		ds->ds_phys->ds_dir_obj = dd->dd_object;
-		ASSERT3P(ds->ds_dir, ==, pdd);
+		ASSERT3P(ds->ds_dir, ==, odd);
 		dsl_dir_close(ds->ds_dir, ds);
 		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
@@ -2003,35 +2088,35 @@
 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
 			break;
 		}
-		if (ds != pivot_ds)
+		if (ds != origin_ds)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		ds = prev;
 	}
-	if (ds != pivot_ds)
+	if (ds != origin_ds)
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 
-	/* change pivot point's next snap */
-	dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
-	pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
+	/* change origin's next snap */
+	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+	origin_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
 
-	/* change clone_parent-age */
+	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
-	dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
-	dmu_buf_will_dirty(pdd->dd_dbuf, tx);
-	pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
+	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
+	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+	dmu_buf_will_dirty(odd->dd_dbuf, tx);
+	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
 
 	/* change space accounting */
-	dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
+	dsl_dir_diduse_space(odd, -pa->used, -pa->comp, -pa->uncomp, tx);
 	dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
-	pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
+	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
 
 	/* log history record */
 	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
 	    cr, "dataset = %llu", ds->ds_object);
 
-	dsl_dir_close(pdd, FTAG);
-	dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
+	dsl_dir_close(odd, FTAG);
+	dsl_dataset_close(origin_ds, DS_MODE_EXCLUSIVE, FTAG);
 	kmem_free(name, MAXPATHLEN);
 }
 
@@ -2066,122 +2151,85 @@
 	return (err);
 }
 
-#define	SWITCH64(x, y) \
-	{ \
-		uint64_t __tmp = (x); \
-		(x) = (y); \
-		(y) = __tmp; \
-	}
+struct cloneswaparg {
+	dsl_dataset_t *cds; /* clone dataset */
+	dsl_dataset_t *ohds; /* origin's head dataset */
+	boolean_t force;
+};
 
 /* ARGSUSED */
 static int
 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	dsl_dataset_t *cds = arg1;	/* clone to become new head */
-	boolean_t *forcep = arg2;
-	dsl_dir_t *cdd = cds->ds_dir;
-	dsl_pool_t *dp = cds->ds_dir->dd_pool;
-	dsl_dataset_t *ods;	/* the snapshot cds is cloned off of */
-	dsl_dataset_t *ohds = NULL;
-	dsl_dir_t *odd;
-	int err;
+	struct cloneswaparg *csa = arg1;
 
-	/* check that it is a clone */
-	if (cdd->dd_phys->dd_clone_parent_obj == 0)
+	/* they should both be heads */
+	if (dsl_dataset_is_snapshot(csa->cds) ||
+	    dsl_dataset_is_snapshot(csa->ohds))
 		return (EINVAL);
 
-	/* check that cds is not a snapshot */
-	if (dsl_dataset_is_snapshot(cds))
+	/* the branch point should be just before them */
+	if (csa->cds->ds_prev != csa->ohds->ds_prev)
 		return (EINVAL);
 
-	/* open the origin */
-	if (err = dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj,
-	    NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods))
-		return (err);
-	odd = ods->ds_dir;
-
-	/* make sure the clone is descendant of origin */
-	if (cdd->dd_parent != odd) {
-		err = EINVAL;
-		goto out;
-	}
+	/* cds should be the clone */
+	if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
+	    csa->ohds->ds_object)
+		return (EINVAL);
 
-	/* check that there are no snapshots after the origin */
-	if (cds->ds_phys->ds_prev_snap_obj != ods->ds_object ||
-	    ods->ds_phys->ds_next_snap_obj !=
-	    odd->dd_phys->dd_head_dataset_obj) {
-		err = EINVAL;
-		goto out;
-	}
+	/* the clone should be a child of the origin */
+	if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
+		return (EINVAL);
 
-	/*
-	 * Verify origin head dataset hasn't been modified or
-	 * 'force' has been passed down.
-	 */
-	if (!(*forcep) &&
-	    (err = dsl_dataset_open_obj(cdd->dd_pool,
-	    odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE,
-	    FTAG, &ohds)) == 0) {
-		if (dsl_dataset_modified_since_lastsnap(ohds))
-			err = ETXTBSY;
-		dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG);
-	}
-out:
-	dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
-	return (err);
+	/* ohds shouldn't be modified unless 'force' */
+	if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
+		return (ETXTBSY);
+	return (0);
 }
 
 /* ARGSUSED */
 static void
 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
-	dsl_dataset_t *cds = arg1;	/* clone to become new head */
-	dsl_dir_t *cdd = cds->ds_dir;
-	dsl_pool_t *dp = cds->ds_dir->dd_pool;
-	dsl_dataset_t *ods, *ohds;
-	dsl_dir_t *odd;
+	struct cloneswaparg *csa = arg1;
+	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
 	uint64_t itor = 0;
 	blkptr_t bp;
 	uint64_t unique = 0;
 	int err;
 
-	ASSERT(cdd->dd_phys->dd_clone_parent_obj != 0);
-	ASSERT(dsl_dataset_is_snapshot(cds) == 0);
+	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
+	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
+	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
 
-	/* open the origin */
-	VERIFY(0 == dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj,
-	    NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods));
-	odd = ods->ds_dir;
-	ASSERT(cds->ds_phys->ds_prev_snap_obj == ods->ds_object);
-	ASSERT(ods->ds_phys->ds_next_snap_obj ==
-	    odd->dd_phys->dd_head_dataset_obj);
+	if (csa->cds->ds_user_ptr != NULL) {
+		csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
+		csa->cds->ds_user_ptr = NULL;
+	}
 
-	/* open the origin head */
-	VERIFY(0 == dsl_dataset_open_obj(cdd->dd_pool,
-	    odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE,
-	    FTAG, &ohds));
-	ASSERT(odd == ohds->ds_dir);
-
-	dmu_buf_will_dirty(cds->ds_dbuf, tx);
-	dmu_buf_will_dirty(ohds->ds_dbuf, tx);
-	dmu_buf_will_dirty(ods->ds_dbuf, tx);
+	if (csa->ohds->ds_user_ptr != NULL) {
+		csa->ohds->ds_user_evict_func(csa->ohds,
+		    csa->ohds->ds_user_ptr);
+		csa->ohds->ds_user_ptr = NULL;
+	}
 
 	/* compute unique space */
-	while ((err = bplist_iterate(&cds->ds_deadlist, &itor, &bp)) == 0) {
-		if (bp.blk_birth > ods->ds_phys->ds_prev_snap_txg)
-			unique += bp_get_dasize(cdd->dd_pool->dp_spa, &bp);
+	while ((err = bplist_iterate(&csa->cds->ds_deadlist,
+	    &itor, &bp)) == 0) {
+		if (bp.blk_birth > csa->cds->ds_prev->ds_phys->ds_prev_snap_txg)
+			unique += bp_get_dasize(dp->dp_spa, &bp);
 	}
 	VERIFY(err == ENOENT);
 
 	/* reset origin's unique bytes */
-	ods->ds_phys->ds_unique_bytes = unique;
+	csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique;
 
 	/* swap blkptrs */
 	{
 		blkptr_t tmp;
-		tmp = ohds->ds_phys->ds_bp;
-		ohds->ds_phys->ds_bp = cds->ds_phys->ds_bp;
-		cds->ds_phys->ds_bp = tmp;
+		tmp = csa->ohds->ds_phys->ds_bp;
+		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
+		csa->cds->ds_phys->ds_bp = tmp;
 	}
 
 	/* set dd_*_bytes */
@@ -2190,60 +2238,68 @@
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
-		VERIFY(0 == bplist_space(&cds->ds_deadlist, &cdl_used,
+		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
 		    &cdl_comp, &cdl_uncomp));
-		VERIFY(0 == bplist_space(&ohds->ds_deadlist, &odl_used,
+		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
 		    &odl_comp, &odl_uncomp));
-		dused = cds->ds_phys->ds_used_bytes + cdl_used -
-		    (ohds->ds_phys->ds_used_bytes + odl_used);
-		dcomp = cds->ds_phys->ds_compressed_bytes + cdl_comp -
-		    (ohds->ds_phys->ds_compressed_bytes + odl_comp);
-		duncomp = cds->ds_phys->ds_uncompressed_bytes + cdl_uncomp -
-		    (ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
+		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
+		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
+		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
+		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
+		    cdl_uncomp -
+		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
 
-		dsl_dir_diduse_space(odd, dused, dcomp, duncomp, tx);
-		dsl_dir_diduse_space(cdd, -dused, -dcomp, -duncomp, tx);
+		dsl_dir_diduse_space(csa->ohds->ds_dir,
+		    dused, dcomp, duncomp, tx);
+		dsl_dir_diduse_space(csa->cds->ds_dir,
+		    -dused, -dcomp, -duncomp, tx);
+	}
+
+#define	SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
 	}
 
 	/* swap ds_*_bytes */
-	SWITCH64(ohds->ds_phys->ds_used_bytes, cds->ds_phys->ds_used_bytes);
-	SWITCH64(ohds->ds_phys->ds_compressed_bytes,
-	    cds->ds_phys->ds_compressed_bytes);
-	SWITCH64(ohds->ds_phys->ds_uncompressed_bytes,
-	    cds->ds_phys->ds_uncompressed_bytes);
+	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
+	    csa->cds->ds_phys->ds_used_bytes);
+	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
+	    csa->cds->ds_phys->ds_compressed_bytes);
+	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
+	    csa->cds->ds_phys->ds_uncompressed_bytes);
 
 	/* swap deadlists */
-	bplist_close(&cds->ds_deadlist);
-	bplist_close(&ohds->ds_deadlist);
-	SWITCH64(ohds->ds_phys->ds_deadlist_obj, cds->ds_phys->ds_deadlist_obj);
-	VERIFY(0 == bplist_open(&cds->ds_deadlist, dp->dp_meta_objset,
-	    cds->ds_phys->ds_deadlist_obj));
-	VERIFY(0 == bplist_open(&ohds->ds_deadlist, dp->dp_meta_objset,
-	    ohds->ds_phys->ds_deadlist_obj));
-
-	dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG);
-	dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+	bplist_close(&csa->cds->ds_deadlist);
+	bplist_close(&csa->ohds->ds_deadlist);
+	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
+	    csa->cds->ds_phys->ds_deadlist_obj);
+	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+	    csa->cds->ds_phys->ds_deadlist_obj));
+	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+	    csa->ohds->ds_phys->ds_deadlist_obj));
 }
 
 /*
  * Swap the clone "cosname" with its origin head file system.
  */
 int
-dsl_dataset_clone_swap(const char *cosname, boolean_t force)
+dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
+    boolean_t force)
 {
-	dsl_dataset_t *ds;
-	int err;
+	struct cloneswaparg csa;
 
-	err = dsl_dataset_open(cosname,
-	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, FTAG, &ds);
-	if (err)
-		return (err);
+	ASSERT(clone->ds_open_refcount == DS_REF_MAX);
+	ASSERT(origin_head->ds_open_refcount == DS_REF_MAX);
 
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+	csa.cds = clone;
+	csa.ohds = origin_head;
+	csa.force = force;
+	return (dsl_sync_task_do(clone->ds_dir->dd_pool,
 	    dsl_dataset_clone_swap_check,
-	    dsl_dataset_clone_swap_sync, ds, &force, 9);
-	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-	return (err);
+	    dsl_dataset_clone_swap_sync, &csa, NULL, 9));
 }
 
 /*