changeset 5530:4ed96167d864 onnv_79

6354519 stack overflow in zfs due to zio pipeline
6533726 single-threaded checksum & parity calculations limit write bandwidth
6547248 ztest detects a future leak when there is none
6604198 zfs only using single cpu for compression (part II)
author bonwick
date Tue, 27 Nov 2007 22:58:05 -0800
parents 1d93704c274d
children 05af4ecdb87d
files
	deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c
	deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h
	usr/src/cmd/zdb/zdb.c
	usr/src/cmd/ztest/ztest.c
	usr/src/lib/libzpool/common/llib-lzpool
	usr/src/uts/common/Makefile.files
	usr/src/uts/common/fs/zfs/metaslab.c
	usr/src/uts/common/fs/zfs/rprwlock.c
	usr/src/uts/common/fs/zfs/spa_misc.c
	usr/src/uts/common/fs/zfs/sys/rprwlock.h
	usr/src/uts/common/fs/zfs/sys/spa_impl.h
	usr/src/uts/common/fs/zfs/sys/vdev.h
	usr/src/uts/common/fs/zfs/sys/vdev_impl.h
	usr/src/uts/common/fs/zfs/sys/zio.h
	usr/src/uts/common/fs/zfs/sys/zio_impl.h
	usr/src/uts/common/fs/zfs/vdev.c
	usr/src/uts/common/fs/zfs/vdev_cache.c
	usr/src/uts/common/fs/zfs/vdev_disk.c
	usr/src/uts/common/fs/zfs/vdev_file.c
	usr/src/uts/common/fs/zfs/vdev_mirror.c
	usr/src/uts/common/fs/zfs/vdev_missing.c
	usr/src/uts/common/fs/zfs/vdev_queue.c
	usr/src/uts/common/fs/zfs/vdev_raidz.c
	usr/src/uts/common/fs/zfs/zio.c
diffstat 24 files changed, 1097 insertions(+), 1218 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c	Tue Nov 27 22:58:05 2007 -0800
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+#include <sys/rprwlock.h>
+
+void
+rprw_init(rprwlock_t *rwl)
+{
+	mutex_init(&rwl->rw_lock, NULL, MUTEX_DEFAULT, NULL);
+	rwl->rw_writer = NULL;
+	cv_init(&rwl->rw_cv, NULL, CV_DEFAULT, NULL);
+	refcount_create(&rwl->rw_count);
+}
+
+void
+rprw_destroy(rprwlock_t *rwl)
+{
+	mutex_destroy(&rwl->rw_lock);
+	ASSERT(rwl->rw_writer == NULL);
+	cv_destroy(&rwl->rw_cv);
+	refcount_destroy(&rwl->rw_count);
+}
+
+void
+rprw_enter_read(rprwlock_t *rwl, void *tag)
+{
+	mutex_enter(&rwl->rw_lock);
+
+	if (rwl->rw_writer != curthread) {
+		while (rwl->rw_writer != NULL)
+			cv_wait(&rwl->rw_cv, &rwl->rw_lock);
+	}
+
+	(void) refcount_add(&rwl->rw_count, tag);
+
+	mutex_exit(&rwl->rw_lock);
+}
+
+void
+rprw_enter_write(rprwlock_t *rwl, void *tag)
+{
+	mutex_enter(&rwl->rw_lock);
+
+	if (rwl->rw_writer != curthread) {
+		while (!refcount_is_zero(&rwl->rw_count))
+			cv_wait(&rwl->rw_cv, &rwl->rw_lock);
+		rwl->rw_writer = curthread;
+	}
+
+	(void) refcount_add(&rwl->rw_count, tag);
+
+	mutex_exit(&rwl->rw_lock);
+}
+
+void
+rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag)
+{
+	if (rw == RW_READER)
+		rprw_enter_read(rwl, tag);
+	else
+		rprw_enter_write(rwl, tag);
+}
+
+void
+rprw_exit(rprwlock_t *rwl, void *tag)
+{
+	mutex_enter(&rwl->rw_lock);
+
+	ASSERT(!refcount_is_zero(&rwl->rw_count));
+	ASSERT(rwl->rw_writer == NULL || curthread == rwl->rw_writer);
+	if (refcount_remove(&rwl->rw_count, tag) == 0) {
+		cv_broadcast(&rwl->rw_cv);
+		rwl->rw_writer = NULL;  /* OK in either case */
+	}
+
+	mutex_exit(&rwl->rw_lock);
+}
+
+boolean_t
+rprw_held(rprwlock_t *rwl, krw_t rw)
+{
+	boolean_t held;
+
+	mutex_enter(&rwl->rw_lock);
+	if (rw == RW_WRITER)
+		held = (rwl->rw_writer == curthread);
+	else
+		held = !rwl->rw_writer && !refcount_is_zero(&rwl->rw_count);
+	mutex_exit(&rwl->rw_lock);
+
+	return (held);
+}
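
The rprwlock above (preserved under deleted_files/ because this changeset replaces it with an inline spa_config_lock in spa_misc.c) is a re-entrant, reference-counted reader/writer lock: a thread that already owns it as writer may take further read or write holds without blocking, and the lock is fully released only when the last hold drops. A minimal user-level sketch of the same idea using POSIX threads instead of the kernel mutex/cv/refcount primitives (illustration only, not part of the changeset):

#include <pthread.h>
#include <assert.h>

typedef struct rplock {
	pthread_mutex_t	lock;		/* protects all fields below */
	pthread_cond_t	cv;		/* signaled when last hold drops */
	pthread_t	writer;		/* valid only while writer_held */
	int		writer_held;
	int		count;		/* total holds, read + write */
} rplock_t;	/* init: PTHREAD_MUTEX/COND_INITIALIZER, zeroed fields */

static int
self_is_writer(rplock_t *l)
{
	return (l->writer_held && pthread_equal(l->writer, pthread_self()));
}

void
rplock_enter_read(rplock_t *l)
{
	pthread_mutex_lock(&l->lock);
	if (!self_is_writer(l))			/* owner re-enters freely */
		while (l->writer_held)
			pthread_cond_wait(&l->cv, &l->lock);
	l->count++;
	pthread_mutex_unlock(&l->lock);
}

void
rplock_enter_write(rplock_t *l)
{
	pthread_mutex_lock(&l->lock);
	if (!self_is_writer(l)) {
		while (l->count != 0)		/* wait out all holds */
			pthread_cond_wait(&l->cv, &l->lock);
		l->writer = pthread_self();
		l->writer_held = 1;
	}
	l->count++;				/* re-entrant write hold */
	pthread_mutex_unlock(&l->lock);
}

void
rplock_exit(rplock_t *l)
{
	pthread_mutex_lock(&l->lock);
	assert(l->count > 0);
	if (--l->count == 0) {			/* last hold releases all */
		l->writer_held = 0;
		pthread_cond_broadcast(&l->cv);
	}
	pthread_mutex_unlock(&l->lock);
}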
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h	Tue Nov 27 22:58:05 2007 -0800
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_RPRWLOCK_H
+#define	_SYS_RPRWLOCK_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct rprwlock {
+	kmutex_t	rw_lock;
+	kthread_t	*rw_writer;
+	kcondvar_t	rw_cv;
+	refcount_t	rw_count;
+} rprwlock_t;
+
+void rprw_init(rprwlock_t *rwl);
+void rprw_destroy(rprwlock_t *rwl);
+void rprw_enter_read(rprwlock_t *rwl, void *tag);
+void rprw_enter_write(rprwlock_t *rwl, void *tag);
+void rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag);
+void rprw_exit(rprwlock_t *rwl, void *tag);
+boolean_t rprw_held(rprwlock_t *rwl, krw_t rw);
+#define	RPRW_READ_HELD(x)	rprw_held(x, RW_READER)
+#define	RPRW_WRITE_HELD(x)	rprw_held(x, RW_WRITER)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_RPRWLOCK_H */
--- a/usr/src/cmd/zdb/zdb.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/cmd/zdb/zdb.c	Tue Nov 27 22:58:05 2007 -0800
@@ -501,10 +501,8 @@
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vd = rvd->vdev_child[c];
 
-		spa_config_enter(spa, RW_READER, FTAG);
 		(void) printf("\n    vdev %llu = %s\n\n",
 		    (u_longlong_t)vd->vdev_id, vdev_description(vd));
-		spa_config_exit(spa, FTAG);
 
 		if (dump_opt['d'] <= 5) {
 			(void) printf("\t%10s   %10s   %5s\n",
@@ -522,7 +520,6 @@
 dump_dtl(vdev_t *vd, int indent)
 {
 	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
-	spa_t *spa = vd->vdev_spa;
 	space_seg_t *ss;
 	vdev_t *pvd;
 	int c;
@@ -530,9 +527,7 @@
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
-	spa_config_enter(spa, RW_READER, FTAG);
 	(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
-	spa_config_exit(spa, FTAG);
 
 	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
 		/*
@@ -1730,6 +1725,8 @@
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
+	spa_config_enter(spa, RW_READER, FTAG);
+
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock);
 
@@ -1751,6 +1748,8 @@
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
+	spa_config_exit(spa, FTAG);
+
 	if (rc != 0)
 		exit(rc);
 }
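
The zdb changes above hoist the configuration lock out of the per-vdev paths: rather than bracketing each vdev_description() call, dump_zpool() now takes a single read hold around the entire dump, so the vdev tree stays stable for the whole report. The resulting pattern, in sketch form (dump_one_vdev is a hypothetical stand-in for the dump_* callees):

	spa_config_enter(spa, RW_READER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++)
		dump_one_vdev(rvd->vdev_child[c]);	/* no per-child locking */

	spa_config_exit(spa, FTAG);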
--- a/usr/src/cmd/ztest/ztest.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/cmd/ztest/ztest.c	Tue Nov 27 22:58:05 2007 -0800
@@ -127,8 +127,18 @@
 static int zopt_maxfaults;
 static uint16_t zopt_write_fail_shift = 5;
 
+typedef struct ztest_block_tag {
+	uint64_t	bt_objset;
+	uint64_t	bt_object;
+	uint64_t	bt_offset;
+	uint64_t	bt_txg;
+	uint64_t	bt_thread;
+	uint64_t	bt_seq;
+} ztest_block_tag_t;
+
 typedef struct ztest_args {
-	char		*za_pool;
+	char		za_pool[MAXNAMELEN];
+	spa_t		*za_spa;
 	objset_t	*za_os;
 	zilog_t		*za_zilog;
 	thread_t	za_thread;
@@ -141,6 +151,13 @@
 	hrtime_t	za_stop;
 	hrtime_t	za_kill;
 	traverse_handle_t *za_th;
+	/*
+	 * Thread-local variables can go here to aid debugging.
+	 */
+	ztest_block_tag_t za_rbt;
+	ztest_block_tag_t za_wbt;
+	dmu_object_info_t za_doi;
+	dmu_buf_t	*za_dbuf;
 } ztest_args_t;
 
 typedef void ztest_func_t(ztest_args_t *);
@@ -167,6 +184,7 @@
 
 typedef struct ztest_info {
 	ztest_func_t	*zi_func;	/* test function */
+	uint64_t	zi_iters;	/* iterations per execution */
 	uint64_t	*zi_interval;	/* execute every <interval> seconds */
 	uint64_t	zi_calls;	/* per-pass count */
 	uint64_t	zi_call_time;	/* per-pass time */
@@ -180,22 +198,22 @@
 uint64_t zopt_rarely = 60;		/* every 60 seconds */
 
 ztest_info_t ztest_info[] = {
-	{ ztest_dmu_read_write,			&zopt_always	},
-	{ ztest_dmu_write_parallel,		&zopt_always	},
-	{ ztest_dmu_object_alloc_free,		&zopt_always	},
-	{ ztest_zap,				&zopt_always	},
-	{ ztest_zap_parallel,			&zopt_always	},
-	{ ztest_traverse,			&zopt_often	},
-	{ ztest_dsl_prop_get_set,		&zopt_sometimes	},
-	{ ztest_dmu_objset_create_destroy,	&zopt_sometimes	},
-	{ ztest_dmu_snapshot_create_destroy,	&zopt_rarely	},
-	{ ztest_spa_create_destroy,		&zopt_sometimes	},
-	{ ztest_fault_inject,			&zopt_sometimes	},
-	{ ztest_spa_rename,			&zopt_rarely	},
-	{ ztest_vdev_attach_detach,		&zopt_rarely	},
-	{ ztest_vdev_LUN_growth,		&zopt_rarely	},
-	{ ztest_vdev_add_remove,		&zopt_vdevtime	},
-	{ ztest_scrub,				&zopt_vdevtime	},
+	{ ztest_dmu_read_write,			1,	&zopt_always	},
+	{ ztest_dmu_write_parallel,		30,	&zopt_always	},
+	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
+	{ ztest_zap,				30,	&zopt_always	},
+	{ ztest_zap_parallel,			100,	&zopt_always	},
+	{ ztest_traverse,			1,	&zopt_often	},
+	{ ztest_dsl_prop_get_set,		1,	&zopt_sometimes	},
+	{ ztest_dmu_objset_create_destroy,	1,	&zopt_sometimes	},
+	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_rarely	},
+	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
+	{ ztest_fault_inject,			1,	&zopt_sometimes	},
+	{ ztest_spa_rename,			1,	&zopt_rarely	},
+	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
+	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
+	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
+	{ ztest_scrub,				1,	&zopt_vdevtime	},
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
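
Each ztest_info entry now carries zi_iters, so cheap, contention-heavy tests (parallel writes, ZAP updates) run many iterations per scheduling slot while the old 100-iteration loops inside the test functions themselves are removed. The driver side appears in the ztest_thread hunk further down:

	for (i = 0; i < zi->zi_iters; i++)
		zi->zi_func(za);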
@@ -214,21 +232,11 @@
 	hrtime_t	zs_stop_time;
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
-	uint64_t	zs_txg;
 	ztest_info_t	zs_info[ZTEST_FUNCS];
 	mutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
 	uint64_t	zs_seq[ZTEST_SYNC_LOCKS];
 } ztest_shared_t;
 
-typedef struct ztest_block_tag {
-	uint64_t	bt_objset;
-	uint64_t	bt_object;
-	uint64_t	bt_offset;
-	uint64_t	bt_txg;
-	uint64_t	bt_thread;
-	uint64_t	bt_seq;
-} ztest_block_tag_t;
-
 static char ztest_dev_template[] = "%s/%s.%llua";
 static ztest_shared_t *ztest_shared;
 
@@ -237,7 +245,7 @@
 
 static boolean_t ztest_exiting = B_FALSE;
 
-extern uint64_t zio_gang_bang;
+extern uint64_t metaslab_gang_bang;
 extern uint16_t zio_zil_fail_shift;
 extern uint16_t zio_io_fail_shift;
 
@@ -359,7 +367,7 @@
 	FILE *fp = requested ? stdout : stderr;
 
 	nicenum(zopt_vdev_size, nice_vdev_size);
-	nicenum(zio_gang_bang, nice_gang_bang);
+	nicenum(metaslab_gang_bang, nice_gang_bang);
 
 	(void) fprintf(fp, "Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
@@ -432,7 +440,7 @@
 	uint64_t value;
 
 	/* By default, test gang blocks for blocks 32K and greater */
-	zio_gang_bang = 32 << 10;
+	metaslab_gang_bang = 32 << 10;
 
 	/* Default value, fail every 32nd allocation */
 	zio_zil_fail_shift = 5;
@@ -484,7 +492,7 @@
 			zopt_threads = MAX(1, value);
 			break;
 		case 'g':
-			zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
+			metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
 			break;
 		case 'i':
 			zopt_init = value;
@@ -835,7 +843,7 @@
 void
 ztest_vdev_add_remove(ztest_args_t *za)
 {
-	spa_t *spa = dmu_objset_spa(za->za_os);
+	spa_t *spa = za->za_spa;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	nvlist_t *nvroot;
 	int error;
@@ -906,7 +914,7 @@
 void
 ztest_vdev_attach_detach(ztest_args_t *za)
 {
-	spa_t *spa = dmu_objset_spa(za->za_os);
+	spa_t *spa = za->za_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
 	nvlist_t *root, *file;
@@ -1056,7 +1064,7 @@
 void
 ztest_vdev_LUN_growth(ztest_args_t *za)
 {
-	spa_t *spa = dmu_objset_spa(za->za_os);
+	spa_t *spa = za->za_spa;
 	char dev_name[MAXPATHLEN];
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t vdev;
@@ -1106,7 +1114,7 @@
 	 */
 	VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
 	    DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
-	    DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0);
+	    DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
 
 	VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
@@ -1115,12 +1123,12 @@
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
-/* ARGSUSED */
 static int
 ztest_destroy_cb(char *name, void *arg)
 {
+	ztest_args_t *za = arg;
 	objset_t *os;
-	dmu_object_info_t doi;
+	dmu_object_info_t *doi = &za->za_doi;
 	int error;
 
 	/*
@@ -1129,12 +1137,12 @@
 	error = dmu_objset_open(name, DMU_OST_OTHER,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
 	ASSERT3U(error, ==, 0);
-	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+	error = dmu_object_info(os, ZTEST_DIROBJ, doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
 		ASSERT3U(error, ==, 0);
-		ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER);
-		ASSERT3S(doi.doi_physical_blks, >=, 0);
+		ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
+		ASSERT3S(doi->doi_physical_blks, >=, 0);
 	}
 	dmu_objset_close(os);
 
@@ -1215,7 +1223,7 @@
 	 * create lying around from a previous run.  If so, destroy it
 	 * and all of its snapshots.
 	 */
-	(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
+	(void) dmu_objset_find(name, ztest_destroy_cb, za,
 	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
 	/*
@@ -1428,7 +1436,7 @@
 void
 ztest_traverse(ztest_args_t *za)
 {
-	spa_t *spa = dmu_objset_spa(za->za_os);
+	spa_t *spa = za->za_spa;
 	traverse_handle_t *th = za->za_th;
 	int rc, advance;
 	uint64_t cbstart, cblimit;
@@ -1500,7 +1508,7 @@
 	dmu_tx_t *tx;
 	uint64_t batchobj, object, batchsize, endoff, temp;
 	int b, c, error, bonuslen;
-	dmu_object_info_t doi;
+	dmu_object_info_t *doi = &za->za_doi;
 	char osname[MAXNAMELEN];
 
 	dmu_objset_name(os, osname);
@@ -1545,13 +1553,14 @@
 		 * We expect the nth byte of the bonus buffer to be n.
 		 */
 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
-
-		dmu_object_info_from_db(db, &doi);
-		ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
-		ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
-		ASSERT3S(doi.doi_physical_blks, >=, 0);
-
-		bonuslen = doi.doi_bonus_size;
+		za->za_dbuf = db;
+
+		dmu_object_info_from_db(db, doi);
+		ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
+		ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
+		ASSERT3S(doi->doi_physical_blks, >=, 0);
+
+		bonuslen = doi->doi_bonus_size;
 
 		for (c = 0; c < bonuslen; c++) {
 			if (((uint8_t *)db->db_data)[c] !=
@@ -1565,6 +1574,7 @@
 		}
 
 		dmu_buf_rele(db, FTAG);
+		za->za_dbuf = NULL;
 
 		/*
 		 * We expect the word at endoff to be our object number.
@@ -1669,7 +1679,8 @@
 		/*
 		 * Write to both the bonus buffer and the regular data.
 		 */
-		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
+		VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
+		za->za_dbuf = db;
 		ASSERT3U(bonuslen, <=, db->db_size);
 
 		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
@@ -1685,6 +1696,7 @@
 			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
 
 		dmu_buf_rele(db, FTAG);
+		za->za_dbuf = NULL;
 
 		/*
 		 * Write to a large offset to increase indirection.
@@ -1939,244 +1951,229 @@
 }
 
 void
-ztest_dmu_check_future_leak(objset_t *os, uint64_t txg)
+ztest_dmu_check_future_leak(ztest_args_t *za)
 {
+	objset_t *os = za->za_os;
 	dmu_buf_t *db;
-	ztest_block_tag_t rbt;
-
-	if (zopt_verbose >= 3) {
-		char osname[MAXNAMELEN];
-		dmu_objset_name(os, osname);
-		(void) printf("checking %s for future leaks in txg %lld...\n",
-		    osname, (u_longlong_t)txg);
-	}
+	ztest_block_tag_t *bt;
+	dmu_object_info_t *doi = &za->za_doi;
 
 	/*
 	 * Make sure that, if there is a write record in the bonus buffer
 	 * of the ZTEST_DIROBJ, that the txg for this record is <= the
 	 * last synced txg of the pool.
 	 */
-
-	VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
-	ASSERT3U(db->db_size, >=, sizeof (rbt));
-	bcopy(db->db_data, &rbt, sizeof (rbt));
-	if (rbt.bt_objset != 0) {
-		ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
-		ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
-		ASSERT3U(rbt.bt_offset, ==, -1ULL);
-		if (rbt.bt_txg > txg) {
-			fatal(0,
-			    "future leak: got %llx, last synced txg is %llx",
-			    rbt.bt_txg, txg);
-		}
+	VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
+	za->za_dbuf = db;
+	VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
+	ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
+	ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
+	ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
+	bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
+	if (bt->bt_objset != 0) {
+		ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+		ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
+		ASSERT3U(bt->bt_offset, ==, -1ULL);
+		ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
 	}
 	dmu_buf_rele(db, FTAG);
+	za->za_dbuf = NULL;
 }
 
 void
 ztest_dmu_write_parallel(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
-	dmu_tx_t *tx;
+	ztest_block_tag_t *rbt = &za->za_rbt;
+	ztest_block_tag_t *wbt = &za->za_wbt;
+	const size_t btsize = sizeof (ztest_block_tag_t);
 	dmu_buf_t *db;
-	int i, b, error, do_free, bs;
-	uint64_t off, txg_how, txg;
+	int b, error;
+	int bs = ZTEST_DIROBJ_BLOCKSIZE;
+	int do_free = 0;
+	uint64_t off, txg_how;
 	mutex_t *lp;
 	char osname[MAXNAMELEN];
 	char iobuf[SPA_MAXBLOCKSIZE];
-	ztest_block_tag_t rbt, wbt;
+	blkptr_t blk = { 0 };
+	uint64_t blkoff;
+	zbookmark_t zb;
+	dmu_tx_t *tx = dmu_tx_create(os);
 
 	dmu_objset_name(os, osname);
-	bs = ZTEST_DIROBJ_BLOCKSIZE;
 
 	/*
 	 * Have multiple threads write to large offsets in ZTEST_DIROBJ
 	 * to verify that having multiple threads writing to the same object
 	 * in parallel doesn't cause any trouble.
-	 * Also do parallel writes to the bonus buffer on occasion.
 	 */
-	for (i = 0; i < 50; i++) {
+	if (ztest_random(4) == 0) {
+		/*
+		 * Do the bonus buffer instead of a regular block.
+		 * We need a lock to serialize resize vs. others,
+		 * so we hash on the objset ID.
+		 */
+		b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
+		off = -1ULL;
+		dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
+	} else {
 		b = ztest_random(ZTEST_SYNC_LOCKS);
-		lp = &ztest_shared->zs_sync_lock[b];
-
-		do_free = (ztest_random(4) == 0);
-
-		off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT);
-
+		off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
 		if (ztest_random(4) == 0) {
-			/*
-			 * Do the bonus buffer instead of a regular block.
-			 */
-			do_free = 0;
-			off = -1ULL;
-		}
-
-		tx = dmu_tx_create(os);
-
-		if (off == -1ULL)
-			dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
-		else if (do_free)
+			do_free = 1;
 			dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
-		else
+		} else {
 			dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
-
-		txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
-		error = dmu_tx_assign(tx, txg_how);
-		if (error) {
-			if (error == ERESTART) {
-				ASSERT(txg_how == TXG_NOWAIT);
-				dmu_tx_wait(tx);
-				dmu_tx_abort(tx);
-				continue;
-			}
-			dmu_tx_abort(tx);
-			ztest_record_enospc("dmu write parallel");
-			return;
-		}
-		txg = dmu_tx_get_txg(tx);
-
-		if (do_free) {
-			(void) mutex_lock(lp);
-			VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off,
-			    bs, tx));
-			(void) mutex_unlock(lp);
-			dmu_tx_commit(tx);
-			continue;
-		}
-
-		wbt.bt_objset = dmu_objset_id(os);
-		wbt.bt_object = ZTEST_DIROBJ;
-		wbt.bt_offset = off;
-		wbt.bt_txg = txg;
-		wbt.bt_thread = za->za_instance;
-
-		if (off == -1ULL) {
-			dmu_object_info_t doi;
-			char *off;
-
-			wbt.bt_seq = 0;
-			VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
-			    FTAG, &db));
-			dmu_object_info_from_db(db, &doi);
-			ASSERT3U(doi.doi_bonus_size, >=, sizeof (wbt));
-			off = (char *)db->db_data +
-			    doi.doi_bonus_size - sizeof (wbt);
-			bcopy(off, &rbt, sizeof (wbt));
-			if (rbt.bt_objset != 0) {
-				ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
-				ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
-				ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
-				ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
-			}
-			if (ztest_random(10) == 0) {
-				int newsize = (ztest_random(
-				    db->db_size / sizeof (wbt)) + 1) *
-				    sizeof (wbt);
-
-				ASSERT3U(newsize, >=, sizeof (wbt));
-				ASSERT3U(newsize, <=, db->db_size);
-				error = dmu_set_bonus(db, newsize, tx);
-				ASSERT3U(error, ==, 0);
-				off = (char *)db->db_data + newsize -
-				    sizeof (wbt);
-			}
-			dmu_buf_will_dirty(db, tx);
-			bcopy(&wbt, off, db->db_size);
-			dmu_buf_rele(db, FTAG);
-			dmu_tx_commit(tx);
-			continue;
-		}
-
-		(void) mutex_lock(lp);
-
-		wbt.bt_seq = ztest_shared->zs_seq[b]++;
-
-		dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx);
-
-		(void) mutex_unlock(lp);
-
-		if (ztest_random(100) == 0)
-			(void) poll(NULL, 0, 1); /* open dn_notxholds window */
-
-		dmu_tx_commit(tx);
-
-		if (ztest_random(1000) == 0)
-			txg_wait_synced(dmu_objset_pool(os), txg);
-
-		if (ztest_random(2) == 0) {
-			blkptr_t blk = { 0 };
-			uint64_t blkoff;
-			zbookmark_t zb;
-
-			(void) mutex_lock(lp);
-			blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
-			error = dmu_buf_hold(os,
-			    ZTEST_DIROBJ, blkoff, FTAG, &db);
-			if (error) {
-				dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
-				    osname, ZTEST_DIROBJ, blkoff, error);
-				(void) mutex_unlock(lp);
-				continue;
-			}
-			blkoff = off - blkoff;
-			error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
-			dmu_buf_rele(db, FTAG);
-			(void) mutex_unlock(lp);
-			if (error) {
-				dprintf("dmu_sync(%s, %d, %llx) = %d\n",
-				    osname, ZTEST_DIROBJ, off, error);
-				continue;
-			}
-
-			if (blk.blk_birth == 0)	{	/* concurrent free */
-				continue;
-			}
-			txg_suspend(dmu_objset_pool(os));
-
-			ASSERT(blk.blk_fill == 1);
-			ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
-			ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
-			ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
-
-			/*
-			 * Read the block that dmu_sync() returned to
-			 * make sure its contents match what we wrote.
-			 * We do this while still txg_suspend()ed to ensure
-			 * that the block can't be reused before we read it.
-			 */
-			zb.zb_objset = dmu_objset_id(os);
-			zb.zb_object = ZTEST_DIROBJ;
-			zb.zb_level = 0;
-			zb.zb_blkid = off / bs;
-			error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
-			    &blk, iobuf, bs, NULL, NULL,
-			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
-			ASSERT(error == 0);
-
-			txg_resume(dmu_objset_pool(os));
-
-			bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
-
-			if (rbt.bt_objset == 0)		/* concurrent free */
-				continue;
-
-			ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
-			ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
-			ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
-
-			/*
-			 * The semantic of dmu_sync() is that we always
-			 * push the most recent version of the data,
-			 * so in the face of concurrent updates we may
-			 * see a newer version of the block.  That's OK.
-			 */
-			ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg);
-			if (rbt.bt_thread == wbt.bt_thread)
-				ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq);
-			else
-				ASSERT3U(rbt.bt_seq, >, wbt.bt_seq);
 		}
 	}
+
+	txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
+	error = dmu_tx_assign(tx, txg_how);
+	if (error) {
+		if (error == ERESTART) {
+			ASSERT(txg_how == TXG_NOWAIT);
+			dmu_tx_wait(tx);
+		} else {
+			ztest_record_enospc("dmu write parallel");
+		}
+		dmu_tx_abort(tx);
+		return;
+	}
+
+	lp = &ztest_shared->zs_sync_lock[b];
+	(void) mutex_lock(lp);
+
+	wbt->bt_objset = dmu_objset_id(os);
+	wbt->bt_object = ZTEST_DIROBJ;
+	wbt->bt_offset = off;
+	wbt->bt_txg = dmu_tx_get_txg(tx);
+	wbt->bt_thread = za->za_instance;
+	wbt->bt_seq = ztest_shared->zs_seq[b]++;	/* protected by lp */
+
+	if (off == -1ULL) {
+		dmu_object_info_t *doi = &za->za_doi;
+		char *dboff;
+
+		VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
+		za->za_dbuf = db;
+		dmu_object_info_from_db(db, doi);
+		ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
+		ASSERT3U(doi->doi_bonus_size, >=, btsize);
+		ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
+		dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
+		bcopy(dboff, rbt, btsize);
+		if (rbt->bt_objset != 0) {
+			ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
+			ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
+			ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
+			ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
+		}
+		if (ztest_random(10) == 0) {
+			int newsize = (ztest_random(db->db_size /
+			    btsize) + 1) * btsize;
+
+			ASSERT3U(newsize, >=, btsize);
+			ASSERT3U(newsize, <=, db->db_size);
+			VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
+			dboff = (char *)db->db_data + newsize - btsize;
+		}
+		dmu_buf_will_dirty(db, tx);
+		bcopy(wbt, dboff, btsize);
+		dmu_buf_rele(db, FTAG);
+		za->za_dbuf = NULL;
+	} else if (do_free) {
+		VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
+	} else {
+		dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
+	}
+
+	(void) mutex_unlock(lp);
+
+	if (ztest_random(1000) == 0)
+		(void) poll(NULL, 0, 1); /* open dn_notxholds window */
+
+	dmu_tx_commit(tx);
+
+	if (ztest_random(10000) == 0)
+		txg_wait_synced(dmu_objset_pool(os), wbt->bt_txg);
+
+	if (off == -1 || do_free)
+		return;
+
+	if (ztest_random(2) != 0)
+		return;
+
+	/*
+	 * dmu_sync() the block we just wrote.
+	 */
+	(void) mutex_lock(lp);
+
+	blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
+	error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
+	za->za_dbuf = db;
+	if (error) {
+		dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
+		    osname, ZTEST_DIROBJ, blkoff, error);
+		(void) mutex_unlock(lp);
+		return;
+	}
+	blkoff = off - blkoff;
+	error = dmu_sync(NULL, db, &blk, wbt->bt_txg, NULL, NULL);
+	dmu_buf_rele(db, FTAG);
+	za->za_dbuf = NULL;
+
+	(void) mutex_unlock(lp);
+
+	if (error) {
+		dprintf("dmu_sync(%s, %d, %llx) = %d\n",
+		    osname, ZTEST_DIROBJ, off, error);
+		return;
+	}
+
+	if (blk.blk_birth == 0)		/* concurrent free */
+		return;
+
+	txg_suspend(dmu_objset_pool(os));
+
+	ASSERT(blk.blk_fill == 1);
+	ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
+	ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
+	ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
+
+	/*
+	 * Read the block that dmu_sync() returned to make sure its contents
+	 * match what we wrote.  We do this while still txg_suspend()ed
+	 * to ensure that the block can't be reused before we read it.
+	 */
+	zb.zb_objset = dmu_objset_id(os);
+	zb.zb_object = ZTEST_DIROBJ;
+	zb.zb_level = 0;
+	zb.zb_blkid = off / bs;
+	error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
+	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
+	ASSERT3U(error, ==, 0);
+
+	txg_resume(dmu_objset_pool(os));
+
+	bcopy(&iobuf[blkoff], rbt, btsize);
+
+	if (rbt->bt_objset == 0)		/* concurrent free */
+		return;
+
+	ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
+	ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
+	ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
+
+	/*
+	 * The semantic of dmu_sync() is that we always push the most recent
+	 * version of the data, so in the face of concurrent updates we may
+	 * see a newer version of the block.  That's OK.
+	 */
+	ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
+	if (rbt->bt_thread == wbt->bt_thread)
+		ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
+	else
+		ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
 }
 
 /*
@@ -2195,7 +2192,6 @@
 	uint64_t value[ZTEST_ZAP_MAX_INTS];
 	uint64_t zl_ints, zl_intsize, prop;
 	int i, ints;
-	int iters = 100;
 	dmu_tx_t *tx;
 	char propname[100], txgname[100];
 	int error;
@@ -2259,122 +2255,113 @@
 
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
-	while (--iters >= 0) {
-		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
-		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
-		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
-		bzero(value, sizeof (value));
-		last_txg = 0;
-
-		/*
-		 * If these zap entries already exist, validate their contents.
-		 */
-		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
-		if (error == 0) {
-			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
-			ASSERT3U(zl_ints, ==, 1);
-
-			error = zap_lookup(os, object, txgname, zl_intsize,
-			    zl_ints, &last_txg);
-
-			ASSERT3U(error, ==, 0);
-
-			error = zap_length(os, object, propname, &zl_intsize,
-			    &zl_ints);
-
-			ASSERT3U(error, ==, 0);
-			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
-			ASSERT3U(zl_ints, ==, ints);
-
-			error = zap_lookup(os, object, propname, zl_intsize,
-			    zl_ints, value);
-
-			ASSERT3U(error, ==, 0);
-
-			for (i = 0; i < ints; i++) {
-				ASSERT3U(value[i], ==, last_txg + object + i);
-			}
-		} else {
-			ASSERT3U(error, ==, ENOENT);
-		}
-
-		/*
-		 * Atomically update two entries in our zap object.
-		 * The first is named txg_%llu, and contains the txg
-		 * in which the property was last updated.  The second
-		 * is named prop_%llu, and the nth element of its value
-		 * should be txg + object + n.
-		 */
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_zap(tx, object, TRUE, NULL);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create zap entry");
-			dmu_tx_abort(tx);
-			return;
+	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+	bzero(value, sizeof (value));
+	last_txg = 0;
+
+	/*
+	 * If these zap entries already exist, validate their contents.
+	 */
+	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+	if (error == 0) {
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, 1);
+
+		VERIFY(zap_lookup(os, object, txgname, zl_intsize,
+		    zl_ints, &last_txg) == 0);
+
+		VERIFY(zap_length(os, object, propname, &zl_intsize,
+		    &zl_ints) == 0);
+
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, ints);
+
+		VERIFY(zap_lookup(os, object, propname, zl_intsize,
+		    zl_ints, value) == 0);
+
+		for (i = 0; i < ints; i++) {
+			ASSERT3U(value[i], ==, last_txg + object + i);
 		}
-		txg = dmu_tx_get_txg(tx);
-
-		if (last_txg > txg)
-			fatal(0, "zap future leak: old %llu new %llu",
-			    last_txg, txg);
-
-		for (i = 0; i < ints; i++)
-			value[i] = txg + object + i;
-
-		error = zap_update(os, object, txgname, sizeof (uint64_t),
-		    1, &txg, tx);
-		if (error)
-			fatal(0, "zap_update('%s', %llu, '%s') = %d",
-			    osname, object, txgname, error);
-
-		error = zap_update(os, object, propname, sizeof (uint64_t),
-		    ints, value, tx);
-		if (error)
-			fatal(0, "zap_update('%s', %llu, '%s') = %d",
-			    osname, object, propname, error);
-
-		dmu_tx_commit(tx);
-
-		/*
-		 * Remove a random pair of entries.
-		 */
-		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
-		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
-		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
-
-		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
-
-		if (error == ENOENT)
-			continue;
-
-		ASSERT3U(error, ==, 0);
-
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_zap(tx, object, TRUE, NULL);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("remove zap entry");
-			dmu_tx_abort(tx);
-			return;
-		}
-		error = zap_remove(os, object, txgname, tx);
-		if (error)
-			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-			    osname, object, txgname, error);
-
-		error = zap_remove(os, object, propname, tx);
-		if (error)
-			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-			    osname, object, propname, error);
-
-		dmu_tx_commit(tx);
+	} else {
+		ASSERT3U(error, ==, ENOENT);
+	}
+
+	/*
+	 * Atomically update two entries in our zap object.
+	 * The first is named txg_%llu, and contains the txg
+	 * in which the property was last updated.  The second
+	 * is named prop_%llu, and the nth element of its value
+	 * should be txg + object + n.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, TRUE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		ztest_record_enospc("create zap entry");
+		dmu_tx_abort(tx);
+		return;
 	}
+	txg = dmu_tx_get_txg(tx);
+
+	if (last_txg > txg)
+		fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
+
+	for (i = 0; i < ints; i++)
+		value[i] = txg + object + i;
+
+	error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
+	if (error)
+		fatal(0, "zap_update('%s', %llu, '%s') = %d",
+		    osname, object, txgname, error);
+
+	error = zap_update(os, object, propname, sizeof (uint64_t),
+	    ints, value, tx);
+	if (error)
+		fatal(0, "zap_update('%s', %llu, '%s') = %d",
+		    osname, object, propname, error);
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Remove a random pair of entries.
+	 */
+	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+
+	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+
+	if (error == ENOENT)
+		return;
+
+	ASSERT3U(error, ==, 0);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, TRUE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		ztest_record_enospc("remove zap entry");
+		dmu_tx_abort(tx);
+		return;
+	}
+	error = zap_remove(os, object, txgname, tx);
+	if (error)
+		fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+		    osname, object, txgname, error);
+
+	error = zap_remove(os, object, propname, tx);
+	if (error)
+		fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+		    osname, object, propname, error);
+
+	dmu_tx_commit(tx);
 
 	/*
 	 * Once in a while, destroy the object.
 	 */
-	if (ztest_random(100) != 0)
+	if (ztest_random(1000) != 0)
 		return;
 
 	tx = dmu_tx_create(os);
@@ -2401,111 +2388,107 @@
 {
 	objset_t *os = za->za_os;
 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
-	int iters = 100;
 	dmu_tx_t *tx;
 	int i, namelen, error;
 	char name[20], string_value[20];
 	void *data;
 
-	while (--iters >= 0) {
-		/*
-		 * Generate a random name of the form 'xxx.....' where each
-		 * x is a random printable character and the dots are dots.
-		 * There are 94 such characters, and the name length goes from
-		 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
-		 */
-		namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
-
-		for (i = 0; i < 3; i++)
-			name[i] = '!' + ztest_random('~' - '!' + 1);
-		for (; i < namelen - 1; i++)
-			name[i] = '.';
-		name[i] = '\0';
-
-		if (ztest_random(2) == 0)
-			object = ZTEST_MICROZAP_OBJ;
-		else
-			object = ZTEST_FATZAP_OBJ;
-
-		if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
-			wsize = sizeof (txg);
-			wc = 1;
-			data = &txg;
-		} else {
-			wsize = 1;
-			wc = namelen;
-			data = string_value;
+	/*
+	 * Generate a random name of the form 'xxx.....' where each
+	 * x is a random printable character and the dots are dots.
+	 * There are 94 such characters, and the name length goes from
+	 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+	 */
+	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+	for (i = 0; i < 3; i++)
+		name[i] = '!' + ztest_random('~' - '!' + 1);
+	for (; i < namelen - 1; i++)
+		name[i] = '.';
+	name[i] = '\0';
+
+	if (ztest_random(2) == 0)
+		object = ZTEST_MICROZAP_OBJ;
+	else
+		object = ZTEST_FATZAP_OBJ;
+
+	if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+		wsize = sizeof (txg);
+		wc = 1;
+		data = &txg;
+	} else {
+		wsize = 1;
+		wc = namelen;
+		data = string_value;
+	}
+
+	count = -1ULL;
+	VERIFY(zap_count(os, object, &count) == 0);
+	ASSERT(count != -1ULL);
+
+	/*
+	 * Select an operation: length, lookup, add, update, remove.
+	 */
+	i = ztest_random(5);
+
+	if (i >= 2) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, TRUE, NULL);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("zap parallel");
+			dmu_tx_abort(tx);
+			return;
 		}
-
-		count = -1ULL;
-		VERIFY(zap_count(os, object, &count) == 0);
-		ASSERT(count != -1ULL);
-
-		/*
-		 * Select an operation: length, lookup, add, update, remove.
-		 */
-		i = ztest_random(5);
-
-		if (i >= 2) {
-			tx = dmu_tx_create(os);
-			dmu_tx_hold_zap(tx, object, TRUE, NULL);
-			error = dmu_tx_assign(tx, TXG_WAIT);
-			if (error) {
-				ztest_record_enospc("zap parallel");
-				dmu_tx_abort(tx);
-				return;
-			}
-			txg = dmu_tx_get_txg(tx);
-			bcopy(name, string_value, namelen);
+		txg = dmu_tx_get_txg(tx);
+		bcopy(name, string_value, namelen);
+	} else {
+		tx = NULL;
+		txg = 0;
+		bzero(string_value, namelen);
+	}
+
+	switch (i) {
+
+	case 0:
+		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+		if (error == 0) {
+			ASSERT3U(wsize, ==, zl_wsize);
+			ASSERT3U(wc, ==, zl_wc);
 		} else {
-			tx = NULL;
-			txg = 0;
-			bzero(string_value, namelen);
+			ASSERT3U(error, ==, ENOENT);
 		}
-
-		switch (i) {
-
-		case 0:
-			error = zap_length(os, object, name, &zl_wsize, &zl_wc);
-			if (error == 0) {
-				ASSERT3U(wsize, ==, zl_wsize);
-				ASSERT3U(wc, ==, zl_wc);
-			} else {
-				ASSERT3U(error, ==, ENOENT);
-			}
-			break;
-
-		case 1:
-			error = zap_lookup(os, object, name, wsize, wc, data);
-			if (error == 0) {
-				if (data == string_value &&
-				    bcmp(name, data, namelen) != 0)
-					fatal(0, "name '%s' != val '%s' len %d",
-					    name, data, namelen);
-			} else {
-				ASSERT3U(error, ==, ENOENT);
-			}
-			break;
-
-		case 2:
-			error = zap_add(os, object, name, wsize, wc, data, tx);
-			ASSERT(error == 0 || error == EEXIST);
-			break;
-
-		case 3:
-			VERIFY(zap_update(os, object, name, wsize, wc,
-			    data, tx) == 0);
-			break;
-
-		case 4:
-			error = zap_remove(os, object, name, tx);
-			ASSERT(error == 0 || error == ENOENT);
-			break;
+		break;
+
+	case 1:
+		error = zap_lookup(os, object, name, wsize, wc, data);
+		if (error == 0) {
+			if (data == string_value &&
+			    bcmp(name, data, namelen) != 0)
+				fatal(0, "name '%s' != val '%s' len %d",
+				    name, data, namelen);
+		} else {
+			ASSERT3U(error, ==, ENOENT);
 		}
-
-		if (tx != NULL)
-			dmu_tx_commit(tx);
+		break;
+
+	case 2:
+		error = zap_add(os, object, name, wsize, wc, data, tx);
+		ASSERT(error == 0 || error == EEXIST);
+		break;
+
+	case 3:
+		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+		break;
+
+	case 4:
+		error = zap_remove(os, object, name, tx);
+		ASSERT(error == 0 || error == ENOENT);
+		break;
 	}
+
+	if (tx != NULL)
+		dmu_tx_commit(tx);
 }
 
 void
@@ -2590,7 +2573,7 @@
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
-	spa_t *spa = dmu_objset_spa(za->za_os);
+	spa_t *spa = za->za_spa;
 	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	vdev_t *vd0;
@@ -2689,7 +2672,7 @@
 void
 ztest_scrub(ztest_args_t *za)
 {
-	spa_t *spa = dmu_objset_spa(za->za_os);
+	spa_t *spa = za->za_spa;
 
 	mutex_enter(&spa_namespace_lock);
 	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
@@ -2739,7 +2722,7 @@
 	if (error != 0)
 		fatal(0, "spa_open('%s') = %d", newname, error);
 
-	ASSERT(spa == dmu_objset_spa(za->za_os));
+	ASSERT(spa == za->za_spa);
 	spa_close(spa, FTAG);
 
 	/*
@@ -2757,7 +2740,7 @@
 	if (error != 0)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
-	ASSERT(spa == dmu_objset_spa(za->za_os));
+	ASSERT(spa == za->za_spa);
 	spa_close(spa, FTAG);
 
 	umem_free(newname, strlen(newname) + 1);
@@ -3038,29 +3021,15 @@
 	ztest_shared_t *zs = ztest_shared;
 	hrtime_t now, functime;
 	ztest_info_t *zi;
-	int f;
+	int f, i;
 
 	while ((now = gethrtime()) < za->za_stop) {
 		/*
 		 * See if it's time to force a crash.
 		 */
 		if (now > za->za_kill) {
-			dmu_tx_t *tx;
-			uint64_t txg;
-
-			mutex_enter(&spa_namespace_lock);
-			tx = dmu_tx_create(za->za_os);
-			VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
-			txg = dmu_tx_get_txg(tx);
-			dmu_tx_commit(tx);
-			zs->zs_txg = txg;
-			if (zopt_verbose >= 3)
-				(void) printf(
-				    "killing process after txg %lld\n",
-				    (u_longlong_t)txg);
-			txg_wait_synced(dmu_objset_pool(za->za_os), txg);
-			zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os));
-			zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os));
+			zs->zs_alloc = spa_get_alloc(za->za_spa);
+			zs->zs_space = spa_get_space(za->za_spa);
 			(void) kill(getpid(), SIGKILL);
 		}
 
@@ -3085,9 +3054,8 @@
 		    ZTEST_DIRSIZE;
 		za->za_diroff_shared = (1ULL << 63);
 
-		ztest_dmu_write_parallel(za);
-
-		zi->zi_func(za);
+		for (i = 0; i < zi->zi_iters; i++)
+			zi->zi_func(za);
 
 		functime = gethrtime() - now;
 
@@ -3234,6 +3202,17 @@
 
 	for (t = 0; t < zopt_threads; t++) {
 		d = t % zopt_datasets;
+
+		(void) strcpy(za[t].za_pool, pool);
+		za[t].za_os = za[d].za_os;
+		za[t].za_spa = spa;
+		za[t].za_zilog = za[d].za_zilog;
+		za[t].za_instance = t;
+		za[t].za_random = ztest_random(-1ULL);
+		za[t].za_start = za[0].za_start;
+		za[t].za_stop = za[0].za_stop;
+		za[t].za_kill = za[0].za_kill;
+
 		if (t < zopt_datasets) {
 			ztest_replay_t zr;
 			int test_future = FALSE;
@@ -3243,13 +3222,11 @@
 			    ztest_create_cb, NULL);
 			if (error == EEXIST) {
 				test_future = TRUE;
+			} else if (error == ENOSPC) {
+				zs->zs_enospc_count++;
+				(void) rw_unlock(&ztest_shared->zs_name_lock);
+				break;
 			} else if (error != 0) {
-				if (error == ENOSPC) {
-					zs->zs_enospc_count++;
-					(void) rw_unlock(
-					    &ztest_shared->zs_name_lock);
-					break;
-				}
 				fatal(0, "dmu_objset_create(%s) = %d",
 				    name, error);
 			}
@@ -3259,22 +3236,13 @@
 				fatal(0, "dmu_objset_open('%s') = %d",
 				    name, error);
 			(void) rw_unlock(&ztest_shared->zs_name_lock);
-			if (test_future && ztest_shared->zs_txg > 0)
-				ztest_dmu_check_future_leak(za[d].za_os,
-				    ztest_shared->zs_txg);
+			if (test_future)
+				ztest_dmu_check_future_leak(&za[t]);
 			zr.zr_os = za[d].za_os;
 			zil_replay(zr.zr_os, &zr, &zr.zr_assign,
 			    ztest_replay_vector);
 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
 		}
-		za[t].za_pool = spa_strdup(pool);
-		za[t].za_os = za[d].za_os;
-		za[t].za_zilog = za[d].za_zilog;
-		za[t].za_instance = t;
-		za[t].za_random = ztest_random(-1ULL);
-		za[t].za_start = za[0].za_start;
-		za[t].za_stop = za[0].za_stop;
-		za[t].za_kill = za[0].za_kill;
 
 		error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
 		    &za[t].za_thread);
@@ -3282,7 +3250,6 @@
 			fatal(0, "can't create thread %d: error %d",
 			    t, error);
 	}
-	ztest_shared->zs_txg = 0;
 
 	while (--t >= 0) {
 		error = thr_join(za[t].za_thread, NULL, NULL);
@@ -3294,11 +3261,8 @@
 			zil_close(za[t].za_zilog);
 			dmu_objset_close(za[t].za_os);
 		}
-		spa_strfree(za[t].za_pool);
 	}
 
-	umem_free(za, zopt_threads * sizeof (ztest_args_t));
-
 	if (zopt_verbose >= 3)
 		show_pool_stats(spa);
 
@@ -3308,15 +3272,15 @@
 	zs->zs_space = spa_get_space(spa);
 
 	/*
-	 * Did we have out-of-space errors?  If so, destroy a random objset.
+	 * If we had out-of-space errors, destroy a random objset.
 	 */
 	if (zs->zs_enospc_count != 0) {
 		(void) rw_rdlock(&ztest_shared->zs_name_lock);
-		(void) snprintf(name, 100, "%s/%s_%d", pool, pool,
-		    (int)ztest_random(zopt_datasets));
+		d = (int)ztest_random(zopt_datasets);
+		(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
 		if (zopt_verbose >= 3)
 			(void) printf("Destroying %s to free up space\n", name);
-		(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
+		(void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 		(void) rw_unlock(&ztest_shared->zs_name_lock);
 	}
@@ -3330,8 +3294,6 @@
 	for (t = 1; t < 50; t++)
 		dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
 
-	spa_close(spa, FTAG);
-
 	/* Shutdown the suspend monitor thread */
 	zio_io_fail_shift = 0;
 	ztest_exiting = B_TRUE;
@@ -3342,6 +3304,10 @@
 	if (error)
 		fatal(0, "thr_join(%d) = %d", tid, error);
 
+	umem_free(za, zopt_threads * sizeof (ztest_args_t));
+
+	spa_close(spa, FTAG);
+
 	kernel_fini();
 }
 
--- a/usr/src/lib/libzpool/common/llib-lzpool	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/lib/libzpool/common/llib-lzpool	Tue Nov 27 22:58:05 2007 -0800
@@ -48,6 +48,6 @@
 #include <sys/bplist.h>
 #include <sys/zfs_znode.h>
 
-extern uint64_t zio_gang_bang;
+extern uint64_t metaslab_gang_bang;
 extern uint16_t zio_zil_fail_shift;
 extern uint16_t zio_io_fail_shift;
--- a/usr/src/uts/common/Makefile.files	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/Makefile.files	Tue Nov 27 22:58:05 2007 -0800
@@ -1077,7 +1077,6 @@
 	lzjb.o			\
 	metaslab.o		\
 	refcount.o		\
-	rprwlock.o		\
 	sha256.o		\
 	spa.o			\
 	spa_config.o		\
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Tue Nov 27 22:58:05 2007 -0800
@@ -35,6 +35,7 @@
 #include <sys/zio.h>
 
 uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
  * ==========================================================================
@@ -728,6 +729,12 @@
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
+	 * For testing, make some blocks above a certain size be gang blocks.
+	 */
+	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
+		return (ENOSPC);
+
+	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mc_rotor or mc_allocated because
 	 * nothing actually breaks if we miss a few updates -- we just won't
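
metaslab_gang_bang makes gang blocks easy to exercise: any allocation of psize >= metaslab_gang_bang is failed with ENOSPC on roughly one clock tick in four ((lbolt & 3) == 0), and the ZIO layer responds to that ENOSPC by ganging the block. ztest lowers the threshold to 32K by default via its -g option (see the ztest.c hunk above); the in-kernel default of SPA_MAXBLOCKSIZE + 1 leaves it off. The gate, restated as a standalone predicate (sketch only):

static boolean_t
metaslab_force_gang(uint64_t psize)
{
	return (psize >= metaslab_gang_bang && (lbolt & 3) == 0);
}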
--- a/usr/src/uts/common/fs/zfs/rprwlock.c	Tue Nov 27 17:41:22 2007 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-#include <sys/rprwlock.h>
-
-void
-rprw_init(rprwlock_t *rwl)
-{
-	mutex_init(&rwl->rw_lock, NULL, MUTEX_DEFAULT, NULL);
-	rwl->rw_writer = NULL;
-	cv_init(&rwl->rw_cv, NULL, CV_DEFAULT, NULL);
-	refcount_create(&rwl->rw_count);
-}
-
-void
-rprw_destroy(rprwlock_t *rwl)
-{
-	mutex_destroy(&rwl->rw_lock);
-	ASSERT(rwl->rw_writer == NULL);
-	cv_destroy(&rwl->rw_cv);
-	refcount_destroy(&rwl->rw_count);
-}
-
-void
-rprw_enter_read(rprwlock_t *rwl, void *tag)
-{
-	mutex_enter(&rwl->rw_lock);
-
-	if (rwl->rw_writer != curthread) {
-		while (rwl->rw_writer != NULL)
-			cv_wait(&rwl->rw_cv, &rwl->rw_lock);
-	}
-
-	(void) refcount_add(&rwl->rw_count, tag);
-
-	mutex_exit(&rwl->rw_lock);
-}
-
-void
-rprw_enter_write(rprwlock_t *rwl, void *tag)
-{
-	mutex_enter(&rwl->rw_lock);
-
-	if (rwl->rw_writer != curthread) {
-		while (!refcount_is_zero(&rwl->rw_count))
-			cv_wait(&rwl->rw_cv, &rwl->rw_lock);
-		rwl->rw_writer = curthread;
-	}
-
-	(void) refcount_add(&rwl->rw_count, tag);
-
-	mutex_exit(&rwl->rw_lock);
-}
-
-void
-rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag)
-{
-	if (rw == RW_READER)
-		rprw_enter_read(rwl, tag);
-	else
-		rprw_enter_write(rwl, tag);
-}
-
-void
-rprw_exit(rprwlock_t *rwl, void *tag)
-{
-	mutex_enter(&rwl->rw_lock);
-
-	ASSERT(!refcount_is_zero(&rwl->rw_count));
-	ASSERT(rwl->rw_writer == NULL || curthread == rwl->rw_writer);
-	if (refcount_remove(&rwl->rw_count, tag) == 0) {
-		cv_broadcast(&rwl->rw_cv);
-		rwl->rw_writer = NULL;  /* OK in either case */
-	}
-
-	mutex_exit(&rwl->rw_lock);
-}
-
-boolean_t
-rprw_held(rprwlock_t *rwl, krw_t rw)
-{
-	boolean_t held;
-
-	mutex_enter(&rwl->rw_lock);
-	if (rw == RW_WRITER)
-		held = (rwl->rw_writer == curthread);
-	else
-		held = !rwl->rw_writer && !refcount_is_zero(&rwl->rw_count);
-	mutex_exit(&rwl->rw_lock);
-
-	return (held);
-}
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Tue Nov 27 22:58:05 2007 -0800
@@ -144,16 +144,9 @@
  *				zero.  Must be called with spa_namespace_lock
  *				held.
  *
- * The spa_config_lock is manipulated using the following functions:
- *
- *	spa_config_enter()	Acquire the config lock as RW_READER or
- *				RW_WRITER.  At least one reference on the spa_t
- *				must exist.
- *
- *	spa_config_exit()	Release the config lock.
- *
- *	spa_config_held()	Returns true if the config lock is currently
- *				held in the given state.
+ * The spa_config_lock is a form of rwlock.  It must be held as RW_READER
+ * to perform I/O to the pool, and as RW_WRITER to change the vdev config.
+ * The spa_config_lock is manipulated with spa_config_{enter,exit,held}().
  *
  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
  *
@@ -202,6 +195,80 @@
 
 /*
  * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+static void
+spa_config_lock_init(spa_config_lock_t *scl)
+{
+	mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+	scl->scl_writer = NULL;
+	cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+	refcount_create(&scl->scl_count);
+}
+
+static void
+spa_config_lock_destroy(spa_config_lock_t *scl)
+{
+	mutex_destroy(&scl->scl_lock);
+	ASSERT(scl->scl_writer == NULL);
+	cv_destroy(&scl->scl_cv);
+	refcount_destroy(&scl->scl_count);
+}
+
+void
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
+{
+	spa_config_lock_t *scl = &spa->spa_config_lock;
+
+	mutex_enter(&scl->scl_lock);
+
+	if (rw == RW_READER) {
+		while (scl->scl_writer != NULL && scl->scl_writer != curthread)
+			cv_wait(&scl->scl_cv, &scl->scl_lock);
+	} else {
+		while (!refcount_is_zero(&scl->scl_count) &&
+		    scl->scl_writer != curthread)
+			cv_wait(&scl->scl_cv, &scl->scl_lock);
+		scl->scl_writer = curthread;
+	}
+
+	(void) refcount_add(&scl->scl_count, tag);
+
+	mutex_exit(&scl->scl_lock);
+}
+
+void
+spa_config_exit(spa_t *spa, void *tag)
+{
+	spa_config_lock_t *scl = &spa->spa_config_lock;
+
+	mutex_enter(&scl->scl_lock);
+
+	ASSERT(!refcount_is_zero(&scl->scl_count));
+
+	if (refcount_remove(&scl->scl_count, tag) == 0) {
+		cv_broadcast(&scl->scl_cv);
+		ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread);
+		scl->scl_writer = NULL;  /* OK in either case */
+	}
+
+	mutex_exit(&scl->scl_lock);
+}
+
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+	spa_config_lock_t *scl = &spa->spa_config_lock;
+
+	if (rw == RW_READER)
+		return (!refcount_is_zero(&scl->scl_count));
+	else
+		return (scl->scl_writer == curthread);
+}
+
+/*
+ * ==========================================================================
  * SPA namespace functions
  * ==========================================================================
  */
@@ -275,7 +342,7 @@
 	spa->spa_final_txg = UINT64_MAX;
 
 	refcount_create(&spa->spa_refcount);
-	rprw_init(&spa->spa_config_lock);
+	spa_config_lock_init(&spa->spa_config_lock);
 
 	avl_add(&spa_namespace_avl, spa);
 
@@ -324,7 +391,7 @@
 
 	refcount_destroy(&spa->spa_refcount);
 
-	rprw_destroy(&spa->spa_config_lock);
+	spa_config_lock_destroy(&spa->spa_config_lock);
 
 	rw_destroy(&spa->spa_traverse_lock);
 
@@ -639,29 +706,6 @@
 
 /*
  * ==========================================================================
- * SPA config locking
- * ==========================================================================
- */
-void
-spa_config_enter(spa_t *spa, krw_t rw, void *tag)
-{
-	rprw_enter(&spa->spa_config_lock, rw, tag);
-}
-
-void
-spa_config_exit(spa_t *spa, void *tag)
-{
-	rprw_exit(&spa->spa_config_lock, tag);
-}
-
-boolean_t
-spa_config_held(spa_t *spa, krw_t rw)
-{
-	return (rprw_held(&spa->spa_config_lock, rw));
-}
-
-/*
- * ==========================================================================
  * SPA vdev locking
  * ==========================================================================
  */
@@ -1003,7 +1047,7 @@
 	 * config lock, both of which are required to do a rename.
 	 */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
-	    spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+	    spa_config_held(spa, RW_READER));
 
 	return (spa->spa_name);
 }
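
One observable change in the inlined lock versus rprw_held(): spa_config_held(spa, RW_READER) now returns true whenever the lock is held at all (the old version also required that no writer own it), which is why the spa_name() assertion above collapses to a single RW_READER check. The enter/exit semantics are unchanged and remain re-entrant: a thread holding the lock as writer may re-acquire it in either mode. A usage sketch (not from this changeset):

	spa_config_enter(spa, RW_WRITER, FTAG);	/* freeze the vdev config */
	spa_config_enter(spa, RW_READER, FTAG);	/* same thread: no block */
	ASSERT(spa_config_held(spa, RW_WRITER));
	spa_config_exit(spa, FTAG);		/* drops one hold */
	spa_config_exit(spa, FTAG);		/* last hold wakes waiters */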
--- a/usr/src/uts/common/fs/zfs/sys/rprwlock.h	Tue Nov 27 17:41:22 2007 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef	_SYS_RPRWLOCK_H
-#define	_SYS_RPRWLOCK_H
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-#include <sys/inttypes.h>
-#include <sys/list.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-typedef struct rprwlock {
-	kmutex_t	rw_lock;
-	kthread_t	*rw_writer;
-	kcondvar_t	rw_cv;
-	refcount_t	rw_count;
-} rprwlock_t;
-
-void rprw_init(rprwlock_t *rwl);
-void rprw_destroy(rprwlock_t *rwl);
-void rprw_enter_read(rprwlock_t *rwl, void *tag);
-void rprw_enter_write(rprwlock_t *rwl, void *tag);
-void rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag);
-void rprw_exit(rprwlock_t *rwl, void *tag);
-boolean_t rprw_held(rprwlock_t *rwl, krw_t rw);
-#define	RPRW_READ_HELD(x)	rprw_held(x, RW_READER)
-#define	RPRW_WRITE_HELD(x)	rprw_held(x, RW_WRITER)
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif /* _SYS_RPRWLOCK_H */
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Nov 27 22:58:05 2007 -0800
@@ -37,7 +37,6 @@
 #include <sys/zfs_context.h>
 #include <sys/avl.h>
 #include <sys/refcount.h>
-#include <sys/rprwlock.h>
 #include <sys/bplist.h>
 
 #ifdef	__cplusplus
@@ -68,6 +67,14 @@
 	uint_t		sav_npending;		/* # pending devices */
 };
 
+typedef struct spa_config_lock {
+	kmutex_t	scl_lock;
+	kthread_t	*scl_writer;
+	uint16_t	scl_write_wanted;
+	kcondvar_t	scl_cv;
+	refcount_t	scl_count;
+} spa_config_lock_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
@@ -157,7 +164,7 @@
 	 * In order for the MDB module to function correctly, the other
 	 * fields must remain in the same location.
 	 */
-	rprwlock_t	spa_config_lock;	/* configuration changes */
+	spa_config_lock_t spa_config_lock;	/* configuration changes */
 	refcount_t	spa_refcount;		/* number of opens */
 };
 
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Tue Nov 27 22:58:05 2007 -0800
@@ -83,9 +83,6 @@
 
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
-extern void vdev_io_start(zio_t *zio);
-extern void vdev_io_done(zio_t *zio);
-
 extern int vdev_fault(spa_t *spa, uint64_t guid);
 extern int vdev_degrade(spa_t *spa, uint64_t guid);
 extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Tue Nov 27 22:58:05 2007 -0800
@@ -62,8 +62,8 @@
 typedef void	vdev_close_func_t(vdev_t *vd);
 typedef int	vdev_probe_func_t(vdev_t *vd);
 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef void	vdev_io_start_func_t(zio_t *zio);
-typedef void	vdev_io_done_func_t(zio_t *zio);
+typedef int	vdev_io_start_func_t(zio_t *zio);
+typedef int	vdev_io_done_func_t(zio_t *zio);
 typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
 
 typedef struct vdev_ops {
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Nov 27 22:58:05 2007 -0800
@@ -153,6 +153,7 @@
 	(ZIO_FLAG_CANFAIL |		\
 	ZIO_FLAG_FAILFAST |		\
 	ZIO_FLAG_CONFIG_HELD |		\
+	ZIO_FLAG_DONT_CACHE |		\
 	ZIO_FLAG_DONT_RETRY |		\
 	ZIO_FLAG_IO_REPAIR |		\
 	ZIO_FLAG_SPECULATIVE |		\
@@ -164,9 +165,11 @@
 
 #define	ZIO_FLAG_VDEV_INHERIT		\
 	(ZIO_FLAG_GANG_INHERIT |	\
-	ZIO_FLAG_DONT_CACHE |		\
 	ZIO_FLAG_PHYSICAL)
 
+#define	ZIO_PIPELINE_CONTINUE		0x100
+#define	ZIO_PIPELINE_STOP		0x101
+
 /*
  * We'll take the unused errno 'EBADE' (from the Convergent graveyard)
  * to indicate checksum errors.
@@ -262,7 +265,6 @@
 	uint32_t	io_numerrors;
 	uint32_t	io_pipeline;
 	uint32_t	io_orig_pipeline;
-	uint32_t	io_async_stages;
 	uint64_t	io_children_notready;
 	uint64_t	io_children_notdone;
 	void		*io_waiter;
@@ -319,21 +321,18 @@
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
+extern void zio_execute(zio_t *zio);
+extern void zio_interrupt(zio_t *zio);
+
+extern int zio_wait_for_children_ready(zio_t *zio);
+extern int zio_wait_for_children_done(zio_t *zio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-extern void zio_next_stage(zio_t *zio);
-extern void zio_next_stage_async(zio_t *zio);
 extern void zio_resubmit_stage_async(void *);
-extern void zio_wait_children_done(zio_t *zio);
 
 /*
  * Delegate I/O to a child vdev.
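
ZIO_PIPELINE_CONTINUE and ZIO_PIPELINE_STOP define the new contract for every pipeline stage: return CONTINUE and zio_execute() advances to the next stage in the current thread; return STOP and the zio now belongs to someone else (a taskq worker, an I/O completion interrupt, or a parent waiting on its children) who will re-enter zio_execute() later. A minimal sketch of a conforming stage; the stage and its helpers are hypothetical, not part of this changeset:

	static int
	zio_example_stage(zio_t *zio)
	{
		if (example_handoff_needed(zio)) {	/* hypothetical */
			example_hand_off(zio);		/* owner re-enters later */
			return (ZIO_PIPELINE_STOP);
		}

		return (ZIO_PIPELINE_CONTINUE);		/* run next stage now */
	}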
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Tue Nov 27 22:58:05 2007 -0800
@@ -38,16 +38,15 @@
 /*
  * I/O Groups: pipeline stage definitions.
  */
-
 typedef enum zio_stage {
 	ZIO_STAGE_OPEN = 0,			/* RWFCI */
-	ZIO_STAGE_WAIT_CHILDREN_READY,		/* RWFCI */
+	ZIO_STAGE_WAIT_FOR_CHILDREN_READY,	/* RWFCI */
 
+	ZIO_STAGE_READ_INIT,			/* R---- */
+	ZIO_STAGE_ISSUE_ASYNC,			/* -W--- */
 	ZIO_STAGE_WRITE_COMPRESS,		/* -W--- */
 	ZIO_STAGE_CHECKSUM_GENERATE,		/* -W--- */
 
-	ZIO_STAGE_GANG_PIPELINE,		/* -WFC- */
-
 	ZIO_STAGE_GET_GANG_HEADER,		/* -WFC- */
 	ZIO_STAGE_REWRITE_GANG_MEMBERS,		/* -W--- */
 	ZIO_STAGE_FREE_GANG_MEMBERS,		/* --F-- */
@@ -61,13 +60,11 @@
 
 	ZIO_STAGE_READY,			/* RWFCI */
 
-	ZIO_STAGE_READ_INIT,			/* R---- */
-
 	ZIO_STAGE_VDEV_IO_START,		/* RW--I */
 	ZIO_STAGE_VDEV_IO_DONE,			/* RW--I */
 	ZIO_STAGE_VDEV_IO_ASSESS,		/* RW--I */
 
-	ZIO_STAGE_WAIT_CHILDREN_DONE,		/* RWFCI */
+	ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,	/* RWFCI */
 
 	ZIO_STAGE_CHECKSUM_VERIFY,		/* R---- */
 	ZIO_STAGE_READ_GANG_MEMBERS,		/* R---- */
@@ -77,30 +74,22 @@
 	ZIO_STAGE_DONE				/* RWFCI */
 } zio_stage_t;
 
-/*
- * The stages for which there's some performance value in going async.
- * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
- */
-#define	ZIO_ASYNC_PIPELINE_STAGES				\
-	((1U << ZIO_STAGE_CHECKSUM_GENERATE) |			\
-	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
-	(1U << ZIO_STAGE_CHECKSUM_VERIFY) |			\
-	(1U << ZIO_STAGE_READ_DECOMPRESS))
+#define	ZIO_INTERLOCK_STAGES					\
+	((1U << ZIO_STAGE_WAIT_FOR_CHILDREN_READY) |		\
+	(1U << ZIO_STAGE_READY) |				\
+	(1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) |		\
+	(1U << ZIO_STAGE_ASSESS) |				\
+	(1U << ZIO_STAGE_DONE))
 
-#define	ZIO_VDEV_IO_PIPELINE					\
+#define	ZIO_VDEV_IO_STAGES					\
 	((1U << ZIO_STAGE_VDEV_IO_START) |			\
 	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
 	(1U << ZIO_STAGE_VDEV_IO_ASSESS))
 
 #define	ZIO_READ_PHYS_PIPELINE					\
-	((1U << ZIO_STAGE_OPEN) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
-	(1U << ZIO_STAGE_READY) |				\
-	ZIO_VDEV_IO_PIPELINE |					\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
-	(1U << ZIO_STAGE_CHECKSUM_VERIFY) |			\
-	(1U << ZIO_STAGE_ASSESS) |				\
-	(1U << ZIO_STAGE_DONE))
+	(ZIO_INTERLOCK_STAGES |					\
+	ZIO_VDEV_IO_STAGES |					\
+	(1U << ZIO_STAGE_CHECKSUM_VERIFY))
 
 #define	ZIO_READ_GANG_PIPELINE					\
 	ZIO_READ_PHYS_PIPELINE
@@ -109,97 +98,66 @@
 	(1U << ZIO_STAGE_READ_INIT) |				\
 	ZIO_READ_PHYS_PIPELINE
 
+#define	ZIO_WRITE_COMMON_STAGES					\
+	(ZIO_INTERLOCK_STAGES |					\
+	ZIO_VDEV_IO_STAGES |					\
+	(1U << ZIO_STAGE_ISSUE_ASYNC) |				\
+	(1U << ZIO_STAGE_CHECKSUM_GENERATE))
+
 #define	ZIO_WRITE_PHYS_PIPELINE					\
-	((1U << ZIO_STAGE_OPEN) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
-	(1U << ZIO_STAGE_CHECKSUM_GENERATE) |			\
-	(1U << ZIO_STAGE_READY) |				\
-	ZIO_VDEV_IO_PIPELINE |					\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
-	(1U << ZIO_STAGE_ASSESS) |				\
-	(1U << ZIO_STAGE_DONE))
-
-#define	ZIO_WRITE_COMMON_PIPELINE				\
-	ZIO_WRITE_PHYS_PIPELINE
+	ZIO_WRITE_COMMON_STAGES
 
 #define	ZIO_WRITE_PIPELINE					\
-	((1U << ZIO_STAGE_WRITE_COMPRESS) |			\
-	ZIO_WRITE_COMMON_PIPELINE)
+	(ZIO_WRITE_COMMON_STAGES |				\
+	(1U << ZIO_STAGE_WRITE_COMPRESS))
 
-#define	ZIO_GANG_STAGES						\
+#define	ZIO_GANG_REWRITE_STAGES					\
 	((1U << ZIO_STAGE_GET_GANG_HEADER) |			\
 	(1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) |		\
-	(1U << ZIO_STAGE_FREE_GANG_MEMBERS) |			\
-	(1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) |			\
-	(1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) |		\
-	(1U << ZIO_STAGE_READ_GANG_MEMBERS))
-
-#define	ZIO_REWRITE_PIPELINE					\
-	((1U << ZIO_STAGE_GANG_PIPELINE) |			\
-	(1U << ZIO_STAGE_GET_GANG_HEADER) |			\
-	(1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) |		\
-	(1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) |		\
-	ZIO_WRITE_COMMON_PIPELINE)
-
-#define	ZIO_WRITE_ALLOCATE_PIPELINE				\
-	((1U << ZIO_STAGE_DVA_ALLOCATE) |			\
-	ZIO_WRITE_COMMON_PIPELINE)
+	(1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE))
 
 #define	ZIO_GANG_FREE_STAGES					\
 	((1U << ZIO_STAGE_GET_GANG_HEADER) |			\
 	(1U << ZIO_STAGE_FREE_GANG_MEMBERS))
 
-#define	ZIO_FREE_PIPELINE					\
-	((1U << ZIO_STAGE_OPEN) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
-	(1U << ZIO_STAGE_GANG_PIPELINE) |			\
-	(1U << ZIO_STAGE_GET_GANG_HEADER) |			\
-	(1U << ZIO_STAGE_FREE_GANG_MEMBERS) |			\
-	(1U << ZIO_STAGE_DVA_FREE) |				\
-	(1U << ZIO_STAGE_READY) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
-	(1U << ZIO_STAGE_ASSESS) |				\
-	(1U << ZIO_STAGE_DONE))
+#define	ZIO_GANG_CLAIM_STAGES					\
+	((1U << ZIO_STAGE_GET_GANG_HEADER) |			\
+	(1U << ZIO_STAGE_CLAIM_GANG_MEMBERS))
+
+#define	ZIO_REWRITE_PIPELINE(bp)				\
+	(ZIO_WRITE_COMMON_STAGES |				\
+	(BP_IS_GANG(bp) ? ZIO_GANG_REWRITE_STAGES : 0))
 
-#define	ZIO_CLAIM_PIPELINE					\
-	((1U << ZIO_STAGE_OPEN) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
-	(1U << ZIO_STAGE_GANG_PIPELINE) |			\
-	(1U << ZIO_STAGE_GET_GANG_HEADER) |			\
-	(1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) |			\
+#define	ZIO_WRITE_ALLOCATE_PIPELINE				\
+	(ZIO_WRITE_COMMON_STAGES |				\
+	(1U << ZIO_STAGE_DVA_ALLOCATE))
+
+#define	ZIO_FREE_PIPELINE(bp)					\
+	(ZIO_INTERLOCK_STAGES |					\
+	(1U << ZIO_STAGE_DVA_FREE) |				\
+	(BP_IS_GANG(bp) ? ZIO_GANG_FREE_STAGES : 0))
+
+#define	ZIO_CLAIM_PIPELINE(bp)					\
+	(ZIO_INTERLOCK_STAGES |					\
 	(1U << ZIO_STAGE_DVA_CLAIM) |				\
-	(1U << ZIO_STAGE_READY) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
-	(1U << ZIO_STAGE_ASSESS) |				\
-	(1U << ZIO_STAGE_DONE))
+	(BP_IS_GANG(bp) ? ZIO_GANG_CLAIM_STAGES : 0))
 
 #define	ZIO_IOCTL_PIPELINE					\
-	((1U << ZIO_STAGE_OPEN) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
-	(1U << ZIO_STAGE_READY) |				\
-	ZIO_VDEV_IO_PIPELINE |					\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
-	(1U << ZIO_STAGE_ASSESS) |				\
-	(1U << ZIO_STAGE_DONE))
+	(ZIO_INTERLOCK_STAGES |					\
+	ZIO_VDEV_IO_STAGES)
+
 
 #define	ZIO_WAIT_FOR_CHILDREN_PIPELINE				\
-	((1U << ZIO_STAGE_WAIT_CHILDREN_READY) |		\
-	(1U << ZIO_STAGE_READY) |				\
-	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	ZIO_INTERLOCK_STAGES
+
+#define	ZIO_VDEV_CHILD_PIPELINE					\
+	(ZIO_VDEV_IO_STAGES |					\
 	(1U << ZIO_STAGE_ASSESS) |				\
+	(1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) |		\
 	(1U << ZIO_STAGE_DONE))
 
-#define	ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE			\
-	((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
-	(1U << ZIO_STAGE_ASSESS) |				\
-	(1U << ZIO_STAGE_DONE))
-
-#define	ZIO_VDEV_CHILD_PIPELINE					\
-	(ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE |			\
-	ZIO_VDEV_IO_PIPELINE)
-
 #define	ZIO_ERROR_PIPELINE_MASK					\
-	ZIO_WAIT_FOR_CHILDREN_PIPELINE
+	ZIO_INTERLOCK_STAGES
 
 typedef struct zio_transform zio_transform_t;
 struct zio_transform {
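
Note that the rewrite, free, and claim pipelines are now macros over the block pointer: gang-specific stages are ORed in when the pipeline mask is built, instead of being stripped out at run time by the now-deleted ZIO_STAGE_GANG_PIPELINE stage. For example, for a non-gang block pointer the free pipeline reduces to the interlock stages plus the DVA free stage:

	/* Expansion check for a non-gang bp (follows from the macros above). */
	ASSERT(!BP_IS_GANG(bp));
	ASSERT(ZIO_FREE_PIPELINE(bp) ==
	    (ZIO_INTERLOCK_STAGES | (1U << ZIO_STAGE_DVA_FREE)));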
--- a/usr/src/uts/common/fs/zfs/vdev.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Tue Nov 27 22:58:05 2007 -0800
@@ -136,6 +136,9 @@
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
+	ASSERT(spa_config_held(spa, RW_READER) ||
+	    curthread == spa->spa_scrub_thread);
+
 	if (vdev < rvd->vdev_children)
 		return (rvd->vdev_child[vdev]);
 
@@ -1459,18 +1462,6 @@
 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
 }
 
-void
-vdev_io_start(zio_t *zio)
-{
-	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
-}
-
-void
-vdev_io_done(zio_t *zio)
-{
-	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
-}
-
 const char *
 vdev_description(vdev_t *vd)
 {
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c	Tue Nov 27 22:58:05 2007 -0800
@@ -231,7 +231,7 @@
 		zio->io_delegate_list = dio->io_delegate_next;
 		dio->io_delegate_next = NULL;
 		dio->io_error = zio->io_error;
-		zio_next_stage(dio);
+		zio_execute(dio);
 	}
 }
 
@@ -286,15 +286,10 @@
 		zio_vdev_io_bypass(zio);
 
 		mutex_exit(&vc->vc_lock);
-		zio_next_stage(zio);
+		zio_execute(zio);
 		return (0);
 	}
 
-	if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
-		mutex_exit(&vc->vc_lock);
-		return (EINVAL);
-	}
-
 	ve = vdev_cache_allocate(zio);
 
 	if (ve == NULL) {
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c	Tue Nov 27 22:58:05 2007 -0800
@@ -386,7 +386,7 @@
 
 	kmem_free(vdb, sizeof (vdev_disk_buf_t));
 
-	zio_next_stage_async(zio);
+	zio_interrupt(zio);
 }
 
 static void
@@ -396,10 +396,10 @@
 
 	zio->io_error = error;
 
-	zio_next_stage_async(zio);
+	zio_interrupt(zio);
 }
 
-static void
+static int
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -414,8 +414,7 @@
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = ENXIO;
-			zio_next_stage_async(zio);
-			return;
+			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		switch (zio->io_cmd) {
@@ -444,8 +443,10 @@
 				 * and will call vdev_disk_ioctl_done()
 				 * upon completion.
 				 */
-				return;
-			} else if (error == ENOTSUP || error == ENOTTY) {
+				return (ZIO_PIPELINE_STOP);
+			}
+
+			if (error == ENOTSUP || error == ENOTTY) {
 				/*
 				 * If we get ENOTSUP or ENOTTY, we know that
 				 * no future attempts will ever succeed.
@@ -463,15 +464,26 @@
 			zio->io_error = ENOTSUP;
 		}
 
-		zio_next_stage_async(zio);
-		return;
+		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
-		return;
+		return (ZIO_PIPELINE_STOP);
 
 	if ((zio = vdev_queue_io(zio)) == NULL)
-		return;
+		return (ZIO_PIPELINE_STOP);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+	else
+		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
+
+	if (error) {
+		zio->io_error = error;
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
+	}
 
 	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 	flags |= B_BUSY | B_NOCACHE;
@@ -491,26 +503,14 @@
 	bp->b_bufsize = zio->io_size;
 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
 
-	/* XXPOLICY */
-	if (zio->io_type == ZIO_TYPE_WRITE)
-		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
-	else
-		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
-	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
-	if (error) {
-		zio->io_error = error;
-		bioerror(bp, error);
-		bp->b_resid = bp->b_bcount;
-		bp->b_iodone(bp);
-		return;
-	}
-
 	error = ldi_strategy(dvd->vd_lh, bp);
 	/* ldi_strategy() will return non-zero only on programming errors */
 	ASSERT(error == 0);
+
+	return (ZIO_PIPELINE_STOP);
 }
 
-static void
+static int
 vdev_disk_io_done(zio_t *zio)
 {
 	vdev_queue_io_done(zio);
@@ -544,7 +544,7 @@
 		}
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 vdev_ops_t vdev_disk_ops = {
--- a/usr/src/uts/common/fs/zfs/vdev_file.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c	Tue Nov 27 22:58:05 2007 -0800
@@ -215,7 +215,7 @@
 	return (error);
 }
 
-static void
+static int
 vdev_file_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -229,8 +229,7 @@
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = ENXIO;
-			zio_next_stage_async(zio);
-			return;
+			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		switch (zio->io_cmd) {
@@ -244,8 +243,7 @@
 			zio->io_error = ENOTSUP;
 		}
 
-		zio_next_stage_async(zio);
-		return;
+		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/*
@@ -254,11 +252,11 @@
 	 */
 #ifndef _KERNEL
 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
-		return;
+		return (ZIO_PIPELINE_STOP);
 #endif
 
 	if ((zio = vdev_queue_io(zio)) == NULL)
-		return;
+		return (ZIO_PIPELINE_STOP);
 
 	/* XXPOLICY */
 	if (zio->io_type == ZIO_TYPE_WRITE)
@@ -268,8 +266,8 @@
 	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
 	if (error) {
 		zio->io_error = error;
-		zio_next_stage_async(zio);
-		return;
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
@@ -280,26 +278,25 @@
 	if (resid != 0 && zio->io_error == 0)
 		zio->io_error = ENOSPC;
 
-	zio_next_stage_async(zio);
+	zio_interrupt(zio);
+
+	return (ZIO_PIPELINE_STOP);
 }
 
-static void
+static int
 vdev_file_io_done(zio_t *zio)
 {
+	vdev_t *vd = zio->io_vd;
 
 	if (zio_injection_enabled && zio->io_error == 0)
-		zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+		zio->io_error = zio_handle_device_injection(vd, EIO);
 
 	/*
 	 * If an error has been encountered, attempt to probe the device
 	 * to determine whether it's still accessible.
 	 */
-	if (zio->io_error == EIO) {
-		vdev_t *vd = zio->io_vd;
-
-		if (vdev_probe(vd) != 0)
-			vd->vdev_is_failing = B_TRUE;
-	}
+	if (zio->io_error == EIO && vdev_probe(vd) != 0)
+		vd->vdev_is_failing = B_TRUE;
 
 	vdev_queue_io_done(zio);
 
@@ -308,7 +305,7 @@
 		vdev_cache_write(zio);
 #endif
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 vdev_ops_t vdev_file_ops = {
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Tue Nov 27 22:58:05 2007 -0800
@@ -253,7 +253,7 @@
 	return (-1);
 }
 
-static void
+static int
 vdev_mirror_io_start(zio_t *zio)
 {
 	mirror_map_t *mm;
@@ -279,8 +279,7 @@
 				    ZIO_FLAG_CANFAIL,
 				    vdev_mirror_scrub_done, mc));
 			}
-			zio_wait_children_done(zio);
-			return;
+			return (zio_wait_for_children_done(zio));
 		}
 		/*
 		 * For normal reads just pick one child.
@@ -316,10 +315,10 @@
 		c++;
 	}
 
-	zio_wait_children_done(zio);
+	return (zio_wait_for_children_done(zio));
 }
 
-static void
+static int
 vdev_mirror_io_done(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
@@ -362,8 +361,7 @@
 		if (good_copies != 0)
 			zio->io_error = 0;
 		vdev_mirror_map_free(zio);
-		zio_next_stage(zio);
-		return;
+		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -383,8 +381,7 @@
 		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
 		    vdev_mirror_child_done, mc));
-		zio_wait_children_done(zio);
-		return;
+		return (zio_wait_for_children_done(zio));
 	}
 
 	/* XXPOLICY */
@@ -441,12 +438,13 @@
 		}
 
 		zio_nowait(rio);
-		zio_wait_children_done(zio);
-		return;
+
+		return (zio_wait_for_children_done(zio));
 	}
 
 	vdev_mirror_map_free(zio);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static void
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c	Tue Nov 27 22:58:05 2007 -0800
@@ -62,18 +62,18 @@
 }
 
 /* ARGSUSED */
-static void
+static int
 vdev_missing_io_start(zio_t *zio)
 {
 	zio->io_error = ENOTSUP;
-	zio_next_stage_async(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /* ARGSUSED */
-static void
+static int
 vdev_missing_io_done(zio_t *zio)
 {
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /* ARGSUSED */
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Tue Nov 27 22:58:05 2007 -0800
@@ -162,7 +162,7 @@
 		aio->io_delegate_list = dio->io_delegate_next;
 		dio->io_delegate_next = NULL;
 		dio->io_error = aio->io_error;
-		zio_next_stage(dio);
+		zio_execute(dio);
 	}
 	ASSERT3U(offset, ==, aio->io_size);
 
@@ -172,11 +172,8 @@
 #define	IS_ADJACENT(io, nio) \
 	((io)->io_offset + (io)->io_size == (nio)->io_offset)
 
-typedef void zio_issue_func_t(zio_t *);
-
 static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
-	zio_issue_func_t **funcp)
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 {
 	zio_t *fio, *lio, *aio, *dio;
 	avl_tree_t *tree;
@@ -184,8 +181,6 @@
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
-	*funcp = NULL;
-
 	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
 	    avl_numnodes(&vq->vq_deadline_tree) == 0)
 		return (NULL);
@@ -245,7 +240,6 @@
 
 		avl_add(&vq->vq_pending_tree, aio);
 
-		*funcp = zio_nowait;
 		return (aio);
 	}
 
@@ -254,8 +248,6 @@
 
 	avl_add(&vq->vq_pending_tree, fio);
 
-	*funcp = zio_next_stage;
-
 	return (fio);
 }
 
@@ -264,7 +256,6 @@
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
-	zio_issue_func_t *func;
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 
@@ -285,15 +276,19 @@
 
 	vdev_queue_io_add(vq, zio);
 
-	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
+	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
 
 	mutex_exit(&vq->vq_lock);
 
-	if (nio == NULL || func != zio_nowait)
-		return (nio);
+	if (nio == NULL)
+		return (NULL);
 
-	func(nio);
-	return (NULL);
+	if (nio->io_done == vdev_queue_agg_io_done) {
+		zio_nowait(nio);
+		return (NULL);
+	}
+
+	return (nio);
 }
 
 void
@@ -301,7 +296,6 @@
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
-	zio_issue_func_t *func;
 	int i;
 
 	mutex_enter(&vq->vq_lock);
@@ -309,13 +303,16 @@
 	avl_remove(&vq->vq_pending_tree, zio);
 
 	for (i = 0; i < zfs_vdev_ramp_rate; i++) {
-		nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
+		nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
 		if (nio == NULL)
 			break;
 		mutex_exit(&vq->vq_lock);
-		if (func == zio_next_stage)
+		if (nio->io_done == vdev_queue_agg_io_done) {
+			zio_nowait(nio);
+		} else {
 			zio_vdev_io_reissue(nio);
-		func(nio);
+			zio_execute(nio);
+		}
 		mutex_enter(&vq->vq_lock);
 	}
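
With the zio_issue_func_t out-parameter gone, both call sites above distinguish the queue's own aggregate I/Os from ordinary queued zios by their completion callback: vdev_queue_agg_io_done is evidently installed only on the zios vdev_queue_io_to_issue() creates for aggregation. Restating the dispatch rule as it appears in vdev_queue_io_done():

	if (nio->io_done == vdev_queue_agg_io_done) {
		zio_nowait(nio);		/* fresh zio: run full pipeline */
	} else {
		zio_vdev_io_reissue(nio);	/* rewind to VDEV_IO_START */
		zio_execute(nio);		/* resume the original zio */
	}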
 
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Tue Nov 27 22:58:05 2007 -0800
@@ -639,7 +639,7 @@
 	vdev_raidz_map_free(zio->io_private);
 }
 
-static void
+static int
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -672,8 +672,8 @@
 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
 			    vdev_raidz_child_done, rc));
 		}
-		zio_wait_children_done(zio);
-		return;
+
+		return (zio_wait_for_children_done(zio));
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -714,7 +714,7 @@
 		}
 	}
 
-	zio_wait_children_done(zio);
+	return (zio_wait_for_children_done(zio));
 }
 
 /*
@@ -783,7 +783,7 @@
 static uint64_t raidz_corrected_q;
 static uint64_t raidz_corrected_pq;
 
-static void
+static int
 vdev_raidz_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -840,8 +840,8 @@
 			zio->io_error = 0;
 
 		vdev_raidz_map_free(zio);
-		zio_next_stage(zio);
-		return;
+
+		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1022,8 +1022,8 @@
 			    vdev_raidz_child_done, rc));
 		} while (++c < rm->rm_cols);
 		dprintf("rereading\n");
-		zio_wait_children_done(zio);
-		return;
+
+		return (zio_wait_for_children_done(zio));
 	}
 
 	/*
@@ -1205,12 +1205,13 @@
 		}
 
 		zio_nowait(rio);
-		zio_wait_children_done(zio);
-		return;
+
+		return (zio_wait_for_children_done(zio));
 	}
 
 	vdev_raidz_map_free(zio);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static void
--- a/usr/src/uts/common/fs/zfs/zio.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Tue Nov 27 22:58:05 2007 -0800
@@ -61,9 +61,6 @@
 char *zio_type_name[ZIO_TYPES] = {
 	"null", "read", "write", "free", "claim", "ioctl" };
 
-/* At or above this size, force gang blocking - for testing */
-uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
-
 /* Force an allocation failure when non-zero */
 uint16_t zio_zil_fail_shift = 0;
 uint16_t zio_io_fail_shift = 0;
@@ -170,8 +167,6 @@
 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
 			    KMC_NODEBUG);
 
-			dprintf("creating cache for size %5lx align %5lx\n",
-			    size, align);
 		}
 	}
 
@@ -356,9 +351,6 @@
 		zio->io_bp = bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
-		if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
-		    BP_GET_LEVEL(bp) != 0)
-			zio->io_flags |= ZIO_FLAG_METADATA;
 	}
 	zio->io_done = done;
 	zio->io_private = private;
@@ -366,10 +358,7 @@
 	zio->io_priority = priority;
 	zio->io_stage = stage;
 	zio->io_pipeline = pipeline;
-	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
 	zio->io_timestamp = lbolt64;
-	if (pio != NULL)
-		zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 	zio_push_transform(zio, data, size, size);
@@ -395,7 +384,7 @@
 	if (pio == NULL) {
 		if (type != ZIO_TYPE_NULL &&
 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
-			spa_config_enter(zio->io_spa, RW_READER, zio);
+			spa_config_enter(spa, RW_READER, zio);
 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
 		}
 		zio->io_root = zio;
@@ -409,7 +398,7 @@
 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
-			spa_config_enter(zio->io_spa, RW_READER, pio);
+			spa_config_enter(spa, RW_READER, pio);
 		}
 		if (stage < ZIO_STAGE_READY)
 			pio->io_children_notready++;
@@ -524,9 +513,6 @@
 	zio->io_compress = compress;
 	zio->io_ndvas = ncopies;
 
-	if (compress != ZIO_COMPRESS_OFF)
-		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
-
 	if (bp->blk_birth != txg) {
 		/* XXX the bp usually (always?) gets re-zeroed later */
 		BP_ZERO(bp);
@@ -551,7 +537,7 @@
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
-	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
 
 	zio->io_bookmark = *zb;
 	zio->io_checksum = checksum;
@@ -612,7 +598,7 @@
 
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
-	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
 
 	zio->io_bp = &zio->io_bp_copy;
 
@@ -641,7 +627,7 @@
 
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
-	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
 
 	zio->io_bp = &zio->io_bp_copy;
 
@@ -820,7 +806,7 @@
 
 	zio->io_waiter = curthread;
 
-	zio_next_stage_async(zio);
+	zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_stalled != ZIO_STAGE_DONE)
@@ -838,7 +824,23 @@
 void
 zio_nowait(zio_t *zio)
 {
-	zio_next_stage_async(zio);
+	zio_execute(zio);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
+	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
+}
+
+static int
+zio_issue_async(zio_t *zio)
+{
+	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
+	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
+
+	return (ZIO_PIPELINE_STOP);
 }
 
 /*
@@ -846,18 +848,20 @@
  * I/O pipeline interlocks: parent/child dependency scoreboarding
  * ==========================================================================
  */
-static void
+static int
 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
 {
+	int rv = ZIO_PIPELINE_CONTINUE;
+
 	mutex_enter(&zio->io_lock);
-	if (*countp == 0) {
-		ASSERT(zio->io_stalled == 0);
-		mutex_exit(&zio->io_lock);
-		zio_next_stage(zio);
-	} else {
+	ASSERT(zio->io_stalled == 0);
+	if (*countp != 0) {
 		zio->io_stalled = stage;
-		mutex_exit(&zio->io_lock);
+		rv = ZIO_PIPELINE_STOP;
 	}
+	mutex_exit(&zio->io_lock);
+
+	return (rv);
 }
 
 static void
@@ -872,48 +876,54 @@
 	if (--*countp == 0 && pio->io_stalled == stage) {
 		pio->io_stalled = 0;
 		mutex_exit(&pio->io_lock);
-		zio_next_stage_async(pio);
+		zio_execute(pio);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
-static void
-zio_wait_children_ready(zio_t *zio)
+int
+zio_wait_for_children_ready(zio_t *zio)
 {
-	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
-	    &zio->io_children_notready);
+	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
+	    &zio->io_children_notready));
 }
 
-void
-zio_wait_children_done(zio_t *zio)
+int
+zio_wait_for_children_done(zio_t *zio)
 {
-	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
-	    &zio->io_children_notdone);
+	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
+	    &zio->io_children_notdone));
 }
 
-static void
+static int
 zio_read_init(zio_t *zio)
 {
-	if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
-		uint64_t csize = BP_GET_PSIZE(zio->io_bp);
+	blkptr_t *bp = zio->io_bp;
+
+	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+		uint64_t csize = BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(csize);
 
 		zio_push_transform(zio, cbuf, csize, csize);
 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
 	}
 
-	if (BP_IS_GANG(zio->io_bp)) {
+	if (BP_IS_GANG(bp)) {
 		uint64_t gsize = SPA_GANGBLOCKSIZE;
 		void *gbuf = zio_buf_alloc(gsize);
 
 		zio_push_transform(zio, gbuf, gsize, gsize);
 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
 	}
-	zio_next_stage(zio);
+
+	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_ready(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
@@ -922,16 +932,16 @@
 		zio->io_ready(zio);
 
 	if (pio != NULL)
-		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
 		    &pio->io_children_notready);
 
 	if (zio->io_bp)
 		zio->io_bp_copy = *zio->io_bp;
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_vdev_retry_io(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
@@ -967,7 +977,7 @@
 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
 
-		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
+		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
 		mutex_exit(&pio->io_lock);
 	}
 
@@ -977,7 +987,8 @@
 	 */
 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
 	zio->io_error = 0;
-	zio_next_stage_async(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 int
@@ -1029,7 +1040,7 @@
 			zio->io_stage = ZIO_STAGE_READY;
 		}
 
-		(void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
+		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
 		    zio, TQ_SLEEP);
 	}
 	mutex_exit(&spa->spa_zio_lock);
@@ -1049,7 +1060,7 @@
 	return (0);
 }
 
-static void
+static int
 zio_vdev_suspend_io(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -1069,9 +1080,11 @@
 	cv_broadcast(&spa->spa_zio_cv);
 #endif
 	mutex_exit(&spa->spa_zio_lock);
+
+	return (ZIO_PIPELINE_STOP);
 }
 
-static void
+static int
 zio_assess(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -1138,10 +1151,9 @@
 		 * property.
 		 */
 		if (zio_write_retry && zio->io_error != ENOSPC &&
-		    IO_IS_ALLOCATING(zio)) {
-			zio_vdev_retry_io(zio);
-			return;
-		}
+		    IO_IS_ALLOCATING(zio))
+			return (zio_vdev_retry_io(zio));
+
 		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
 
 		/*
@@ -1175,22 +1187,20 @@
 				    "uncorrectable I/O failure and the "
 				    "failure mode property for this pool "
 				    "is set to panic.", spa_name(spa));
-			} else {
-				cmn_err(CE_WARN, "Pool '%s' has encountered "
-				    "an uncorrectable I/O error. Manual "
-				    "intervention is required.",
-				    spa_name(spa));
-				zio_vdev_suspend_io(zio);
 			}
-			return;
+			cmn_err(CE_WARN, "Pool '%s' has encountered "
+			    "an uncorrectable I/O error. "
+			    "Manual intervention is required.", spa_name(spa));
+			return (zio_vdev_suspend_io(zio));
 		}
 	}
 	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
 	ASSERT(zio->io_children_notready == 0);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
@@ -1221,7 +1231,7 @@
 			pio->io_child = next;
 		mutex_exit(&pio->io_lock);
 
-		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
 		    &pio->io_children_notdone);
 	}
 
@@ -1243,6 +1253,8 @@
 		cv_destroy(&zio->io_cv);
 		kmem_cache_free(zio_cache, zio);
 	}
+
+	return (ZIO_PIPELINE_STOP);
 }
 
 /*
@@ -1250,7 +1262,7 @@
  * Compression support
  * ==========================================================================
  */
-static void
+static int
 zio_write_compress(zio_t *zio)
 {
 	int compress = zio->io_compress;
@@ -1300,7 +1312,7 @@
 		ASSERT(csize != 0);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_COMPRESS(bp, compress);
-		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
 	} else {
 		if (bp->blk_birth == zio->io_txg)
 			BP_ZERO(bp);
@@ -1316,10 +1328,10 @@
 		}
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_read_decompress(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
@@ -1338,7 +1350,7 @@
 
 	zio_buf_free(data, bufsize);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
@@ -1347,19 +1359,6 @@
  * ==========================================================================
  */
 static void
-zio_gang_pipeline(zio_t *zio)
-{
-	/*
-	 * By default, the pipeline assumes that we're dealing with a gang
-	 * block.  If we're not, strip out any gang-specific stages.
-	 */
-	if (!BP_IS_GANG(zio->io_bp))
-		zio->io_pipeline &= ~ZIO_GANG_STAGES;
-
-	zio_next_stage(zio);
-}
-
-static void
 zio_gang_byteswap(zio_t *zio)
 {
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
@@ -1368,7 +1367,7 @@
 		byteswap_uint64_array(zio->io_data, zio->io_size);
 }
 
-static void
+static int
 zio_get_gang_header(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
@@ -1384,10 +1383,10 @@
 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
 	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
 
-	zio_wait_children_done(zio);
+	return (zio_wait_for_children_done(zio));
 }
 
-static void
+static int
 zio_read_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1410,16 +1409,17 @@
 		ASSERT(!BP_IS_HOLE(gbp));
 
 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
-		    (char *)zio->io_data + loff, lsize, NULL, NULL,
-		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
-		    &zio->io_bookmark));
+		    (char *)zio->io_data + loff, lsize,
+		    NULL, NULL, zio->io_priority,
+		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
 	}
 
 	zio_buf_free(gbh, gbufsize);
-	zio_wait_children_done(zio);
+
+	return (zio_wait_for_children_done(zio));
 }
 
-static void
+static int
 zio_rewrite_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1446,15 +1446,16 @@
 
 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
-		    NULL, NULL, zio->io_priority, zio->io_flags,
-		    &zio->io_bookmark));
+		    NULL, NULL, zio->io_priority,
+		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
 	}
 
 	zio_push_transform(zio, gbh, gsize, gbufsize);
-	zio_wait_children_ready(zio);
+
+	return (zio_wait_for_children_ready(zio));
 }
 
-static void
+static int
 zio_free_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1476,10 +1477,11 @@
 	}
 
 	zio_buf_free(gbh, gbufsize);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_claim_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1500,7 +1502,8 @@
 	}
 
 	zio_buf_free(gbh, gbufsize);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static void
@@ -1549,8 +1552,10 @@
 
 	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
 	    B_FALSE);
-	if (error)
-		return (error);
+	if (error) {
+		zio->io_error = error;
+		return (ZIO_PIPELINE_CONTINUE);
+	}
 
 	for (d = 0; d < gbh_ndvas; d++)
 		DVA_SET_GANG(&dva[d], 1);
@@ -1560,10 +1565,6 @@
 	gbh = zio_buf_alloc(gsize);
 	bzero(gbh, gsize);
 
-	/* We need to test multi-level gang blocks */
-	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
-		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
-
 	for (loff = 0, i = 0; loff != zio->io_size;
 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
 		blkptr_t *gbp = &gbh->zg_blkptr[i];
@@ -1579,8 +1580,10 @@
 				break;
 			ASSERT3U(error, ==, ENOSPC);
 			/* XXX - free up previous allocations? */
-			if (maxalloc == SPA_MINBLOCKSIZE)
-				return (error);
+			if (maxalloc == SPA_MINBLOCKSIZE) {
+				zio->io_error = error;
+				return (ZIO_PIPELINE_CONTINUE);
+			}
 			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
 		}
 
@@ -1614,14 +1617,14 @@
 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
 
 	zio_push_transform(zio, gbh, gsize, gsize);
+
 	/*
-	 * As much as we'd like this to be zio_wait_children_ready(),
+	 * As much as we'd like this to be 'ready' instead of 'done',
 	 * updating our ASIZE doesn't happen until the io_done callback,
 	 * so we have to wait for that to finish in order for our BP
 	 * to be stable.
 	 */
-	zio_wait_children_done(zio);
-	return (0);
+	return (zio_wait_for_children_done(zio));
 }
 
 /*
@@ -1629,7 +1632,7 @@
  * Allocate and free blocks
  * ==========================================================================
  */
-static void
+static int
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -1642,14 +1645,6 @@
 	ASSERT3U(zio->io_ndvas, >, 0);
 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
 
-	/* For testing, make some blocks above a certain size be gang blocks */
-	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
-		error = zio_write_allocate_gang_members(zio, mc);
-		if (error)
-			zio->io_error = error;
-		return;
-	}
-
 	/*
 	 * For testing purposes, we force I/Os to retry. We don't allow
 	 * retries beyond the first pass since those I/Os are non-allocating
@@ -1668,17 +1663,15 @@
 	if (error == 0) {
 		bp->blk_birth = zio->io_txg;
 	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
-		error = zio_write_allocate_gang_members(zio, mc);
-		if (error == 0)
-			return;
-		zio->io_error = error;
+		return (zio_write_allocate_gang_members(zio, mc));
 	} else {
 		zio->io_error = error;
 	}
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_dva_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
@@ -1687,15 +1680,15 @@
 
 	BP_ZERO(bp);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_dva_claim(zio_t *zio)
 {
 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
@@ -1704,7 +1697,7 @@
  * ==========================================================================
  */
 
-static void
+static int
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -1719,24 +1712,21 @@
 	 * at that time.
 	 */
 	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
-	    zio->io_type == ZIO_TYPE_WRITE) {
-		zio_vdev_suspend_io(zio);
-		return;
-	}
+	    zio->io_type == ZIO_TYPE_WRITE)
+		return (zio_vdev_suspend_io(zio));
 
-	if (vd == NULL) {
-		/* The mirror_ops handle multiple DVAs in a single BP */
-		vdev_mirror_ops.vdev_op_io_start(zio);
-		return;
-	}
+	/*
+	 * The mirror_ops handle multiple DVAs in a single BP.
+	 */
+	if (vd == NULL)
+		return (vdev_mirror_ops.vdev_op_io_start(zio));
 
 	align = 1ULL << tvd->vdev_ashift;
 
 	if (zio->io_retries == 0 && vd == tvd)
 		zio->io_flags |= ZIO_FLAG_FAILFAST;
 
-	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
-	    vd->vdev_children == 0) {
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
 		zio->io_offset += VDEV_LABEL_START_SIZE;
 	}
@@ -1760,19 +1750,16 @@
 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
 
-	vdev_io_start(zio);
-
-	/* zio_next_stage_async() gets called from io completion interrupt */
+	return (vd->vdev_ops->vdev_op_io_start(zio));
 }
 
-static void
+static int
 zio_vdev_io_done(zio_t *zio)
 {
 	if (zio->io_vd == NULL)
-		/* The mirror_ops handle multiple DVAs in a single BP */
-		vdev_mirror_ops.vdev_op_io_done(zio);
-	else
-		vdev_io_done(zio);
+		return (vdev_mirror_ops.vdev_op_io_done(zio));
+
+	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
 }
 
 /* XXPOLICY */
@@ -1795,7 +1782,7 @@
 	return (B_TRUE);
 }
 
-static void
+static int
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -1833,15 +1820,10 @@
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
 
-		dprintf("retry #%d for %s to %s offset %llx\n",
-		    zio->io_retries, zio_type_name[zio->io_type],
-		    vdev_description(vd), zio->io_offset);
-
-		zio_next_stage_async(zio);
-		return;
+		return (ZIO_PIPELINE_CONTINUE);
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 void
@@ -1876,7 +1858,7 @@
  * Generate and verify checksums
  * ==========================================================================
  */
-static void
+static int
 zio_checksum_generate(zio_t *zio)
 {
 	int checksum = zio->io_checksum;
@@ -1889,10 +1871,10 @@
 
 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_gang_checksum_generate(zio_t *zio)
 {
 	zio_cksum_t zc;
@@ -1905,10 +1887,10 @@
 
 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_checksum_verify(zio_t *zio)
 {
 	if (zio->io_bp != NULL) {
@@ -1918,7 +1900,7 @@
 			    zio->io_spa, zio->io_vd, zio, 0, 0);
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
@@ -1949,20 +1931,15 @@
  * Define the pipeline
  * ==========================================================================
  */
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
-{
-	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
+typedef int zio_pipe_stage_t(zio_t *zio);
 
 zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
-	zio_badop,
-	zio_wait_children_ready,
+	NULL,
+	zio_wait_for_children_ready,
+	zio_read_init,
+	zio_issue_async,
 	zio_write_compress,
 	zio_checksum_generate,
-	zio_gang_pipeline,
 	zio_get_gang_header,
 	zio_rewrite_gang_members,
 	zio_free_gang_members,
@@ -1972,116 +1949,63 @@
 	zio_dva_claim,
 	zio_gang_checksum_generate,
 	zio_ready,
-	zio_read_init,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
-	zio_wait_children_done,
+	zio_wait_for_children_done,
 	zio_checksum_verify,
 	zio_read_gang_members,
 	zio_read_decompress,
 	zio_assess,
 	zio_done,
-	zio_badop
+	NULL
 };
 
 /*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread.  In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
  */
 void
-zio_next_stage(zio_t *zio)
+zio_execute(zio_t *zio)
 {
-	uint32_t pipeline = zio->io_pipeline;
-
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
-
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
-			pipeline &= ZIO_ERROR_PIPELINE_MASK;
-	}
-
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
-
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+	while (zio->io_stage < ZIO_STAGE_DONE) {
+		uint32_t pipeline = zio->io_pipeline;
+		int rv;
 
-	/*
-	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
-	 */
-	if (((1U << zio->io_stage) & zio->io_async_stages) &&
-	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
-	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
-		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
-		(void) taskq_dispatch(tq,
-		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
-	} else {
-		zio_pipeline[zio->io_stage](zio);
-	}
-}
-
-void
-zio_next_stage_async(zio_t *zio)
-{
-	taskq_t *tq;
-	uint32_t pipeline = zio->io_pipeline;
-
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
+		ASSERT(!MUTEX_HELD(&zio->io_lock));
 
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+		/*
+		 * If an error occurred outside the vdev stack,
+		 * just execute the interlock stages to clean up.
+		 */
+		if (zio->io_error &&
+		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
-	}
 
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
-
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+		while (((1U << ++zio->io_stage) & pipeline) == 0)
+			continue;
 
-	/*
-	 * For performance, we'll probably want two sets of task queues:
-	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
-	 * part is for read performance: since we have to make a pass over
-	 * the data to checksum it anyway, we want to do this on the same CPU
-	 * that issued the read, because (assuming CPU scheduling affinity)
-	 * that thread is probably still there.  Getting this optimization
-	 * right avoids performance-hostile cache-to-cache transfers.
-	 *
-	 * Note that having two sets of task queues is also necessary for
-	 * correctness: if all of the issue threads get bogged down waiting
-	 * for dependent reads (e.g. metaslab freelist) to complete, then
-	 * there won't be any threads available to service I/O completion
-	 * interrupts.
-	 */
-	if ((1U << zio->io_stage) & zio->io_async_stages) {
-		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
-			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
-		else
-			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
-		(void) taskq_dispatch(tq,
-		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
-	} else {
-		zio_pipeline[zio->io_stage](zio);
+		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+		ASSERT(zio->io_stalled == 0);
+
+		rv = zio_pipeline[zio->io_stage](zio);
+
+		if (rv == ZIO_PIPELINE_STOP)
+			return;
+
+		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
 	}
 }
 
-void
-zio_resubmit_stage_async(void *arg)
-{
-	zio_t *zio = (zio_t *)(uintptr_t)arg;
-
-	zio_next_stage_async(zio);
-}
-
 static boolean_t
 zio_io_should_fail(uint16_t range)
 {
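
The zio_execute() loop above is the heart of this change: the old zio_next_stage()/zio_next_stage_async() advanced the pipeline by having each stage call into the next, so pipeline depth (and gang-block fan-out) translated directly into kernel stack depth; zio_execute() drives the same stage table from a flat loop, making stack usage constant. A self-contained user-land model of the dispatch loop, hypothetical names throughout, compilable with any C compiler:

	#include <stdio.h>

	#define	PIPELINE_CONTINUE	0x100
	#define	PIPELINE_STOP		0x101

	enum { STAGE_OPEN, STAGE_PREPARE, STAGE_ISSUE, STAGE_DONE };

	typedef struct io {
		int	io_stage;	/* index into the stage table */
		int	io_error;
	} io_t;

	/* ARGSUSED */
	static int
	stage_prepare(io_t *io)
	{
		printf("prepare (e.g. compress/checksum)\n");
		return (PIPELINE_CONTINUE);	/* next stage, same thread */
	}

	/* ARGSUSED */
	static int
	stage_issue(io_t *io)
	{
		printf("issue: handed off, pipeline stops\n");
		return (PIPELINE_STOP);		/* completion re-enters execute() */
	}

	static int
	stage_done(io_t *io)
	{
		printf("done, error=%d\n", io->io_error);
		return (PIPELINE_STOP);
	}

	static int (*pipeline[])(io_t *) = {
		NULL,		/* STAGE_OPEN is never re-entered */
		stage_prepare,
		stage_issue,
		stage_done,
	};

	static void
	execute(io_t *io)
	{
		/* Flat loop: constant stack depth regardless of stage count. */
		while (io->io_stage < STAGE_DONE) {
			int rv = pipeline[++io->io_stage](io);

			if (rv == PIPELINE_STOP)
				return;
		}
	}

	int
	main(void)
	{
		io_t io = { STAGE_OPEN, 0 };

		execute(&io);	/* runs prepare, then stops at issue */
		execute(&io);	/* simulated interrupt: finishes with done */
		return (0);
	}

Running it prints the prepare and issue lines from the first execute() call and the done line from the second, mirroring how a real zio stops once it has been issued and is re-entered from the completion interrupt via zio_interrupt().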