# HG changeset patch
# User Jeff Bonwick <Jeff.Bonwick@Sun.COM>
# Date 1257113686 28800
# Node ID e2081f5023069120b7a160a1826f5bf80151828f
# Parent  8aac17999e4d70b0ef52fa77356798598b9a6c44
PSARC 2009/571 ZFS Deduplication Properties
6677093 zfs should have dedup capability

diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/filebench/Makefile.com
--- a/usr/src/cmd/filebench/Makefile.com	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/filebench/Makefile.com	Sun Nov 01 14:14:46 2009 -0800
@@ -51,9 +51,9 @@
 ROOTFBBINDIR = $(ROOT)/usr/benchmarks/filebench/bin
 OBJS = $(SRCS:%.c=%.o) parser_gram.o parser_lex.o
 LINTFLAGS += -erroff=E_FUNC_ARG_UNUSED -erroff=E_NAME_DEF_NOT_USED2 \
-	-erroff=E_NAME_USED_NOT_DEF2
+	-erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_DECL2
 LINTFLAGS64 += -erroff=E_FUNC_ARG_UNUSED -erroff=E_NAME_DEF_NOT_USED2 \
-	-erroff=E_NAME_USED_NOT_DEF2
+	-erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_DECL2
 LINTFILES = $(SRCS:%.c=%.ln)
 CLEANFILES += parser_gram.c parser_gram.h parser_lex.c y.tab.h y.tab.c
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/mdb/common/modules/zfs/zfs.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sun Nov 01 14:14:46 2009 -0800
@@ -35,7 +35,6 @@
 #include <sys/list.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
-#include <sys/zio_compress.h>
 #include <ctype.h>
 
 #ifndef _KERNEL
@@ -48,15 +47,6 @@
 #define	ZFS_OBJ_NAME	"libzpool.so.1"
 #endif
 
-static char *
-local_strdup(const char *s)
-{
-	char *s1 = mdb_alloc(strlen(s) + 1, UM_SLEEP);
-
-	(void) strcpy(s1, s);
-	return (s1);
-}
-
 static int
 getmember(uintptr_t addr, const char *type, mdb_ctf_id_t *idp,
     const char *member, int len, void *buf)
@@ -130,27 +120,6 @@
 	return (GETMEMBID(addr + off, &rc_id, rc_count, *rc));
 }
 
-static int
-read_symbol(char *sym_name, void **bufp)
-{
-	GElf_Sym sym;
-
-	if (mdb_lookup_by_obj(MDB_TGT_OBJ_EVERY, sym_name, &sym)) {
-		mdb_warn("can't find symbol %s", sym_name);
-		return (DCMD_ERR);
-	}
-
-	*bufp = mdb_alloc(sym.st_size, UM_SLEEP);
-
-	if (mdb_vread(*bufp, sym.st_size, sym.st_value) == -1) {
-		mdb_warn("can't read data for symbol %s", sym_name);
-		mdb_free(*bufp, sym.st_size);
-		return (DCMD_ERR);
-	}
-
-	return (DCMD_OK);
-}
-
 static int verbose;
 
 static int
@@ -305,30 +274,6 @@
 
 /* ARGSUSED */
 static int
-zio_pipeline(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	mdb_ctf_id_t pipe_enum;
-	int i;
-	char stage[1024];
-
-	if (mdb_ctf_lookup_by_name("enum zio_stage", &pipe_enum) == -1) {
-		mdb_warn("Could not find enum zio_stage");
-		return (DCMD_ERR);
-	}
-
-	for (i = 0; i < 32; i++) {
-		if (addr & (1U << i)) {
-			enum_lookup(stage, sizeof (stage), pipe_enum, i,
-			    "ZIO_STAGE_");
-			mdb_printf("    %s\n", stage);
-		}
-	}
-
-	return (DCMD_OK);
-}
-
-/* ARGSUSED */
-static int
 zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
 	/*
@@ -351,9 +296,8 @@
 		"metaslab_aliquot",
 		"reference_tracking_enable",
 		"reference_history",
-		"zio_taskq_threads",
 		"spa_max_replication_override",
-		"spa_mode",
+		"spa_mode_global",
 		"zfs_flags",
 		"zfs_txg_synctime",
 		"zfs_txg_timeout",
@@ -383,9 +327,8 @@
 		"zio_injection_enabled",
 		"zvol_immediate_write_sz",
 	};
-	int i;
 
-	for (i = 0; i < sizeof (params) / sizeof (params[0]); i++) {
+	for (int i = 0; i < sizeof (params) / sizeof (params[0]); i++) {
 		int sz;
 		uint64_t val64;
 		uint32_t *val32p = (uint32_t *)&val64;
@@ -407,76 +350,33 @@
 static int
 blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
-	blkptr_t bp;
-	dmu_object_type_info_t *doti;
-	zio_compress_info_t *zct;
-	zio_checksum_info_t *zci;
-	int i;
-	char buf[MAXPATHLEN];
+	mdb_ctf_id_t type_enum, checksum_enum, compress_enum;
+	char type[80], checksum[80], compress[80];
+	blkptr_t blk, *bp = &blk;
+	char buf[BP_SPRINTF_LEN];
 
-	if (mdb_vread(&bp, sizeof (blkptr_t), addr) == -1) {
+	if (mdb_vread(&blk, sizeof (blkptr_t), addr) == -1) {
 		mdb_warn("failed to read blkptr_t");
 		return (DCMD_ERR);
 	}
 
-	if (read_symbol("dmu_ot", (void **)&doti) != DCMD_OK)
-		return (DCMD_ERR);
-	for (i = 0; i < DMU_OT_NUMTYPES; i++) {
-		mdb_readstr(buf, sizeof (buf), (uintptr_t)doti[i].ot_name);
-		doti[i].ot_name = local_strdup(buf);
-	}
-
-	if (read_symbol("zio_checksum_table", (void **)&zci) != DCMD_OK)
+	if (mdb_ctf_lookup_by_name("enum dmu_object_type", &type_enum) == -1 ||
+	    mdb_ctf_lookup_by_name("enum zio_checksum", &checksum_enum) == -1 ||
+	    mdb_ctf_lookup_by_name("enum zio_compress", &compress_enum) == -1) {
+		mdb_warn("Could not find blkptr enumerated types");
 		return (DCMD_ERR);
-	for (i = 0; i < ZIO_CHECKSUM_FUNCTIONS; i++) {
-		mdb_readstr(buf, sizeof (buf), (uintptr_t)zci[i].ci_name);
-		zci[i].ci_name = local_strdup(buf);
-	}
-
-	if (read_symbol("zio_compress_table", (void **)&zct) != DCMD_OK)
-		return (DCMD_ERR);
-	for (i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
-		mdb_readstr(buf, sizeof (buf), (uintptr_t)zct[i].ci_name);
-		zct[i].ci_name = local_strdup(buf);
 	}
 
-	/*
-	 * Super-ick warning:  This code is also duplicated in
-	 * cmd/zdb.c .   Yeah, I hate code replication, too.
-	 */
-	for (i = 0; i < BP_GET_NDVAS(&bp); i++) {
-		dva_t *dva = &bp.blk_dva[i];
+	enum_lookup(type, sizeof (type), type_enum,
+	    BP_GET_TYPE(bp), "DMU_OT_");
+	enum_lookup(checksum, sizeof (checksum), checksum_enum,
+	    BP_GET_CHECKSUM(bp), "ZIO_CHECKSUM_");
+	enum_lookup(compress, sizeof (compress), compress_enum,
+	    BP_GET_COMPRESS(bp), "ZIO_COMPRESS_");
 
-		mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
-		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
-		mdb_printf("DVA[%d]:       GANG: %-5s  GRID:  %04x\t"
-		    "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
-		    (int)DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
-		mdb_printf("DVA[%d]: %llu:%llx:%llx:%s%s%s%s\n", i,
-		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp),
-		    BP_SHOULD_BYTESWAP(&bp) ? "e" : "",
-		    !DVA_GET_GANG(dva) && BP_GET_LEVEL(&bp) != 0 ? "i" : "",
-		    DVA_GET_GANG(dva) ? "g" : "",
-		    BP_GET_COMPRESS(&bp) != 0 ? "d" : "");
-	}
-	mdb_printf("LSIZE:  %-16llx\t\tPSIZE: %llx\n",
-	    BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
-	mdb_printf("ENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
-	    BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
-	    BP_GET_TYPE(&bp) < DMU_OT_NUMTYPES ?
-	    doti[BP_GET_TYPE(&bp)].ot_name : "UNKNOWN");
-	mdb_printf("BIRTH:  %-16llx   LEVEL: %-2d\tFILL:  %llx\n",
-	    bp.blk_birth, (int)BP_GET_LEVEL(&bp), bp.blk_fill);
-	mdb_printf("CKFUNC: %-16s\t\tCOMP:  %s\n",
-	    BP_GET_CHECKSUM(&bp) < ZIO_CHECKSUM_FUNCTIONS ?
-	    zci[BP_GET_CHECKSUM(&bp)].ci_name : "UNKNOWN",
-	    BP_GET_COMPRESS(&bp) < ZIO_COMPRESS_FUNCTIONS ?
-	    zct[BP_GET_COMPRESS(&bp)].ci_name : "UNKNOWN");
-	mdb_printf("CKSUM:  %llx:%llx:%llx:%llx\n",
-	    bp.blk_cksum.zc_word[0],
-	    bp.blk_cksum.zc_word[1],
-	    bp.blk_cksum.zc_word[2],
-	    bp.blk_cksum.zc_word[3]);
+	SPRINTF_BLKPTR(mdb_snprintf, '\n', buf, bp, type, checksum, compress);
+
+	mdb_printf("%s\n", buf);
 
 	return (DCMD_OK);
 }
@@ -2293,7 +2193,6 @@
 	    "zio_t summary", zio_print },
 	{ "zio_state", "?", "print out all zio_t structures on system or "
 	    "for a particular pool", zio_state },
-	{ "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline },
 	{ "zfs_blkstats", ":[-v]",
 	    "given a spa_t, print block type stats from last scrub",
 	    zfs_blkstats },
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/sgs/Makefile.var
--- a/usr/src/cmd/sgs/Makefile.var	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/sgs/Makefile.var	Sun Nov 01 14:14:46 2009 -0800
@@ -75,7 +75,7 @@
 # the system.
 #
 VAR_AVLDIR=		$(SRCBASE)/common/avl
-VAR_AVLINCDIR=
+VAR_AVLINCDIR=		-I $(SRCBASE)/uts/common
 
 #
 # VAR_DTRDIR - directory to find dtrace_data.c in.
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/zdb/Makefile.com
--- a/usr/src/cmd/zdb/Makefile.com	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zdb/Makefile.com	Sun Nov 01 14:14:46 2009 -0800
@@ -33,6 +33,7 @@
 
 INCS += -I../../../lib/libzpool/common 
 INCS +=	-I../../../uts/common/fs/zfs
+INCS +=	-I../../../common/zfs
 
 LDLIBS += -lzpool -lumem -lnvpair -lzfs -lavl
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/zdb/zdb.c
--- a/usr/src/cmd/zdb/zdb.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zdb/zdb.c	Sun Nov 01 14:14:46 2009 -0800
@@ -51,6 +51,7 @@
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
+#include <sys/ddt.h>
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include <libzfs.h>
@@ -72,8 +73,6 @@
 uint64_t *zopt_object = NULL;
 int zopt_objects = 0;
 libzfs_handle_t *g_zfs;
-boolean_t zdb_sig_user_data = B_TRUE;
-int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -121,8 +120,7 @@
 	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
 	    "all data) blocks\n");
 	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
-	(void) fprintf(stderr, "        -S <user|all>:<cksum_alg|all> -- "
-	    "dump blkptr signatures\n");
+	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -l dump label contents\n");
 	(void) fprintf(stderr, "        -L disable leak tracking (do not "
@@ -540,6 +538,198 @@
 }
 
 static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+	const ddt_phys_t *ddp = dde->dde_phys;
+	const ddt_key_t *ddk = &dde->dde_key;
+	char *types[4] = { "ditto", "single", "double", "triple" };
+	char blkbuf[BP_SPRINTF_LEN];
+	blkptr_t blk;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+		ddt_bp_create(ddt, ddk, ddp, &blk);
+		sprintf_blkptr(blkbuf, &blk);
+		(void) printf("index %llx refcnt %llu %s %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+		    types[p], blkbuf);
+	}
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+	double rL, rP, rD, D, dedup, compress, copies;
+
+	if (dds->dds_blocks == 0)
+		return;
+
+	rL = (double)dds->dds_ref_lsize;
+	rP = (double)dds->dds_ref_psize;
+	rD = (double)dds->dds_ref_dsize;
+	D = (double)dds->dds_dsize;
+
+	dedup = rD / D;
+	compress = rL / rP;
+	copies = rD / rP;
+
+	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+	    "dedup * compress / copies = %.2f\n\n",
+	    dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+	char refcnt[6];
+	char blocks[6], lsize[6], psize[6], dsize[6];
+	char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
+
+	if (dds->dds_blocks == 0)
+		return;
+
+	if (h == -1)
+		(void) strcpy(refcnt, "Total");
+	else
+		nicenum(1ULL << h, refcnt);
+
+	nicenum(dds->dds_blocks, blocks);
+	nicenum(dds->dds_lsize, lsize);
+	nicenum(dds->dds_psize, psize);
+	nicenum(dds->dds_dsize, dsize);
+	nicenum(dds->dds_ref_blocks, ref_blocks);
+	nicenum(dds->dds_ref_lsize, ref_lsize);
+	nicenum(dds->dds_ref_psize, ref_psize);
+	nicenum(dds->dds_ref_dsize, ref_dsize);
+
+	(void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+	    refcnt,
+	    blocks, lsize, psize, dsize,
+	    ref_blocks, ref_lsize, ref_psize, ref_dsize);
+}
+
+static void
+dump_ddt_histogram(const ddt_histogram_t *ddh)
+{
+	ddt_stat_t dds_total = { 0 };
+
+	ddt_histogram_stat(&dds_total, ddh);
+
+	(void) printf("\n");
+
+	(void) printf("bucket   "
+	    "           allocated             "
+	    "          referenced          \n");
+	(void) printf("______   "
+	    "______________________________   "
+	    "______________________________\n");
+
+	(void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+	    "refcnt",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE");
+
+	(void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+	    "------",
+	    "------", "-----", "-----", "-----",
+	    "------", "-----", "-----", "-----");
+
+	for (int h = 0; h < 64; h++)
+		dump_ddt_stat(&ddh->ddh_stat[h], h);
+
+	dump_ddt_stat(&dds_total, -1);
+
+	(void) printf("\n");
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char name[DDT_NAMELEN];
+	ddt_entry_t dde;
+	uint64_t walk = 0;
+	dmu_object_info_t doi;
+	uint64_t count, dspace, mspace;
+	int error;
+
+	error = ddt_object_info(ddt, type, class, &doi);
+
+	if (error == ENOENT)
+		return;
+	ASSERT(error == 0);
+
+	count = ddt_object_count(ddt, type, class);
+	dspace = doi.doi_physical_blocks_512 << 9;
+	mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+	ASSERT(count != 0);	/* we should have destroyed it */
+
+	ddt_object_name(ddt, type, class, name);
+
+	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+	    name,
+	    (u_longlong_t)count,
+	    (u_longlong_t)(dspace / count),
+	    (u_longlong_t)(mspace / count));
+
+	if (dump_opt['D'] < 3)
+		return;
+
+	dump_ddt_histogram(&ddt->ddt_histogram[type][class]);
+
+	if (dump_opt['D'] < 4)
+		return;
+
+	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+		return;
+
+	(void) printf("%s contents:\n\n", name);
+
+	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0)
+		dump_dde(ddt, &dde, walk);
+
+	ASSERT(error == ENOENT);
+
+	(void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+	ddt_histogram_t ddh_total = { 0 };
+	ddt_stat_t dds_total = { 0 };
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++) {
+				ddt_histogram_add(&ddh_total,
+				    &ddt->ddt_histogram[type][class]);
+				dump_ddt(ddt, type, class);
+			}
+		}
+	}
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	if (dds_total.dds_blocks == 0) {
+		(void) printf("All DDTs are empty\n");
+		return;
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['D'] > 1) {
+		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+		dump_ddt_histogram(&ddh_total);
+	}
+
+	dump_dedup_ratio(&dds_total);
+}
+
+static void
 dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
 {
 	char *prefix = (void *)sm;
@@ -658,35 +848,48 @@
 }
 
 static uint64_t
-blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid)
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
 {
-	if (level < 0)
-		return (blkid);
-
-	return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+	if (dnp == NULL) {
+		ASSERT(zb->zb_level < 0);
+		if (zb->zb_object == 0)
+			return (zb->zb_blkid);
+		return (zb->zb_blkid * BP_GET_LSIZE(bp));
+	}
+
+	ASSERT(zb->zb_level >= 0);
+
+	return ((zb->zb_blkid <<
+	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 }
 
 static void
-sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp)
 {
 	dva_t *dva = bp->blk_dva;
-	int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
-	int i;
+	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+
+	if (dump_opt['b'] >= 5) {
+		sprintf_blkptr(blkbuf, bp);
+		return;
+	}
 
 	blkbuf[0] = '\0';
 
-	for (i = 0; i < ndvas; i++)
+	for (int i = 0; i < ndvas; i++)
 		(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
-	(void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+	(void) sprintf(blkbuf + strlen(blkbuf),
+	    "%llxL/%llxP F=%llu B=%llu/%llu",
 	    (u_longlong_t)BP_GET_LSIZE(bp),
 	    (u_longlong_t)BP_GET_PSIZE(bp),
 	    (u_longlong_t)bp->blk_fill,
-	    (u_longlong_t)bp->blk_birth);
+	    (u_longlong_t)bp->blk_birth,
+	    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
 }
 
 static void
@@ -699,8 +902,7 @@
 	ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 	ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 
-	(void) printf("%16llx ",
-	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
+	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
 	ASSERT(zb->zb_level >= 0);
 
@@ -712,18 +914,10 @@
 		}
 	}
 
-	sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+	sprintf_blkptr_compact(blkbuf, bp);
 	(void) printf("%s\n", blkbuf);
 }
 
-#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-	(zb)->zb_objset = objset;                       \
-	(zb)->zb_object = object;                       \
-	(zb)->zb_level = level;                         \
-	(zb)->zb_blkid = blkid;                         \
-}
-
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_t *zb)
@@ -859,7 +1053,7 @@
 	nicenum(ds->ds_compressed_bytes, compressed);
 	nicenum(ds->ds_uncompressed_bytes, uncompressed);
 	nicenum(ds->ds_unique_bytes, unique);
-	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
+	sprintf_blkptr(blkbuf, &ds->ds_bp);
 
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
@@ -910,11 +1104,11 @@
 	if (dump_opt['d'] < 3)
 		return;
 
-	mutex_init(&bpl.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+	bplist_init(&bpl);
 	VERIFY(0 == bplist_open(&bpl, mos, object));
 	if (bplist_empty(&bpl)) {
 		bplist_close(&bpl);
-		mutex_destroy(&bpl.bpl_lock);
+		bplist_fini(&bpl);
 		return;
 	}
 
@@ -932,7 +1126,7 @@
 
 	if (dump_opt['d'] < 5) {
 		bplist_close(&bpl);
-		mutex_destroy(&bpl.bpl_lock);
+		bplist_fini(&bpl);
 		return;
 	}
 
@@ -942,13 +1136,13 @@
 		char blkbuf[BP_SPRINTF_LEN];
 
 		ASSERT(bp->blk_birth != 0);
-		sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+		sprintf_blkptr_compact(blkbuf, bp);
 		(void) printf("\tItem %3llu: %s\n",
 		    (u_longlong_t)itor - 1, blkbuf);
 	}
 
 	bplist_close(&bpl);
-	mutex_destroy(&bpl.bpl_lock);
+	bplist_fini(&bpl);
 }
 
 static avl_tree_t idx_tree;
@@ -1107,6 +1301,8 @@
 	dump_zap,		/* ZFS user/group used		*/
 	dump_zap,		/* ZFS user/group quota		*/
 	dump_zap,		/* snapshot refcount tags	*/
+	dump_none,		/* DDT ZAP object		*/
+	dump_zap,		/* DDT statistics		*/
 	dump_unknown		/* Unknown type, must be last	*/
 };
 
@@ -1118,13 +1314,14 @@
 	dnode_t *dn;
 	void *bonus = NULL;
 	size_t bsize = 0;
-	char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6];
+	char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7];
 	char aux[50];
 	int error;
 
 	if (*print_header) {
-		(void) printf("\n    Object  lvl   iblk   dblk  lsize"
-		    "  asize  type\n");
+		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
+		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
+		    "%full", "type");
 		*print_header = 0;
 	}
 
@@ -1143,10 +1340,11 @@
 
 	nicenum(doi.doi_metadata_block_size, iblk);
 	nicenum(doi.doi_data_block_size, dblk);
-	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
-	    lsize);
-	nicenum(doi.doi_physical_blks << 9, asize);
+	nicenum(doi.doi_max_offset, lsize);
+	nicenum(doi.doi_physical_blocks_512 << 9, asize);
 	nicenum(doi.doi_bonus_size, bonus_size);
+	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
+	    doi.doi_data_block_size / doi.doi_max_offset);
 
 	aux[0] = '\0';
 
@@ -1160,13 +1358,13 @@
 		    ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
-	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
-	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
-	    asize, ZDB_OT_NAME(doi.doi_type), aux);
+	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
+	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
-		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
-		    "", "", "", "", bonus_size, "bonus",
+		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
+		    "", "", "", "", "", bonus_size, "bonus",
 		    ZDB_OT_NAME(doi.doi_bonus_type));
 	}
 
@@ -1203,6 +1401,7 @@
 		}
 
 		for (;;) {
+			char segsize[6];
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
@@ -1261,8 +1460,7 @@
 
 	if (verbosity >= 4) {
 		(void) sprintf(blkbuf, ", rootbp ");
-		(void) sprintf_blkptr(blkbuf + strlen(blkbuf),
-		    BP_SPRINTF_LEN - strlen(blkbuf), os->os_rootbp);
+		(void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
@@ -1275,7 +1473,16 @@
 	    (u_longlong_t)dds.dds_creation_txg,
 	    numbuf, (u_longlong_t)usedobjs, blkbuf);
 
-	dump_intent_log(dmu_objset_zil(os));
+	if (zopt_objects != 0) {
+		for (i = 0; i < zopt_objects; i++)
+			dump_object(os, zopt_object[i], verbosity,
+			    &print_header);
+		(void) printf("\n");
+		return;
+	}
+
+	if (dump_opt['i'] != 0 || verbosity >= 2)
+		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
@@ -1287,14 +1494,6 @@
 	if (os->os_rootbp->blk_birth == 0)
 		return;
 
-	if (zopt_objects != 0) {
-		for (i = 0; i < zopt_objects; i++)
-			dump_object(os, zopt_object[i], verbosity,
-			    &print_header);
-		(void) printf("\n");
-		return;
-	}
-
 	dump_object(os, 0, verbosity, &print_header);
 	object_count = 0;
 	if (os->os_userused_dnode &&
@@ -1333,7 +1532,7 @@
 	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
 	if (dump_opt['u'] >= 3) {
 		char blkbuf[BP_SPRINTF_LEN];
-		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp);
+		sprintf_blkptr(blkbuf, &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
 	(void) printf("\n");
@@ -1466,7 +1665,7 @@
 
 	error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
 	if (error) {
-		(void) printf("Could not open %s\n", dsname);
+		(void) printf("Could not open %s, error %d\n", dsname, error);
 		return (0);
 	}
 	dump_dir(os);
@@ -1475,6 +1674,160 @@
 	return (0);
 }
 
+/*
+ * Block statistics.
+ */
+typedef struct zdb_blkstats {
+	uint64_t	zb_asize;
+	uint64_t	zb_lsize;
+	uint64_t	zb_psize;
+	uint64_t	zb_count;
+} zdb_blkstats_t;
+
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
+#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
+#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 2)
+
+static char *zdb_ot_extname[] = {
+	"deferred free",
+	"dedup ditto",
+	"Total",
+};
+
+#define	ZB_TOTAL	DN_MAX_LEVELS
+
+typedef struct zdb_cb {
+	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+	uint64_t	zcb_dedup_asize;
+	uint64_t	zcb_dedup_blocks;
+	uint64_t	zcb_errors[256];
+	int		zcb_readfails;
+	int		zcb_haderrors;
+} zdb_cb_t;
+
+static void
+zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp,
+    dmu_object_type_t type)
+{
+	uint64_t refcnt = 0;
+
+	ASSERT(type < ZDB_OT_TOTAL);
+
+	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+		return;
+
+	for (int i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
+		int t = (i & 1) ? type : ZDB_OT_TOTAL;
+		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
+
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_count++;
+	}
+
+	if (dump_opt['L'])
+		return;
+
+	if (BP_GET_DEDUP(bp)) {
+		ddt_t *ddt;
+		ddt_entry_t *dde;
+
+		ddt = ddt_select(spa, bp);
+		ddt_enter(ddt);
+		dde = ddt_lookup(ddt, bp, B_FALSE);
+
+		if (dde == NULL) {
+			refcnt = 0;
+		} else {
+			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+			ddt_phys_decref(ddp);
+			refcnt = ddp->ddp_refcnt;
+			if (ddt_phys_total_refcnt(dde) == 0)
+				ddt_remove(ddt, dde);
+		}
+		ddt_exit(ddt);
+	}
+
+	VERIFY3U(zio_wait(zio_claim(NULL, spa,
+	    refcnt ? 0 : spa_first_txg(spa),
+	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+}
+
+static int
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	zdb_cb_t *zcb = arg;
+	char blkbuf[BP_SPRINTF_LEN];
+	dmu_object_type_t type;
+	boolean_t is_metadata;
+
+	if (bp == NULL)
+		return (0);
+
+	type = BP_GET_TYPE(bp);
+
+	zdb_count_block(spa, zilog, zcb, bp, type);
+
+	is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
+
+	if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
+		int ioerr;
+		size_t size = BP_GET_PSIZE(bp);
+		void *data = malloc(size);
+		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
+
+		/* If it's an intent log block, failure is expected. */
+		if (zb->zb_level == ZB_ZIL_LEVEL)
+			flags |= ZIO_FLAG_SPECULATIVE;
+
+		ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
+		    NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+
+		free(data);
+
+		if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) {
+			zcb->zcb_haderrors = 1;
+			zcb->zcb_errors[ioerr]++;
+
+			if (dump_opt['b'] >= 2)
+				sprintf_blkptr(blkbuf, bp);
+			else
+				blkbuf[0] = '\0';
+
+			(void) printf("zdb_blkptr_cb: "
+			    "Got error %d reading "
+			    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+			    ioerr,
+			    (u_longlong_t)zb->zb_objset,
+			    (u_longlong_t)zb->zb_object,
+			    (u_longlong_t)zb->zb_level,
+			    (u_longlong_t)zb->zb_blkid,
+			    blkbuf);
+		}
+	}
+
+	zcb->zcb_readfails = 0;
+
+	if (dump_opt['b'] >= 4) {
+		sprintf_blkptr(blkbuf, bp);
+		(void) printf("objset %llu object %llu "
+		    "level %lld offset 0x%llx %s\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level,
+		    (u_longlong_t)blkid2offset(dnp, bp, zb),
+		    blkbuf);
+	}
+
+	return (0);
+}
+
 static void
 zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
 {
@@ -1512,169 +1865,90 @@
 };
 
 static void
-zdb_leak_init(spa_t *spa)
+zdb_ddt_leak_init(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    zdb_cb_t *zcb)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
-
-	for (int c = 0; c < rvd->vdev_children; c++) {
-		vdev_t *vd = rvd->vdev_child[c];
-		for (int m = 0; m < vd->vdev_ms_count; m++) {
-			metaslab_t *msp = vd->vdev_ms[m];
-			mutex_enter(&msp->ms_lock);
-			VERIFY(space_map_load(&msp->ms_map, &zdb_space_map_ops,
-			    SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0);
-			msp->ms_map.sm_ppd = vd;
-			mutex_exit(&msp->ms_lock);
+	uint64_t walk = 0;
+	ddt_entry_t dde;
+	int error;
+
+	if (class == DDT_CLASS_UNIQUE || !ddt_object_exists(ddt, type, class))
+		return;
+
+	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
+		blkptr_t blk;
+		ddt_phys_t *ddp = dde.dde_phys;
+		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+			if (ddp->ddp_phys_birth == 0)
+				continue;
+			ddt_bp_create(ddt, &dde.dde_key, ddp, &blk);
+			if (p == DDT_PHYS_DITTO) {
+				zdb_count_block(ddt->ddt_spa, NULL, zcb, &blk,
+				    ZDB_OT_DITTO);
+			} else {
+				zcb->zcb_dedup_asize +=
+				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+				zcb->zcb_dedup_blocks++;
+			}
+		}
+		if (!dump_opt['L']) {
+			ddt_enter(ddt);
+			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+			ddt_exit(ddt);
 		}
 	}
+
+	ASSERT(error == ENOENT);
+}
+
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+	if (!dump_opt['L']) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		for (int c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *vd = rvd->vdev_child[c];
+			for (int m = 0; m < vd->vdev_ms_count; m++) {
+				metaslab_t *msp = vd->vdev_ms[m];
+				mutex_enter(&msp->ms_lock);
+				space_map_unload(&msp->ms_map);
+				VERIFY(space_map_load(&msp->ms_map,
+				    &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+				    spa->spa_meta_objset) == 0);
+				msp->ms_map.sm_ppd = vd;
+				mutex_exit(&msp->ms_lock);
+			}
+		}
+	}
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++)
+				zdb_ddt_leak_init(spa->spa_ddt[c],
+				    type, class, zcb);
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static void
 zdb_leak_fini(spa_t *spa)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
-
-	for (int c = 0; c < rvd->vdev_children; c++) {
-		vdev_t *vd = rvd->vdev_child[c];
-		for (int m = 0; m < vd->vdev_ms_count; m++) {
-			metaslab_t *msp = vd->vdev_ms[m];
-			mutex_enter(&msp->ms_lock);
-			space_map_unload(&msp->ms_map);
-			mutex_exit(&msp->ms_lock);
-		}
-	}
-}
-
-/*
- * Verify that the sum of the sizes of all blocks in the pool adds up
- * to the SPA's sa_alloc total.
- */
-typedef struct zdb_blkstats {
-	uint64_t	zb_asize;
-	uint64_t	zb_lsize;
-	uint64_t	zb_psize;
-	uint64_t	zb_count;
-} zdb_blkstats_t;
-
-#define	DMU_OT_DEFERRED	DMU_OT_NONE
-#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES
-
-#define	ZB_TOTAL	DN_MAX_LEVELS
-
-typedef struct zdb_cb {
-	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
-	uint64_t	zcb_errors[256];
-	int		zcb_readfails;
-	int		zcb_haderrors;
-} zdb_cb_t;
-
-static void
-zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
-{
-	for (int i = 0; i < 4; i++) {
-		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
-		int t = (i & 1) ? type : DMU_OT_TOTAL;
-		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
-
-		zb->zb_asize += BP_GET_ASIZE(bp);
-		zb->zb_lsize += BP_GET_LSIZE(bp);
-		zb->zb_psize += BP_GET_PSIZE(bp);
-		zb->zb_count++;
-	}
-
-	if (dump_opt['S']) {
-		boolean_t print_sig;
-
-		print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 &&
-		    BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS);
-
-		if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg)
-			print_sig = B_FALSE;
-
-		if (print_sig) {
-			(void) printf("%llu\t%lld\t%lld\t%s\t%s\t%s\t"
-			    "%llx:%llx:%llx:%llx\n",
-			    (u_longlong_t)BP_GET_LEVEL(bp),
-			    (longlong_t)BP_GET_PSIZE(bp),
-			    (longlong_t)BP_GET_NDVAS(bp),
-			    ZDB_OT_NAME(BP_GET_TYPE(bp)),
-			    ZDB_CHECKSUM_NAME(BP_GET_CHECKSUM(bp)),
-			    ZDB_COMPRESS_NAME(BP_GET_COMPRESS(bp)),
-			    (u_longlong_t)bp->blk_cksum.zc_word[0],
-			    (u_longlong_t)bp->blk_cksum.zc_word[1],
-			    (u_longlong_t)bp->blk_cksum.zc_word[2],
-			    (u_longlong_t)bp->blk_cksum.zc_word[3]);
-		}
-	}
-
-	if (!dump_opt['L'])
-		VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
-		    NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
-}
-
-static int
-zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
-{
-	zdb_cb_t *zcb = arg;
-	char blkbuf[BP_SPRINTF_LEN];
-	dmu_object_type_t type;
-	boolean_t is_metadata;
-
-	if (bp == NULL)
-		return (0);
-
-	type = BP_GET_TYPE(bp);
-
-	zdb_count_block(spa, zcb, bp, type);
-
-	is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
-
-	if (dump_opt['c'] > 1 || dump_opt['S'] ||
-	    (dump_opt['c'] && is_metadata)) {
-		size_t size = BP_GET_PSIZE(bp);
-		void *data = malloc(size);
-		int ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
-		    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
-		free(data);
-
-		/* We expect io errors on intent log */
-		if (ioerr && type != DMU_OT_INTENT_LOG) {
-			zcb->zcb_haderrors = 1;
-			zcb->zcb_errors[ioerr]++;
-
-			if (dump_opt['b'] >= 2)
-				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
-			else
-				blkbuf[0] = '\0';
-
-			if (!dump_opt['S']) {
-				(void) printf("zdb_blkptr_cb: "
-				    "Got error %d reading "
-				    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
-				    ioerr,
-				    (u_longlong_t)zb->zb_objset,
-				    (u_longlong_t)zb->zb_object,
-				    (u_longlong_t)zb->zb_level,
-				    (u_longlong_t)zb->zb_blkid,
-				    blkbuf);
+	if (!dump_opt['L']) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		for (int c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *vd = rvd->vdev_child[c];
+			for (int m = 0; m < vd->vdev_ms_count; m++) {
+				metaslab_t *msp = vd->vdev_ms[m];
+				mutex_enter(&msp->ms_lock);
+				space_map_unload(&msp->ms_map);
+				mutex_exit(&msp->ms_lock);
 			}
 		}
 	}
-
-	zcb->zcb_readfails = 0;
-
-	if (dump_opt['b'] >= 4) {
-		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
-		(void) printf("objset %llu object %llu offset 0x%llx %s\n",
-		    (u_longlong_t)zb->zb_objset,
-		    (u_longlong_t)zb->zb_object,
-		    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid),
-		    blkbuf);
-	}
-
-	return (0);
 }
 
 static int
@@ -1682,19 +1956,15 @@
 {
 	zdb_cb_t zcb = { 0 };
 	zdb_blkstats_t *zb, *tzb;
-	uint64_t alloc, space, logalloc;
-	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int leaks = 0;
-	int c, e;
-
-	if (!dump_opt['S']) {
-		(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
-		    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
-		    (dump_opt['c'] == 1) ? "metadata " : "",
-		    dump_opt['c'] ? "checksums " : "",
-		    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
-		    !dump_opt['L'] ? "nothing leaked " : "");
-	}
+
+	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+	    (dump_opt['c'] == 1) ? "metadata " : "",
+	    dump_opt['c'] ? "checksums " : "",
+	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * Load all space maps as SM_ALLOC maps, then traverse the pool
@@ -1704,28 +1974,27 @@
 	 * it's not part of any space map) is a double allocation,
 	 * reference to a freed block, or an unclaimed log block.
 	 */
-	if (!dump_opt['L'])
-		zdb_leak_init(spa);
+	zdb_leak_init(spa, &zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
-	if (spa->spa_sync_bplist_obj != 0) {
-		bplist_t *bpl = &spa->spa_sync_bplist;
+	if (spa->spa_deferred_bplist_obj != 0) {
+		bplist_t *bpl = &spa->spa_deferred_bplist;
 		blkptr_t blk;
 		uint64_t itor = 0;
 
 		VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
-		    spa->spa_sync_bplist_obj));
+		    spa->spa_deferred_bplist_obj));
 
 		while (bplist_iterate(bpl, &itor, &blk) == 0) {
 			if (dump_opt['b'] >= 4) {
 				char blkbuf[BP_SPRINTF_LEN];
-				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
+				sprintf_blkptr(blkbuf, &blk);
 				(void) printf("[%s] %s\n",
 				    "deferred free", blkbuf);
 			}
-			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
+			zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED);
 		}
 
 		bplist_close(bpl);
@@ -1733,10 +2002,10 @@
 
 	zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb, 0);
 
-	if (zcb.zcb_haderrors && !dump_opt['S']) {
+	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
-		for (e = 0; e < 256; e++) {
+		for (int e = 0; e < 256; e++) {
 			if (zcb.zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb.zcb_errors[e]);
@@ -1747,43 +2016,27 @@
 	/*
 	 * Report any leaked segments.
 	 */
-	if (!dump_opt['L'])
-		zdb_leak_fini(spa);
-
-	/*
-	 * If we're interested in printing out the blkptr signatures,
-	 * return now as we don't print out anything else (including
-	 * errors and leaks).
-	 */
-	if (dump_opt['S'])
-		return (zcb.zcb_haderrors ? 3 : 0);
-
-	alloc = spa_get_alloc(spa);
-	space = spa_get_space(spa);
-
-	/*
-	 * Log blocks allocated from a separate log device don't count
-	 * as part of the normal pool space; factor them in here.
-	 */
-	logalloc = 0;
-
-	for (c = 0; c < rvd->vdev_children; c++)
-		if (rvd->vdev_child[c]->vdev_islog)
-			logalloc += rvd->vdev_child[c]->vdev_stat.vs_alloc;
-
-	tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
-
-	if (tzb->zb_asize == alloc + logalloc) {
+	zdb_leak_fini(spa);
+
+	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
+
+	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	norm_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
+	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
+
+	if (total_found == total_alloc) {
 		if (!dump_opt['L'])
 			(void) printf("\n\tNo leaks (block sum matches space"
 			    " maps exactly)\n");
 	} else {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
-		    (u_longlong_t)tzb->zb_asize,
-		    (u_longlong_t)alloc + logalloc,
+		    (u_longlong_t)total_found,
+		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
-		    (longlong_t)(alloc + logalloc - tzb->zb_asize));
+		    (longlong_t)(total_alloc - total_found));
 		leaks = 1;
 	}
 
@@ -1793,33 +2046,40 @@
 	(void) printf("\n");
 	(void) printf("\tbp count:      %10llu\n",
 	    (u_longlong_t)tzb->zb_count);
-	(void) printf("\tbp logical:    %10llu\t avg: %6llu\n",
+	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
-	(void) printf("\tbp physical:   %10llu\t avg:"
-	    " %6llu\tcompression: %6.2f\n",
+	(void) printf("\tbp physical:   %10llu      avg:"
+	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
-	(void) printf("\tbp allocated:  %10llu\t avg:"
-	    " %6llu\tcompression: %6.2f\n",
+	(void) printf("\tbp allocated:  %10llu      avg:"
+	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
-	(void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
-	    (u_longlong_t)alloc, 100.0 * alloc / space);
+	(void) printf("\tbp deduped:    %10llu    ref>1:"
+	    " %6llu   deduplication: %6.2f\n",
+	    (u_longlong_t)zcb.zcb_dedup_asize,
+	    (u_longlong_t)zcb.zcb_dedup_blocks,
+	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
+	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
-		for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
+		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			char csize[6], lsize[6], psize[6], asize[6], avg[6];
 			char *typename;
 
-			typename = t == DMU_OT_DEFERRED ? "deferred free" :
-			    t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
+			if (t < DMU_OT_NUMTYPES)
+				typename = dmu_ot[t].ot_name;
+			else
+				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
@@ -1881,12 +2141,116 @@
 	return (0);
 }
 
+typedef struct zdb_ddt_entry {
+	ddt_key_t	zdde_key;
+	uint64_t	zdde_ref_blocks;
+	uint64_t	zdde_ref_lsize;
+	uint64_t	zdde_ref_psize;
+	uint64_t	zdde_ref_dsize;
+	avl_node_t	zdde_node;
+} zdb_ddt_entry_t;
+
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	avl_tree_t *t = arg;
+	avl_index_t where;
+	zdb_ddt_entry_t *zdde, zdde_search;
+
+	if (bp == NULL)
+		return (0);
+
+	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+		(void) printf("traversing objset %llu, %llu objects, "
+		    "%lu blocks so far\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)bp->blk_fill,
+		    avl_numnodes(t));
+	}
+
+	if (BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+		return (0);
+
+	ddt_key_fill(&zdde_search.zdde_key, bp);
+
+	zdde = avl_find(t, &zdde_search, &where);
+
+	if (zdde == NULL) {
+		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
+		zdde->zdde_key = zdde_search.zdde_key;
+		avl_insert(t, zdde, where);
+	}
+
+	zdde->zdde_ref_blocks += 1;
+	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
+	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
+	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+	return (0);
+}
+
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+	avl_tree_t t;
+	void *cookie = NULL;
+	zdb_ddt_entry_t *zdde;
+	ddt_histogram_t ddh_total = { 0 };
+	ddt_stat_t dds_total = { 0 };
+
+	avl_create(&t, ddt_entry_compare,
+	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	(void) traverse_pool(spa, zdb_ddt_add_cb, &t, 0);
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
+		ddt_stat_t dds;
+		uint64_t refcnt = zdde->zdde_ref_blocks;
+		ASSERT(refcnt != 0);
+
+		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+		dds.dds_ref_psize = zdde->zdde_ref_psize;
+		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+		ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0);
+
+		umem_free(zdde, sizeof (*zdde));
+	}
+
+	avl_destroy(&t);
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	(void) printf("Simulated DDT histogram:\n");
+
+	dump_ddt_histogram(&ddh_total);
+
+	dump_dedup_ratio(&dds_total);
+}
+
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
+	if (dump_opt['S']) {
+		dump_simulated_ddt(spa);
+		return;
+	}
+
 	if (!dump_opt['e'] && dump_opt['C'] > 1) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
@@ -1898,6 +2262,9 @@
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock);
 
+	if (dump_opt['D'])
+		dump_all_ddts(spa);
+
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 
@@ -1905,13 +2272,13 @@
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_bplist(dp->dp_meta_objset,
-			    spa->spa_sync_bplist_obj, "Deferred frees");
+			    spa->spa_deferred_bplist_obj, "Deferred frees");
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 	}
-	if (dump_opt['b'] || dump_opt['c'] || dump_opt['S'])
+	if (dump_opt['b'] || dump_opt['c'])
 		rc = dump_block_stats(spa);
 
 	if (dump_opt['s'])
@@ -1938,51 +2305,13 @@
 static void
 zdb_print_blkptr(blkptr_t *bp, int flags)
 {
-	dva_t *dva = bp->blk_dva;
-	int d;
+	char blkbuf[BP_SPRINTF_LEN];
 
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
-	/*
-	 * Super-ick warning:  This code is also duplicated in
-	 * cmd/mdb/common/modules/zfs/zfs.c .  Yeah, I hate code
-	 * replication, too.
-	 */
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		(void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
-		    (longlong_t)DVA_GET_VDEV(&dva[d]),
-		    (longlong_t)DVA_GET_OFFSET(&dva[d]));
-		(void) printf("\tDVA[%d]:       GANG: %-5s  GRID:  %04llx\t"
-		    "ASIZE: %llx\n", d,
-		    DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
-		    (longlong_t)DVA_GET_GRID(&dva[d]),
-		    (longlong_t)DVA_GET_ASIZE(&dva[d]));
-		(void) printf("\tDVA[%d]: %llu:%llx:%llx:%s%s%s%s\n", d,
-		    (u_longlong_t)DVA_GET_VDEV(&dva[d]),
-		    (longlong_t)DVA_GET_OFFSET(&dva[d]),
-		    (longlong_t)BP_GET_PSIZE(bp),
-		    BP_SHOULD_BYTESWAP(bp) ? "e" : "",
-		    !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
-		    "d" : "",
-		    DVA_GET_GANG(&dva[d]) ? "g" : "",
-		    BP_GET_COMPRESS(bp) != 0 ? "d" : "");
-	}
-	(void) printf("\tLSIZE:  %-16llx\t\tPSIZE: %llx\n",
-	    (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp));
-	(void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
-	    BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
-	    ZDB_OT_NAME(BP_GET_TYPE(bp)));
-	(void) printf("\tBIRTH:  %-16llx   LEVEL: %-2llu\tFILL:  %llx\n",
-	    (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp),
-	    (u_longlong_t)bp->blk_fill);
-	(void) printf("\tCKFUNC: %-16s\t\tCOMP:  %s\n",
-	    ZDB_CHECKSUM_NAME(BP_GET_CHECKSUM(bp)),
-	    ZDB_COMPRESS_NAME(BP_GET_COMPRESS(bp)));
-	(void) printf("\tCKSUM:  %llx:%llx:%llx:%llx\n",
-	    (u_longlong_t)bp->blk_cksum.zc_word[0],
-	    (u_longlong_t)bp->blk_cksum.zc_word[1],
-	    (u_longlong_t)bp->blk_cksum.zc_word[2],
-	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+
+	sprintf_blkptr(blkbuf, bp);
+	(void) printf("%s\n", blkbuf);
 }
 
 static void
@@ -2005,7 +2334,7 @@
 {
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array(buf, size);
-	(void) write(2, buf, size);
+	(void) write(1, buf, size);
 }
 
 static void
@@ -2108,10 +2437,10 @@
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block
  *		*c: Calculate and display checksums
- *		*d: Decompress data before dumping
+ *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
- *		*g: Display data as a gang block header
- *		*i: Display as an indirect block
+ *		 g: Display data as a gang block header
+ *		 i: Display as an indirect block
  *		 p: Do I/O to physical offset
  *		 r: Dump raw data to stdout
  *
@@ -2120,13 +2449,15 @@
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
+	blkptr_t blk, *bp = &blk;
+	dva_t *dva = bp->blk_dva;
 	int flags = 0;
-	uint64_t offset = 0, size = 0, blkptr_offset = 0;
+	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
-	void *buf;
+	void *pbuf, *lbuf, *buf;
 	char *s, *p, *dup, *vdev, *flagstr;
-	int i, error, zio_flags;
+	int i, error;
 
 	dup = strdup(thing);
 	s = strtok(dup, ":");
@@ -2163,7 +2494,7 @@
 			flags |= bit;
 
 			/* If it's not something with an argument, keep going */
-			if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+			if ((bit & (ZDB_FLAG_CHECKSUM |
 			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
 				continue;
 
@@ -2185,22 +2516,58 @@
 		return;
 	} else {
 		if (vd->vdev_path)
-			(void) printf("Found vdev: %s\n", vd->vdev_path);
+			(void) fprintf(stderr, "Found vdev: %s\n",
+			    vd->vdev_path);
 		else
-			(void) printf("Found vdev type: %s\n",
+			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
-	buf = umem_alloc(size, UMEM_NOFAIL);
-
-	zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
-	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY;
+	psize = size;
+	lsize = size;
+
+	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+	BP_ZERO(bp);
+
+	DVA_SET_VDEV(&dva[0], vd->vdev_id);
+	DVA_SET_OFFSET(&dva[0], offset);
+	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+	BP_SET_LSIZE(bp, lsize);
+	BP_SET_PSIZE(bp, psize);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	zio = zio_root(spa, NULL, NULL, 0);
-	/* XXX todo - cons up a BP so RAID-Z will be happy */
-	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
-	    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+
+	if (vd == vd->vdev_top) {
+		/*
+		 * Treat this as a normal block read.
+		 */
+		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+	} else {
+		/*
+		 * Treat this as a vdev child I/O.
+		 */
+		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
+		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+	}
+
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
@@ -2209,6 +2576,52 @@
 		goto out;
 	}
 
+	if (flags & ZDB_FLAG_DECOMPRESS) {
+		/*
+		 * We don't know how the data was compressed, so just try
+		 * every decompress function at every inflated blocksize.
+		 */
+		enum zio_compress c;
+		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+		bcopy(pbuf, pbuf2, psize);
+
+		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
+		    SPA_MAXBLOCKSIZE - psize) == 0);
+
+		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+		    SPA_MAXBLOCKSIZE - psize) == 0);
+
+		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
+		    lsize -= SPA_MINBLOCKSIZE) {
+			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+				if (zio_decompress_data(c, pbuf, lbuf,
+				    psize, lsize) == 0 &&
+				    zio_decompress_data(c, pbuf2, lbuf2,
+				    psize, lsize) == 0 &&
+				    bcmp(lbuf, lbuf2, lsize) == 0)
+					break;
+			}
+			if (c != ZIO_COMPRESS_FUNCTIONS)
+				break;
+			lsize -= SPA_MINBLOCKSIZE;
+		}
+
+		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
+		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+		if (lsize <= psize) {
+			(void) printf("Decompress of %s failed\n", thing);
+			goto out;
+		}
+		buf = lbuf;
+		size = lsize;
+	} else {
+		buf = pbuf;
+		size = psize;
+	}
+
 	if (flags & ZDB_FLAG_PRINT_BLKPTR)
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
@@ -2223,7 +2636,8 @@
 		zdb_dump_block(thing, buf, size, flags);
 
 out:
-	umem_free(buf, size);
+	umem_free(pbuf, SPA_MAXBLOCKSIZE);
+	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 	free(dup);
 }
 
@@ -2312,7 +2726,6 @@
 	struct rlimit rl = { 1024, 1024 };
 	spa_t *spa = NULL;
 	objset_t *os = NULL;
-	char *endstr;
 	int dump_all = 1;
 	int verbose = 0;
 	int error;
@@ -2327,19 +2740,21 @@
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "udhibcmsvCLS:RU:lep:t:")) != -1) {
+	while ((c = getopt(argc, argv, "bcdhilmsuCDRSLevp:t:U:")) != -1) {
 		switch (c) {
-		case 'u':
-		case 'd':
-		case 'i':
-		case 'h':
 		case 'b':
 		case 'c':
+		case 'd':
+		case 'h':
+		case 'i':
+		case 'l':
 		case 'm':
 		case 's':
+		case 'u':
 		case 'C':
-		case 'l':
+		case 'D':
 		case 'R':
+		case 'S':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
@@ -2350,9 +2765,6 @@
 		case 'v':
 			verbose++;
 			break;
-		case 'U':
-			spa_config_path = optarg;
-			break;
 		case 'p':
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
@@ -2368,24 +2780,6 @@
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
-		case 'S':
-			dump_opt[c]++;
-			dump_all = 0;
-			zdb_sig_user_data = (strncmp(optarg, "user:", 5) == 0);
-			if (!zdb_sig_user_data && strncmp(optarg, "all:", 4))
-				usage();
-			endstr = strchr(optarg, ':') + 1;
-			if (strcmp(endstr, "fletcher2") == 0)
-				zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
-			else if (strcmp(endstr, "fletcher4") == 0)
-				zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_4;
-			else if (strcmp(endstr, "sha256") == 0)
-				zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
-			else if (strcmp(endstr, "all") == 0)
-				zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
-			else
-				usage();
-			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
@@ -2394,6 +2788,9 @@
 				usage();
 			}
 			break;
+		case 'U':
+			spa_config_path = optarg;
+			break;
 		default:
 			usage();
 			break;
@@ -2409,8 +2806,11 @@
 	g_zfs = libzfs_init();
 	ASSERT(g_zfs != NULL);
 
+	if (dump_all)
+		verbose = MAX(verbose, 1);
+
 	for (c = 0; c < 256; c++) {
-		if (dump_all && !strchr("elLR", c))
+		if (dump_all && !strchr("elLRS", c))
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/zdb/zdb_il.c
--- a/usr/src/cmd/zdb/zdb_il.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zdb/zdb_il.c	Sun Nov 01 14:14:46 2009 -0800
@@ -40,12 +40,14 @@
 
 extern uint8_t dump_opt[256];
 
+static char prefix[4] = "\t\t\t";
+
 static void
 print_log_bp(const blkptr_t *bp, const char *prefix)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 
-	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+	sprintf_blkptr(blkbuf, bp);
 	(void) printf("%s%s\n", prefix, blkbuf);
 }
 
@@ -58,15 +60,15 @@
 	char *link = name + strlen(name) + 1;
 
 	if (txtype == TX_SYMLINK)
-		(void) printf("\t\t\t%s -> %s\n", name, link);
+		(void) printf("%s%s -> %s\n", prefix, name, link);
 	else
-		(void) printf("\t\t\t%s\n", name);
+		(void) printf("%s%s\n", prefix, name);
 
-	(void) printf("\t\t\t%s", ctime(&crtime));
-	(void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+	(void) printf("%s%s", prefix, ctime(&crtime));
+	(void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
 	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
 	    (longlong_t)lr->lr_mode);
-	(void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+	(void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
 	    (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
 	    (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
 }
@@ -75,7 +77,7 @@
 static void
 zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
 {
-	(void) printf("\t\t\tdoid %llu, name %s\n",
+	(void) printf("%sdoid %llu, name %s\n", prefix,
 	    (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
 }
 
@@ -83,7 +85,7 @@
 static void
 zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
 {
-	(void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+	(void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
 	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
 	    (char *)(lr + 1));
 }
@@ -95,9 +97,9 @@
 	char *snm = (char *)(lr + 1);
 	char *tnm = snm + strlen(snm) + 1;
 
-	(void) printf("\t\t\tsdoid %llu, tdoid %llu\n",
+	(void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
 	    (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
-	(void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm);
+	(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
 }
 
 /* ARGSUSED */
@@ -106,44 +108,48 @@
 {
 	char *data, *dlimit;
 	blkptr_t *bp = &lr->lr_blkptr;
+	zbookmark_t zb;
 	char buf[SPA_MAXBLOCKSIZE];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int error;
 
-	(void) printf("\t\t\tfoid %llu, offset 0x%llx,"
-	    " length 0x%llx, blkoff 0x%llx\n",
-	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
-	    (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+	(void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);
 
 	if (txtype == TX_WRITE2 || verbose < 5)
 		return;
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
-		(void) printf("\t\t\thas blkptr, %s\n",
+		(void) printf("%shas blkptr, %s\n", prefix,
 		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
-		print_log_bp(bp, "\t\t\t");
+		print_log_bp(bp, prefix);
+
 		if (BP_IS_HOLE(bp)) {
 			(void) printf("\t\t\tLSIZE 0x%llx\n",
 			    (u_longlong_t)BP_GET_LSIZE(bp));
 		}
 		if (bp->blk_birth == 0) {
 			bzero(buf, sizeof (buf));
-		} else {
-			zbookmark_t zb;
-
-			zb.zb_objset = dmu_objset_id(zilog->zl_os);
-			zb.zb_object = lr->lr_foid;
-			zb.zb_level = 0;
-			zb.zb_blkid = -1; /* unknown */
+			(void) printf("%s<hole>\n", prefix);
+			return;
+		}
+		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+			(void) printf("%s<block already committed>\n", prefix);
+			return;
+		}
 
-			error = zio_wait(zio_read(NULL, zilog->zl_spa,
-			    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
-			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
-			if (error)
-				return;
-		}
-		data = buf + lr->lr_blkoff;
+		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		error = zio_wait(zio_read(NULL, zilog->zl_spa,
+		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+		if (error)
+			return;
+		data = buf;
 	} else {
 		data = (char *)(lr + 1);
 	}
@@ -151,7 +157,7 @@
 	dlimit = data + MIN(lr->lr_length,
 	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
 
-	(void) printf("\t\t\t");
+	(void) printf("%s", prefix);
 	while (data < dlimit) {
 		if (isprint(*data))
 			(void) printf("%c ", *data);
@@ -166,7 +172,7 @@
 static void
 zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
 {
-	(void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+	(void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
 	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
 	    (u_longlong_t)lr->lr_length);
 }
@@ -178,38 +184,38 @@
 	time_t atime = (time_t)lr->lr_atime[0];
 	time_t mtime = (time_t)lr->lr_mtime[0];
 
-	(void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+	(void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
 
 	if (lr->lr_mask & AT_MODE) {
-		(void) printf("\t\t\tAT_MODE  %llo\n",
+		(void) printf("%sAT_MODE  %llo\n", prefix,
 		    (longlong_t)lr->lr_mode);
 	}
 
 	if (lr->lr_mask & AT_UID) {
-		(void) printf("\t\t\tAT_UID   %llu\n",
+		(void) printf("%sAT_UID   %llu\n", prefix,
 		    (u_longlong_t)lr->lr_uid);
 	}
 
 	if (lr->lr_mask & AT_GID) {
-		(void) printf("\t\t\tAT_GID   %llu\n",
+		(void) printf("%sAT_GID   %llu\n", prefix,
 		    (u_longlong_t)lr->lr_gid);
 	}
 
 	if (lr->lr_mask & AT_SIZE) {
-		(void) printf("\t\t\tAT_SIZE  %llu\n",
+		(void) printf("%sAT_SIZE  %llu\n", prefix,
 		    (u_longlong_t)lr->lr_size);
 	}
 
 	if (lr->lr_mask & AT_ATIME) {
-		(void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+		(void) printf("%sAT_ATIME %llu.%09llu %s", prefix,
 		    (u_longlong_t)lr->lr_atime[0],
 		    (u_longlong_t)lr->lr_atime[1],
 		    ctime(&atime));
 	}
 
 	if (lr->lr_mask & AT_MTIME) {
-		(void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+		(void) printf("%sAT_MTIME %llu.%09llu %s", prefix,
 		    (u_longlong_t)lr->lr_mtime[0],
 		    (u_longlong_t)lr->lr_mtime[1],
 		    ctime(&mtime));
@@ -220,7 +226,7 @@
 static void
 zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
 {
-	(void) printf("\t\t\tfoid %llu, aclcnt %llu\n",
+	(void) printf("%sfoid %llu, aclcnt %llu\n", prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
 }
 
@@ -256,7 +262,7 @@
 };
 
 /* ARGSUSED */
-static void
+static int
 print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
 {
 	int txtype;
@@ -280,23 +286,24 @@
 
 	zil_rec_info[txtype].zri_count++;
 	zil_rec_info[0].zri_count++;
+
+	return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
-	char blkbuf[BP_SPRINTF_LEN];
+	char blkbuf[BP_SPRINTF_LEN + 10];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	char *claim;
 
 	if (verbose <= 3)
-		return;
+		return (0);
 
 	if (verbose >= 5) {
 		(void) strcpy(blkbuf, ", ");
-		sprintf_blkptr(blkbuf + strlen(blkbuf),
-		    BP_SPRINTF_LEN - strlen(blkbuf), bp);
+		sprintf_blkptr(blkbuf + strlen(blkbuf), bp);
 	} else {
 		blkbuf[0] = '\0';
 	}
@@ -310,6 +317,8 @@
 
 	(void) printf("\tBlock seqno %llu, %s%s\n",
 	    (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+	return (0);
 }
 
 static void
@@ -342,17 +351,17 @@
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int i;
 
-	if (zh->zh_log.blk_birth == 0 || verbose < 2)
+	if (zh->zh_log.blk_birth == 0 || verbose < 1)
 		return;
 
-	(void) printf("\n    ZIL header: claim_txg %llu, claim_seq %llu",
-	    (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+	(void) printf("\n    ZIL header: claim_txg %llu, "
+	    "claim_blk_seq %llu, claim_lr_seq %llu",
+	    (u_longlong_t)zh->zh_claim_txg,
+	    (u_longlong_t)zh->zh_claim_blk_seq,
+	    (u_longlong_t)zh->zh_claim_lr_seq);
 	(void) printf(" replay_seq %llu, flags 0x%llx\n",
 	    (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
 
-	if (verbose >= 4)
-		print_log_bp(&zh->zh_log, "\n\tfirst block: ");
-
 	for (i = 0; i < TX_MAX_TYPE; i++)
 		zil_rec_info[i].zri_count = 0;
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/zpool/zpool_main.c
--- a/usr/src/cmd/zpool/zpool_main.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zpool/zpool_main.c	Sun Nov 01 14:14:46 2009 -0800
@@ -250,12 +250,12 @@
 {
 	FILE *fp = cb;
 
-	(void) fprintf(fp, "\t%-13s  ", zpool_prop_to_name(prop));
+	(void) fprintf(fp, "\t%-15s  ", zpool_prop_to_name(prop));
 
 	if (zpool_prop_readonly(prop))
 		(void) fprintf(fp, "  NO   ");
 	else
-		(void) fprintf(fp, " YES    ");
+		(void) fprintf(fp, " YES   ");
 
 	if (zpool_prop_values(prop) == NULL)
 		(void) fprintf(fp, "-\n");
@@ -302,7 +302,7 @@
 		(void) fprintf(fp,
 		    gettext("\nthe following properties are supported:\n"));
 
-		(void) fprintf(fp, "\n\t%-13s  %s  %s\n\n",
+		(void) fprintf(fp, "\n\t%-15s  %s   %s\n\n",
 		    "PROPERTY", "EDIT", "VALUES");
 
 		/* Iterate over all properties */
@@ -2449,7 +2449,7 @@
 	int ret;
 	list_cbdata_t cb = { 0 };
 	static char default_props[] =
-	    "name,size,used,available,capacity,health,altroot";
+	    "name,size,used,available,capacity,dedupratio,health,altroot";
 	char *props = default_props;
 
 	/* check options */
@@ -3672,9 +3672,12 @@
 		(void) printf(gettext(" 15  user/group space accounting\n"));
 		(void) printf(gettext(" 16  stmf property support\n"));
 		(void) printf(gettext(" 17  Triple-parity RAID-Z\n"));
-		(void) printf(gettext(" 18  snapshot user holds\n"));
+		(void) printf(gettext(" 18  Snapshot user holds\n"));
 		(void) printf(gettext(" 19  Log device removal\n"));
-		(void) printf(gettext("For more information on a particular "
+		(void) printf(gettext(" 20  Compression using zle "
+		    "(zero-length encoding)\n"));
+		(void) printf(gettext(" 21  Deduplication\n"));
+		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
 		    "version/N\n\n");
diff -r 8aac17999e4d -r e2081f502306 usr/src/cmd/ztest/ztest.c
--- a/usr/src/cmd/ztest/ztest.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/ztest/ztest.c	Sun Nov 01 14:14:46 2009 -0800
@@ -86,9 +86,8 @@
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
 #include <sys/zil.h>
+#include <sys/zil_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/spa_impl.h>
@@ -106,6 +105,7 @@
 #include <ctype.h>
 #include <math.h>
 #include <sys/fs/zfs.h>
+#include <libnvpair.h>
 
 static char cmdname[] = "ztest";
 static char *zopt_pool = cmdname;
@@ -127,112 +127,171 @@
 static uint64_t zopt_time = 300;	/* 5 minutes */
 static int zopt_maxfaults;
 
+#define	BT_MAGIC	0x123456789abcdefULL
+
+enum ztest_io_type {
+	ZTEST_IO_WRITE_TAG,
+	ZTEST_IO_WRITE_PATTERN,
+	ZTEST_IO_WRITE_ZEROES,
+	ZTEST_IO_TRUNCATE,
+	ZTEST_IO_SETATTR,
+	ZTEST_IO_TYPES
+};
+
 typedef struct ztest_block_tag {
+	uint64_t	bt_magic;
 	uint64_t	bt_objset;
 	uint64_t	bt_object;
 	uint64_t	bt_offset;
+	uint64_t	bt_gen;
 	uint64_t	bt_txg;
-	uint64_t	bt_thread;
-	uint64_t	bt_seq;
+	uint64_t	bt_crtxg;
 } ztest_block_tag_t;
 
-typedef struct ztest_args {
-	char		za_pool[MAXNAMELEN];
-	spa_t		*za_spa;
-	objset_t	*za_os;
-	zilog_t		*za_zilog;
-	thread_t	za_thread;
-	uint64_t	za_instance;
-	uint64_t	za_random;
-	uint64_t	za_diroff;
-	uint64_t	za_diroff_shared;
-	uint64_t	za_zil_seq;
-	hrtime_t	za_start;
-	hrtime_t	za_stop;
-	hrtime_t	za_kill;
-	/*
-	 * Thread-local variables can go here to aid debugging.
-	 */
-	ztest_block_tag_t za_rbt;
-	ztest_block_tag_t za_wbt;
-	dmu_object_info_t za_doi;
-	dmu_buf_t	*za_dbuf;
-} ztest_args_t;
-
-typedef void ztest_func_t(ztest_args_t *);
+typedef struct bufwad {
+	uint64_t	bw_index;
+	uint64_t	bw_txg;
+	uint64_t	bw_data;
+} bufwad_t;
+
+/*
+ * XXX -- fix zfs range locks to be generic so we can use them here.
+ */
+typedef enum {
+	RL_READER,
+	RL_WRITER,
+	RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+	void		*rll_writer;
+	int		rll_readers;
+	mutex_t		rll_lock;
+	cond_t		rll_cv;
+} rll_t;
+
+typedef struct rl {
+	uint64_t	rl_object;
+	uint64_t	rl_offset;
+	uint64_t	rl_size;
+	rll_t		*rl_lock;
+} rl_t;
+
+#define	ZTEST_RANGE_LOCKS	64
+#define	ZTEST_OBJECT_LOCKS	64
+
+/*
+ * Object descriptor.  Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+	uint64_t	od_dir;
+	uint64_t	od_object;
+	dmu_object_type_t od_type;
+	dmu_object_type_t od_crtype;
+	uint64_t	od_blocksize;
+	uint64_t	od_crblocksize;
+	uint64_t	od_gen;
+	uint64_t	od_crgen;
+	char		od_name[MAXNAMELEN];
+} ztest_od_t;
+
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+	objset_t	*zd_os;
+	zilog_t		*zd_zilog;
+	uint64_t	zd_seq;
+	ztest_od_t	*zd_od;		/* debugging aid */
+	char		zd_name[MAXNAMELEN];
+	mutex_t		zd_dirobj_lock;
+	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
+	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+	ztest_func_t	*zi_func;	/* test function */
+	uint64_t	zi_iters;	/* iterations per execution */
+	uint64_t	*zi_interval;	/* execute every <interval> seconds */
+	uint64_t	zi_call_count;	/* per-pass count */
+	uint64_t	zi_call_time;	/* per-pass time */
+	uint64_t	zi_call_next;	/* next time to call this function */
+} ztest_info_t;
 
 /*
  * Note: these aren't static because we want dladdr() to work.
  */
 ztest_func_t ztest_dmu_read_write;
-ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
 ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_dmu_read_write_zcopy;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
 ztest_func_t ztest_fzap;
-ztest_func_t ztest_zap_parallel;
-ztest_func_t ztest_traverse;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
 ztest_func_t ztest_dsl_prop_get_set;
-ztest_func_t ztest_dmu_objset_create_destroy;
-ztest_func_t ztest_dmu_snapshot_create_destroy;
-ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_spa_prop_get_set;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_ddt_repair;
+ztest_func_t ztest_dmu_snapshot_hold;
 ztest_func_t ztest_spa_rename;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
 ztest_func_t ztest_vdev_attach_detach;
 ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_vdev_aux_add_remove;
-ztest_func_t ztest_scrub;
-ztest_func_t ztest_dmu_snapshot_hold;
-
-typedef struct ztest_info {
-	ztest_func_t	*zi_func;	/* test function */
-	uint64_t	zi_iters;	/* iterations per execution */
-	uint64_t	*zi_interval;	/* execute every <interval> seconds */
-	uint64_t	zi_calls;	/* per-pass count */
-	uint64_t	zi_call_time;	/* per-pass time */
-	uint64_t	zi_call_total;	/* cumulative total */
-	uint64_t	zi_call_target;	/* target cumulative total */
-} ztest_info_t;
-
-uint64_t zopt_always = 0;		/* all the time */
-uint64_t zopt_often = 1;		/* every second */
-uint64_t zopt_sometimes = 10;		/* every 10 seconds */
-uint64_t zopt_rarely = 60;		/* every 60 seconds */
+
+uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */
 
 ztest_info_t ztest_info[] = {
 	{ ztest_dmu_read_write,			1,	&zopt_always	},
-	{ ztest_dmu_write_parallel,		30,	&zopt_always	},
+	{ ztest_dmu_write_parallel,		10,	&zopt_always	},
 	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
-	{ ztest_dmu_commit_callbacks,		10,	&zopt_always	},
+	{ ztest_dmu_commit_callbacks,		1,	&zopt_always	},
 	{ ztest_zap,				30,	&zopt_always	},
-	{ ztest_fzap,				1,	&zopt_always	},
 	{ ztest_zap_parallel,			100,	&zopt_always	},
-	{ ztest_dmu_read_write_zcopy,		1,	&zopt_sometimes	},
-	{ ztest_dsl_prop_get_set,		1,	&zopt_sometimes	},
-	{ ztest_dmu_objset_create_destroy,	1,	&zopt_sometimes },
-	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes },
-	{ ztest_spa_create_destroy,		1,	&zopt_sometimes },
+	{ ztest_zil_commit,			1,	&zopt_incessant	},
+	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
+	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
+	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
+	{ ztest_spa_prop_get_set,		1,	&zopt_sometimes	},
+#if 0
+	{ ztest_dmu_prealloc,			1,	&zopt_sometimes	},
+#endif
+	{ ztest_fzap,				1,	&zopt_sometimes	},
+	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes	},
+	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
 	{ ztest_fault_inject,			1,	&zopt_sometimes	},
+	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
+	{ ztest_scrub,				1,	&zopt_rarely	},
+	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
 	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
 	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
-	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
 	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
 	{ ztest_vdev_aux_add_remove,		1,	&zopt_vdevtime	},
-	{ ztest_scrub,				1,	&zopt_vdevtime	},
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
 
-#define	ZTEST_SYNC_LOCKS	16
-
 /*
  * The following struct is used to hold a list of uncalled commit callbacks.
- *
  * The callbacks are ordered by txg number.
  */
 typedef struct ztest_cb_list {
@@ -244,28 +303,34 @@
  * Stuff we need to share writably between parent and child.
  */
 typedef struct ztest_shared {
-	mutex_t		zs_vdev_lock;
-	rwlock_t	zs_name_lock;
+	char		*zs_pool;
+	spa_t		*zs_spa;
+	hrtime_t	zs_proc_start;
+	hrtime_t	zs_proc_stop;
+	hrtime_t	zs_thread_start;
+	hrtime_t	zs_thread_stop;
+	hrtime_t	zs_thread_kill;
+	uint64_t	zs_enospc_count;
 	uint64_t	zs_vdev_next_leaf;
 	uint64_t	zs_vdev_aux;
-	uint64_t	zs_enospc_count;
-	hrtime_t	zs_start_time;
-	hrtime_t	zs_stop_time;
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
+	mutex_t		zs_vdev_lock;
+	rwlock_t	zs_name_lock;
 	ztest_info_t	zs_info[ZTEST_FUNCS];
-	mutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
-	uint64_t	zs_seq[ZTEST_SYNC_LOCKS];
+	ztest_ds_t	zs_zd[];
 } ztest_shared_t;
 
+#define	ID_PARALLEL	-1ULL
+
 static char ztest_dev_template[] = "%s/%s.%llua";
 static char ztest_aux_template[] = "%s/%s.%s.%llu";
-static ztest_shared_t *ztest_shared;
+ztest_shared_t *ztest_shared;
+uint64_t *ztest_seq;
 
 static int ztest_random_fd;
 static int ztest_dump_core = 1;
 
-static uint64_t metaslab_sz;
 static boolean_t ztest_exiting;
 
 /* Global commit callback list */
@@ -273,13 +338,13 @@
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
-
-#define	ZTEST_DIROBJ		1
-#define	ZTEST_MICROZAP_OBJ	2
-#define	ZTEST_FATZAP_OBJ	3
-
-#define	ZTEST_DIROBJ_BLOCKSIZE	(1 << 10)
-#define	ZTEST_DIRSIZE		256
+static uint64_t metaslab_sz;
+
+enum ztest_object {
+	ZTEST_META_DNODE = 0,
+	ZTEST_DIROBJ,
+	ZTEST_OBJECTS
+};
 
 static void usage(boolean_t) __NORETURN;
 
@@ -433,27 +498,6 @@
 	exit(requested ? 0 : 1);
 }
 
-static uint64_t
-ztest_random(uint64_t range)
-{
-	uint64_t r;
-
-	if (range == 0)
-		return (0);
-
-	if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
-		fatal(1, "short read from /dev/urandom");
-
-	return (r % range);
-}
-
-/* ARGSUSED */
-static void
-ztest_record_enospc(char *s)
-{
-	ztest_shared->zs_enospc_count++;
-}
-
 static void
 process_options(int argc, char **argv)
 {
@@ -546,10 +590,40 @@
 
 	zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
 
-	zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
+	zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
+	    UINT64_MAX >> 2);
 	zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
 }
 
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa));
+	(void) kill(getpid(), SIGKILL);
+}
+
+static uint64_t
+ztest_random(uint64_t range)
+{
+	uint64_t r;
+
+	if (range == 0)
+		return (0);
+
+	if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
+		fatal(1, "short read from /dev/urandom");
+
+	return (r % range);
+}
+
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+	ztest_shared->zs_enospc_count++;
+}
+
 static uint64_t
 ztest_get_ashift(void)
 {
@@ -687,100 +761,805 @@
 	return (root);
 }
 
-static void
-ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
+static int
+ztest_random_blocksize(void)
 {
-	int bs = SPA_MINBLOCKSHIFT +
-	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
-	int ibs = DN_MIN_INDBLKSHIFT +
-	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
-	int error;
-
-	error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
-	if (error) {
-		char osname[300];
-		dmu_objset_name(os, osname);
-		fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
-		    osname, object, 1 << bs, ibs, error);
-	}
-}
-
-static uint8_t
-ztest_random_checksum(void)
-{
-	uint8_t checksum;
-
-	do {
-		checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
-	} while (zio_checksum_table[checksum].ci_zbt);
-
-	if (checksum == ZIO_CHECKSUM_OFF)
-		checksum = ZIO_CHECKSUM_ON;
-
-	return (checksum);
-}
-
-static uint8_t
-ztest_random_compress(void)
-{
-	return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
+	return (1 << (SPA_MINBLOCKSHIFT +
+	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
 }
 
 static int
-ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
+ztest_random_ibshift(void)
+{
+	return (DN_MIN_INDBLKSHIFT +
+	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
+}
+
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
 {
-	dmu_tx_t *tx;
+	uint64_t top;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *tvd;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	do {
+		top = ztest_random(rvd->vdev_children);
+		tvd = rvd->vdev_child[top];
+	} while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
+	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
+
+	return (top);
+}
+
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
+{
+	uint64_t value;
+
+	do {
+		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
+	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
+
+	return (value);
+}
+
+static int
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+    boolean_t inherit)
+{
+	const char *propname = zfs_prop_to_name(prop);
+	const char *valname;
+	char setpoint[MAXPATHLEN];
+	uint64_t curval;
 	int error;
 
-	if (byteswap)
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
+	error = dsl_prop_set(osname, propname, sizeof (value),
+	    inherit ? 0 : 1, &value);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
 		return (error);
 	}
-
-	error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
-	    DMU_OT_NONE, 0, tx);
 	ASSERT3U(error, ==, 0);
-	dmu_tx_commit(tx);
-
-	if (zopt_verbose >= 5) {
-		char osname[MAXNAMELEN];
-		dmu_objset_name(os, osname);
-		(void) printf("replay create of %s object %llu"
-		    " in txg %llu = %d\n",
-		    osname, (u_longlong_t)lr->lr_doid,
-		    (u_longlong_t)dmu_tx_get_txg(tx), error);
+
+	VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
+	    1, &curval, setpoint), ==, 0);
+
+	if (zopt_verbose >= 6) {
+		VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
+		(void) printf("%s %s = %s at '%s'\n",
+		    osname, propname, valname, setpoint);
 	}
 
 	return (error);
 }
 
+#if 0
 static int
-ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
+ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
+{
+	spa_t *spa = zs->zs_spa;
+	nvlist_t *props = NULL;
+	int error;
+
+	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
+
+	error = spa_prop_set(spa, props);
+
+	nvlist_free(props);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	ASSERT3U(error, ==, 0);
+
+	return (error);
+}
+#endif
+
+static void
+ztest_rll_init(rll_t *rll)
+{
+	rll->rll_writer = NULL;
+	rll->rll_readers = 0;
+	VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+}
+
+static void
+ztest_rll_destroy(rll_t *rll)
+{
+	ASSERT(rll->rll_writer == NULL);
+	ASSERT(rll->rll_readers == 0);
+	VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+	VERIFY(cond_destroy(&rll->rll_cv) == 0);
+}
+
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+	VERIFY(mutex_lock(&rll->rll_lock) == 0);
+
+	if (type == RL_READER) {
+		while (rll->rll_writer != NULL)
+			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+		rll->rll_readers++;
+	} else {
+		while (rll->rll_writer != NULL || rll->rll_readers)
+			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+		rll->rll_writer = curthread;
+	}
+
+	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
+
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+	VERIFY(mutex_lock(&rll->rll_lock) == 0);
+
+	if (rll->rll_writer) {
+		ASSERT(rll->rll_readers == 0);
+		rll->rll_writer = NULL;
+	} else {
+		ASSERT(rll->rll_readers != 0);
+		ASSERT(rll->rll_writer == NULL);
+		rll->rll_readers--;
+	}
+
+	if (rll->rll_writer == NULL && rll->rll_readers == 0)
+		VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+
+	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
+
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
+{
+	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+	ztest_rll_lock(rll, type);
+}
+
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+	ztest_rll_unlock(rll);
+}
+
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+    uint64_t size, rl_type_t type)
+{
+	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+	rl_t *rl;
+
+	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+	rl->rl_object = object;
+	rl->rl_offset = offset;
+	rl->rl_size = size;
+	rl->rl_lock = rll;
+
+	ztest_rll_lock(rll, type);
+
+	return (rl);
+}
+
+static void
+ztest_range_unlock(rl_t *rl)
+{
+	rll_t *rll = rl->rl_lock;
+
+	ztest_rll_unlock(rll);
+
+	umem_free(rl, sizeof (*rl));
+}
+
+static void
+ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+{
+	zd->zd_os = os;
+	zd->zd_zilog = dmu_objset_zil(os);
+	zd->zd_seq = 0;
+	dmu_objset_name(os, zd->zd_name);
+
+	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
+
+	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+		ztest_rll_init(&zd->zd_object_lock[l]);
+
+	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
+		ztest_rll_init(&zd->zd_range_lock[l]);
+}
+
+static void
+ztest_zd_fini(ztest_ds_t *zd)
+{
+	VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
+
+	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+		ztest_rll_destroy(&zd->zd_object_lock[l]);
+
+	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
+		ztest_rll_destroy(&zd->zd_range_lock[l]);
+}
+
+#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
 {
+	uint64_t txg;
+	int error;
+
+	/*
+	 * Attempt to assign tx to some transaction group.
+	 */
+	error = dmu_tx_assign(tx, txg_how);
+	if (error) {
+		if (error == ERESTART) {
+			ASSERT(txg_how == TXG_NOWAIT);
+			dmu_tx_wait(tx);
+		} else {
+			ASSERT3U(error, ==, ENOSPC);
+			ztest_record_enospc(tag);
+		}
+		dmu_tx_abort(tx);
+		return (0);
+	}
+	txg = dmu_tx_get_txg(tx);
+	ASSERT(txg != 0);
+	return (txg);
+}
+
+static void
+ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+{
+	uint64_t *ip = buf;
+	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+
+	while (ip < ip_end)
+		*ip++ = value;
+}
+
+static boolean_t
+ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+{
+	uint64_t *ip = buf;
+	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+	uint64_t diff = 0;
+
+	while (ip < ip_end)
+		diff |= (value - *ip++);
+
+	return (diff == 0);
+}
+
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+	bt->bt_magic = BT_MAGIC;
+	bt->bt_objset = dmu_objset_id(os);
+	bt->bt_object = object;
+	bt->bt_offset = offset;
+	bt->bt_gen = gen;
+	bt->bt_txg = txg;
+	bt->bt_crtxg = crtxg;
+}
+
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+	ASSERT(bt->bt_magic == BT_MAGIC);
+	ASSERT(bt->bt_objset == dmu_objset_id(os));
+	ASSERT(bt->bt_object == object);
+	ASSERT(bt->bt_offset == offset);
+	ASSERT(bt->bt_gen <= gen);
+	ASSERT(bt->bt_txg <= txg);
+	ASSERT(bt->bt_crtxg == crtxg);
+}
+
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+	dmu_object_info_t doi;
+	ztest_block_tag_t *bt;
+
+	dmu_object_info_from_db(db, &doi);
+	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+
+	return (bt);
+}
+
+/*
+ * ZIL logging ops
+ */
+
+#define	lrz_type	lr_mode
+#define	lrz_blocksize	lr_uid
+#define	lrz_ibshift	lr_gid
+#define	lrz_bonustype	lr_rdev
+#define	lrz_bonuslen	lr_crtime[1]
+
+static uint64_t
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t namesize = strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) + namesize - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t namesize = strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) + namesize - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+	itx_t *itx;
+	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	if (lr->lr_length > ZIL_MAX_LOG_DATA)
+		write_state = WR_INDIRECT;
+
+	itx = zil_itx_create(TX_WRITE,
+	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
+	if (write_state == WR_COPIED &&
+	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+		zil_itx_destroy(itx);
+		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+		write_state = WR_NEED_COPY;
+	}
+	itx->itx_private = zd;
+	itx->itx_wr_state = write_state;
+	itx->itx_sync = (ztest_random(8) == 0);
+	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+{
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * ZIL replay ops
+ */
+static int
+ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	objset_t *os = zd->zd_os;
+	ztest_block_tag_t *bbt;
+	dmu_buf_t *db;
 	dmu_tx_t *tx;
-	int error;
+	uint64_t txg;
+	int error = 0;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
+	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+	ASSERT(name[0] != '\0');
+
 	tx = dmu_tx_create(os);
-	dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
-	error = dmu_tx_assign(tx, TXG_WAIT);
+
+	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	} else {
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+	}
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0)
+		return (ENOSPC);
+
+	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+
+	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+		if (lr->lr_foid == 0) {
+			lr->lr_foid = zap_create(os,
+			    lr->lrz_type, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		} else {
+			error = zap_create_claim(os, lr->lr_foid,
+			    lr->lrz_type, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		}
+	} else {
+		if (lr->lr_foid == 0) {
+			lr->lr_foid = dmu_object_alloc(os,
+			    lr->lrz_type, 0, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		} else {
+			error = dmu_object_claim(os, lr->lr_foid,
+			    lr->lrz_type, 0, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		}
+	}
+
 	if (error) {
-		dmu_tx_abort(tx);
+		ASSERT3U(error, ==, EEXIST);
+		ASSERT(zd->zd_zilog->zl_replay);
+		dmu_tx_commit(tx);
 		return (error);
 	}
 
-	error = dmu_object_free(os, lr->lr_doid, tx);
+	ASSERT(lr->lr_foid != 0);
+
+	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+		    lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+	bbt = ztest_bt_bonus(db);
+	dmu_buf_will_dirty(db, tx);
+	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+	dmu_buf_rele(db, FTAG);
+
+	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+	    &lr->lr_foid, tx));
+
+	(void) ztest_log_create(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+static int
+ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	objset_t *os = zd->zd_os;
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	uint64_t object, txg;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+	ASSERT(name[0] != '\0');
+
+	VERIFY3U(0, ==,
+	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+	ASSERT(object != 0);
+
+	ztest_object_lock(zd, object, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		ztest_object_unlock(zd, object);
+		return (ENOSPC);
+	}
+
+	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+		VERIFY3U(0, ==, zap_destroy(os, object, tx));
+	} else {
+		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+	}
+
+	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+	(void) ztest_log_remove(zd, tx, lr);
+
 	dmu_tx_commit(tx);
 
-	return (error);
+	ztest_object_unlock(zd, object);
+
+	return (0);
+}
+
+static int
+ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	void *data = lr + 1;			/* data follows lr */
+	uint64_t offset, length;
+	ztest_block_tag_t *bt = data;
+	ztest_block_tag_t *bbt;
+	uint64_t gen, txg, lrtxg, crtxg;
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	arc_buf_t *abuf = NULL;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	/* If it's a dmu_sync() block, write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+		if (length < blocksize) {
+			offset -= offset % blocksize;
+			length = blocksize;
+		}
+	}
+
+	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+		byteswap_uint64_array(bt, sizeof (*bt));
+
+	if (bt->bt_magic != BT_MAGIC)
+		bt = NULL;
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	dmu_object_info_from_db(db, &doi);
+
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	gen = bbt->bt_gen;
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+	    P2PHASE(offset, length) == 0)
+		abuf = dmu_request_arcbuf(db, length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		if (abuf != NULL)
+			dmu_return_arcbuf(abuf);
+		dmu_buf_rele(db, FTAG);
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	if (bt != NULL) {
+		/*
+		 * Usually, verify the old data before writing new data --
+		 * but not always, because we also want to verify correct
+		 * behavior when the data was not recently read into cache.
+		 */
+		ASSERT(offset % doi.doi_data_block_size == 0);
+		if (ztest_random(4) != 0) {
+			int prefetch = ztest_random(2) ?
+			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+			ztest_block_tag_t rbt;
+
+			VERIFY(dmu_read(os, lr->lr_foid, offset,
+			    sizeof (rbt), &rbt, prefetch) == 0);
+			if (rbt.bt_magic == BT_MAGIC) {
+				ztest_bt_verify(&rbt, os, lr->lr_foid,
+				    offset, gen, txg, crtxg);
+			}
+		}
+
+		/*
+		 * Writes can appear to be newer than the bonus buffer because
+		 * the ztest_get_data() callback does a dmu_read() of the
+		 * open-context data, which may be different than the data
+		 * as it was when the write was generated.
+		 */
+		if (zd->zd_zilog->zl_replay) {
+			ztest_bt_verify(bt, os, lr->lr_foid, offset,
+			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+			    bt->bt_crtxg);
+		}
+
+		/*
+		 * Set the bt's gen/txg to the bonus buffer's gen/txg
+		 * so that all of the usual ASSERTs will work.
+		 */
+		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+	}
+
+	if (abuf == NULL) {
+		dmu_write(os, lr->lr_foid, offset, length, data, tx);
+	} else {
+		bcopy(data, abuf->b_data, length);
+		dmu_assign_arcbuf(db, offset, abuf, tx);
+	}
+
+	(void) ztest_log_write(zd, tx, lr);
+
+	dmu_buf_rele(db, FTAG);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+static int
+ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+	    RL_WRITER);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+	    lr->lr_length, tx) == 0);
+
+	(void) ztest_log_truncate(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+static int
+ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	ztest_block_tag_t *bbt;
+	uint64_t txg, lrtxg, crtxg;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	if (zd->zd_zilog->zl_replay) {
+		ASSERT(lr->lr_size != 0);
+		ASSERT(lr->lr_mode != 0);
+		ASSERT(lrtxg != 0);
+	} else {
+		/*
+		 * Randomly change the size and increment the generation.
+		 */
+		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+		    sizeof (*bbt);
+		lr->lr_mode = bbt->bt_gen + 1;
+		ASSERT(lrtxg == 0);
+	}
+
+	/*
+	 * Verify that the current bonus buffer is not newer than our txg.
+	 */
+	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+	    MAX(txg, lrtxg), crtxg);
+
+	dmu_buf_will_dirty(db, tx);
+
+	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+	ASSERT3U(lr->lr_size, <=, db->db_size);
+	VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+	bbt = ztest_bt_bonus(db);
+
+	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+	dmu_buf_rele(db, FTAG);
+
+	(void) ztest_log_setattr(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
 }
 
 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
@@ -793,9 +1572,9 @@
 	NULL,			/* TX_RMDIR */
 	NULL,			/* TX_LINK */
 	NULL,			/* TX_RENAME */
-	NULL,			/* TX_WRITE */
-	NULL,			/* TX_TRUNCATE */
-	NULL,			/* TX_SETATTR */
+	ztest_replay_write,	/* TX_WRITE */
+	ztest_replay_truncate,	/* TX_TRUNCATE */
+	ztest_replay_setattr,	/* TX_SETATTR */
 	NULL,			/* TX_ACL */
 	NULL,			/* TX_CREATE_ACL */
 	NULL,			/* TX_CREATE_ATTR */
@@ -807,13 +1586,472 @@
 };
 
 /*
+ * ZIL get_data callbacks
+ */
+
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+	ztest_ds_t *zd = zgd->zgd_private;
+	uint64_t object = zgd->zgd_rl->rl_object;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	ztest_range_unlock(zgd->zgd_rl);
+	ztest_object_unlock(zd, object);
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+	umem_free(zgd, sizeof (*zgd));
+}
+
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+	ztest_ds_t *zd = arg;
+	objset_t *os = zd->zd_os;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	blkptr_t *bp = &lr->lr_blkptr;
+	uint64_t txg = lr->lr_common.lrc_txg;
+	uint64_t crtxg;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ztest_object_lock(zd, object, RL_READER);
+	error = dmu_bonus_hold(os, object, FTAG, &db);
+	if (error) {
+		ztest_object_unlock(zd, object);
+		return (error);
+	}
+
+	crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+	if (crtxg == 0 || crtxg > txg) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, object);
+		return (ENOENT);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	dmu_buf_rele(db, FTAG);
+	db = NULL;
+
+	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+	zgd->zgd_zilog = zd->zd_zilog;
+	zgd->zgd_private = zd;
+
+	if (buf != NULL) {	/* immediate write */
+		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+		    RL_READER);
+
+		error = dmu_read(os, object, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+		ASSERT(error == 0);
+	} else {
+		size = doi.doi_data_block_size;
+		if (ISP2(size))
+			offset = P2ALIGN(offset, size);
+
+		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+		    RL_READER);
+
+		error = dmu_buf_hold(os, object, offset, zgd, &db);
+
+		if (error == 0) {
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    ztest_get_done, zgd);
+
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	ztest_get_done(zgd, error);
+
+	return (error);
+}
+
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+	char *lr;
+	size_t namesize = name ? strlen(name) + 1 : 0;
+
+	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+	if (name)
+		bcopy(name, lr + lrsize, namesize);
+
+	return (lr);
+}
+
+void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+	size_t namesize = name ? strlen(name) + 1 : 0;
+
+	umem_free(lr, lrsize + namesize);
+}
+
+/*
+ * Lookup a bunch of objects.  Returns the number of objects not found.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int error;
+
+	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+	for (int i = 0; i < count; i++, od++) {
+		od->od_object = 0;
+		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+		    sizeof (uint64_t), 1, &od->od_object);
+		if (error) {
+			ASSERT(error == ENOENT);
+			ASSERT(od->od_object == 0);
+			missing++;
+		} else {
+			dmu_buf_t *db;
+			ztest_block_tag_t *bbt;
+			dmu_object_info_t doi;
+
+			ASSERT(od->od_object != 0);
+			ASSERT(missing == 0);	/* there should be no gaps */
+
+			ztest_object_lock(zd, od->od_object, RL_READER);
+			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+			    od->od_object, FTAG, &db));
+			dmu_object_info_from_db(db, &doi);
+			bbt = ztest_bt_bonus(db);
+			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+			od->od_type = doi.doi_type;
+			od->od_blocksize = doi.doi_data_block_size;
+			od->od_gen = bbt->bt_gen;
+			dmu_buf_rele(db, FTAG);
+			ztest_object_unlock(zd, od->od_object);
+		}
+	}
+
+	return (missing);
+}
+
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+
+	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+	for (int i = 0; i < count; i++, od++) {
+		if (missing) {
+			od->od_object = 0;
+			missing++;
+			continue;
+		}
+
+		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+		lr->lr_doid = od->od_dir;
+		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
+		lr->lrz_type = od->od_crtype;
+		lr->lrz_blocksize = od->od_crblocksize;
+		lr->lrz_ibshift = ztest_random_ibshift();
+		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+		lr->lrz_bonuslen = dmu_bonus_max();
+		lr->lr_gen = od->od_crgen;
+		lr->lr_crtime[0] = time(NULL);
+
+		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+			ASSERT(missing == 0);
+			od->od_object = 0;
+			missing++;
+		} else {
+			od->od_object = lr->lr_foid;
+			od->od_type = od->od_crtype;
+			od->od_blocksize = od->od_crblocksize;
+			od->od_gen = od->od_crgen;
+			ASSERT(od->od_object != 0);
+		}
+
+		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+	}
+
+	return (missing);
+}
+
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int error;
+
+	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+	od += count - 1;
+
+	for (int i = count - 1; i >= 0; i--, od--) {
+		if (missing) {
+			missing++;
+			continue;
+		}
+
+		if (od->od_object == 0)
+			continue;
+
+		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+		lr->lr_doid = od->od_dir;
+
+		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+			ASSERT3U(error, ==, ENOSPC);
+			missing++;
+		} else {
+			od->od_object = 0;
+		}
+		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+	}
+
+	return (missing);
+}
+
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+    void *data)
+{
+	lr_write_t *lr;
+	int error;
+
+	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+
+	lr->lr_foid = object;
+	lr->lr_offset = offset;
+	lr->lr_length = size;
+	lr->lr_blkoff = 0;
+	BP_ZERO(&lr->lr_blkptr);
+
+	bcopy(data, lr + 1, size);
+
+	error = ztest_replay_write(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+
+	return (error);
+}
+
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	lr_truncate_t *lr;
+	int error;
+
+	lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+	lr->lr_foid = object;
+	lr->lr_offset = offset;
+	lr->lr_length = size;
+
+	error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+	lr_setattr_t *lr;
+	int error;
+
+	lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+	lr->lr_foid = object;
+	lr->lr_size = 0;
+	lr->lr_mode = 0;
+
+	error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	ztest_object_lock(zd, object, RL_READER);
+	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, object, offset, size);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+	if (txg != 0) {
+		dmu_prealloc(os, object, offset, size, tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dmu_objset_pool(os), txg);
+	} else {
+		(void) dmu_free_long_range(os, object, offset, size);
+	}
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, object);
+}
+
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+	ztest_block_tag_t wbt;
+	dmu_object_info_t doi;
+	enum ztest_io_type io_type;
+	uint64_t blocksize;
+	void *data;
+
+	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+	blocksize = doi.doi_data_block_size;
+	data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+	/*
+	 * Pick an i/o type at random, biased toward writing block tags.
+	 */
+	io_type = ztest_random(ZTEST_IO_TYPES);
+	if (ztest_random(2) == 0)
+		io_type = ZTEST_IO_WRITE_TAG;
+
+	switch (io_type) {
+
+	case ZTEST_IO_WRITE_TAG:
+		ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+		break;
+
+	case ZTEST_IO_WRITE_PATTERN:
+		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
+		if (ztest_random(2) == 0) {
+			/*
+			 * Induce fletcher2 collisions to ensure that
+			 * zio_ddt_collision() detects and resolves them
+			 * when using fletcher2-verify for deduplication.
+			 */
+			((uint64_t *)data)[0] ^= 1ULL << 63;
+			((uint64_t *)data)[4] ^= 1ULL << 63;
+		}
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	case ZTEST_IO_WRITE_ZEROES:
+		bzero(data, blocksize);
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	case ZTEST_IO_TRUNCATE:
+		(void) ztest_truncate(zd, object, offset, blocksize);
+		break;
+
+	case ZTEST_IO_SETATTR:
+		(void) ztest_setattr(zd, object);
+		break;
+	}
+
+	umem_free(data, blocksize);
+}
+
+/*
+ * Initialize an object description template.
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+{
+	od->od_dir = ZTEST_DIROBJ;
+	od->od_object = 0;
+
+	od->od_crtype = type;
+	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+	od->od_crgen = gen;
+
+	od->od_type = DMU_OT_NONE;
+	od->od_blocksize = 0;
+	od->od_gen = 0;
+
+	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+	    tag, (int64_t)id, index);
+}
+
+/*
+ * Lookup or create the objects for a test using the od template.
+ * If the objects do not all exist, or if 'remove' is specified,
+ * remove any existing objects and create new ones.  Otherwise,
+ * use the existing objects.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+	int count = size / sizeof (*od);
+	int rv = 0;
+
+	VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+	    (ztest_remove(zd, od, count) != 0 ||
+	    ztest_create(zd, od, count) != 0))
+		rv = -1;
+	zd->zd_od = od;
+	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+
+	return (rv);
+}
+
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+	zilog_t *zilog = zd->zd_zilog;
+
+	zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+
+	/*
+	 * Remember the committed values in zd, which is in parent/child
+	 * shared memory.  If we die, the next iteration of ztest_run()
+	 * will verify that the log really does contain this record.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+	zd->zd_seq = zilog->zl_commit_lr_seq;
+	mutex_exit(&zilog->zl_lock);
+}
+
+/*
  * Verify that we can't destroy an active pool, create an existing pool,
  * or create a pool with a bad vdev spec.
  */
+/* ARGSUSED */
 void
-ztest_spa_create_destroy(ztest_args_t *za)
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
-	int error;
+	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa;
 	nvlist_t *nvroot;
 
@@ -821,41 +2059,31 @@
 	 * Attempt to create using a bad file.
 	 */
 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-	error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
+	VERIFY3U(ENOENT, ==,
+	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
-	if (error != ENOENT)
-		fatal(0, "spa_create(bad_file) = %d", error);
 
 	/*
 	 * Attempt to create using a bad mirror.
 	 */
 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
-	error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
+	VERIFY3U(ENOENT, ==,
+	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
-	if (error != ENOENT)
-		fatal(0, "spa_create(bad_mirror) = %d", error);
 
 	/*
 	 * Attempt to create an existing pool.  It shouldn't matter
 	 * what's in the nvroot; we should fail with EEXIST.
 	 */
-	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	(void) rw_rdlock(&zs->zs_name_lock);
 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-	error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
+	VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
-	if (error != EEXIST)
-		fatal(0, "spa_create(whatever) = %d", error);
-
-	error = spa_open(za->za_pool, &spa, FTAG);
-	if (error)
-		fatal(0, "spa_open() = %d", error);
-
-	error = spa_destroy(za->za_pool);
-	if (error != EBUSY)
-		fatal(0, "spa_destroy() = %d", error);
-
+	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+	VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
 	spa_close(spa, FTAG);
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+
+	(void) rw_unlock(&zs->zs_name_lock);
 }
 
 static vdev_t *
@@ -897,16 +2125,18 @@
 /*
  * Verify that vdev_add() works as expected.
  */
+/* ARGSUSED */
 void
-ztest_vdev_add_remove(ztest_args_t *za)
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 {
-	spa_t *spa = za->za_spa;
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t guid;
 	nvlist_t *nvroot;
 	int error;
 
-	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -919,7 +2149,7 @@
 		/*
 		 * Grab the guid from the head of the log class rotor.
 		 */
-		guid = spa->spa_log_class->mc_rotor->mg_vd->vdev_guid;
+		guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
 
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 
@@ -931,9 +2161,9 @@
 		 * dmu_objset_destroy() to fail with EBUSY thus
 		 * leaving the dataset in an inconsistent state.
 		 */
-		(void) rw_wrlock(&ztest_shared->zs_name_lock);
+		VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
 		error = spa_vdev_remove(spa, guid, B_FALSE);
-		(void) rw_unlock(&ztest_shared->zs_name_lock);
+		VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
 
 		if (error && error != EEXIST)
 			fatal(0, "spa_vdev_remove() = %d", error);
@@ -955,16 +2185,18 @@
 			fatal(0, "spa_vdev_add() = %d", error);
 	}
 
-	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
 }
 
 /*
  * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
  */
+/* ARGSUSED */
 void
-ztest_vdev_aux_add_remove(ztest_args_t *za)
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 {
-	spa_t *spa = za->za_spa;
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	spa_aux_vdev_t *sav;
 	char *aux;
@@ -979,7 +2211,7 @@
 		aux = ZPOOL_CONFIG_L2CACHE;
 	}
 
-	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -992,12 +2224,12 @@
 		/*
 		 * Find an unused device we can add.
 		 */
-		ztest_shared->zs_vdev_aux = 0;
+		zs->zs_vdev_aux = 0;
 		for (;;) {
 			char path[MAXPATHLEN];
 			int c;
 			(void) sprintf(path, ztest_aux_template, zopt_dir,
-			    zopt_pool, aux, ztest_shared->zs_vdev_aux);
+			    zopt_pool, aux, zs->zs_vdev_aux);
 			for (c = 0; c < sav->sav_count; c++)
 				if (strcmp(sav->sav_vdevs[c]->vdev_path,
 				    path) == 0)
@@ -1005,7 +2237,7 @@
 			if (c == sav->sav_count &&
 			    vdev_lookup_by_path(rvd, path) == NULL)
 				break;
-			ztest_shared->zs_vdev_aux++;
+			zs->zs_vdev_aux++;
 		}
 	}
 
@@ -1035,16 +2267,18 @@
 			fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
 	}
 
-	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 }
 
 /*
  * Verify that we can attach and detach devices.
  */
+/* ARGSUSED */
 void
-ztest_vdev_attach_detach(ztest_args_t *za)
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 {
-	spa_t *spa = za->za_spa;
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
@@ -1061,7 +2295,7 @@
 	int oldvd_is_log;
 	int error, expected_error;
 
-	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -1073,7 +2307,7 @@
 	/*
 	 * Pick a random top-level vdev.
 	 */
-	top = ztest_random(rvd->vdev_children);
+	top = ztest_random_vdev_top(spa, B_TRUE);
 
 	/*
 	 * Pick a random leaf within it.
@@ -1121,7 +2355,7 @@
 		if (error != 0 && error != ENODEV && error != EBUSY &&
 		    error != ENOTSUP)
 			fatal(0, "detach (%s) returned %d", oldpath, error);
-		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 		return;
 	}
 
@@ -1214,7 +2448,7 @@
 		    (longlong_t)newsize, replacing, error, expected_error);
 	}
 
-	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 }
 
 /*
@@ -1291,7 +2525,8 @@
 		if (zopt_verbose >= 5) {
 			(void) printf("vdev configuration has changed, "
 			    "guid %llu, state %llu, expected gen %llu, "
-			    "got gen %llu\n", (u_longlong_t)guid,
+			    "got gen %llu\n",
+			    (u_longlong_t)guid,
 			    (u_longlong_t)tvd->vdev_state,
 			    (u_longlong_t)generation,
 			    (u_longlong_t)spa->spa_config_generation);
@@ -1329,23 +2564,29 @@
 /*
  * Verify that dynamic LUN growth works as expected.
  */
+/* ARGSUSED */
 void
-ztest_vdev_LUN_growth(ztest_args_t *za)
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
 {
-	spa_t *spa = za->za_spa;
-	vdev_t *vd, *tvd = NULL;
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
+	vdev_t *vd, *tvd;
+	metaslab_class_t *mc;
+	metaslab_group_t *mg;
 	size_t psize, newsize;
-	uint64_t spa_newsize, spa_cursize, ms_count;
-
-	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+	uint64_t top;
+	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
-	while (tvd == NULL || tvd->vdev_islog) {
-		uint64_t vdev;
-
-		vdev = ztest_random(spa->spa_root_vdev->vdev_children);
-		tvd = spa->spa_root_vdev->vdev_child[vdev];
-	}
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	mg = tvd->vdev_mg;
+	mc = mg->mg_class;
+	old_ms_count = tvd->vdev_ms_count;
+	old_class_space = metaslab_class_get_space(mc);
 
 	/*
 	 * Determine the size of the first leaf vdev associated with
@@ -1364,21 +2605,18 @@
 	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
 	    psize == 0 || psize >= 4 * zopt_vdev_size) {
 		spa_config_exit(spa, SCL_STATE, spa);
-		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 		return;
 	}
 	ASSERT(psize > 0);
 	newsize = psize + psize / 8;
 	ASSERT3U(newsize, >, psize);
 
-	if (zopt_verbose >= 5) {
-		(void) printf("Expanding vdev %s from %lu to %lu\n",
+	if (zopt_verbose >= 6) {
+		(void) printf("Expanding LUN %s from %lu to %lu\n",
 		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
 	}
 
-	spa_cursize = spa_get_space(spa);
-	ms_count = tvd->vdev_ms_count;
-
 	/*
 	 * Growing the vdev is a two step process:
 	 *	1). expand the physical size (i.e. relabel)
@@ -1391,154 +2629,176 @@
 			(void) printf("Could not expand LUN because "
 			    "the vdev configuration changed.\n");
 		}
-		(void) spa_config_exit(spa, SCL_STATE, spa);
-		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		spa_config_exit(spa, SCL_STATE, spa);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 		return;
 	}
 
-	(void) spa_config_exit(spa, SCL_STATE, spa);
+	spa_config_exit(spa, SCL_STATE, spa);
 
 	/*
 	 * Expanding the LUN will update the config asynchronously,
 	 * thus we must wait for the async thread to complete any
 	 * pending tasks before proceeding.
 	 */
-	mutex_enter(&spa->spa_async_lock);
-	while (spa->spa_async_thread != NULL || spa->spa_async_tasks)
-		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
-	mutex_exit(&spa->spa_async_lock);
+	for (;;) {
+		boolean_t done;
+		mutex_enter(&spa->spa_async_lock);
+		done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+		mutex_exit(&spa->spa_async_lock);
+		if (done)
+			break;
+		txg_wait_synced(spa_get_dsl(spa), 0);
+		(void) poll(NULL, 0, 100);
+	}
 
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
-	spa_newsize = spa_get_space(spa);
+
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	new_ms_count = tvd->vdev_ms_count;
+	new_class_space = metaslab_class_get_space(mc);
+
+	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+		if (zopt_verbose >= 5) {
+			(void) printf("Could not verify LUN expansion due to "
+			    "intervening vdev offline or remove.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		return;
+	}
+
+	/*
+	 * Make sure we were able to grow the vdev.
+	 */
+	if (new_ms_count <= old_ms_count)
+		fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+		    old_ms_count, new_ms_count);
 
 	/*
 	 * Make sure we were able to grow the pool.
 	 */
-	if (ms_count >= tvd->vdev_ms_count ||
-	    spa_cursize >= spa_newsize) {
-		(void) printf("Top-level vdev metaslab count: "
-		    "before %llu, after %llu\n",
-		    (u_longlong_t)ms_count,
-		    (u_longlong_t)tvd->vdev_ms_count);
-		fatal(0, "LUN expansion failed: before %llu, "
-		    "after %llu\n", spa_cursize, spa_newsize);
-	} else if (zopt_verbose >= 5) {
+	if (new_class_space <= old_class_space)
+		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+		    old_class_space, new_class_space);
+
+	if (zopt_verbose >= 5) {
 		char oldnumbuf[6], newnumbuf[6];
 
-		nicenum(spa_cursize, oldnumbuf);
-		nicenum(spa_newsize, newnumbuf);
+		nicenum(old_class_space, oldnumbuf);
+		nicenum(new_class_space, newnumbuf);
 		(void) printf("%s grew from %s to %s\n",
 		    spa->spa_name, oldnumbuf, newnumbuf);
 	}
+
 	spa_config_exit(spa, SCL_STATE, spa);
-	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+/* ARGSUSED */
+static void
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	/*
+	 * Create the objects common to all ztest datasets.
+	 */
+	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
+	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
 /* ARGSUSED */
-static void
-ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+static int
+ztest_objset_destroy_cb(char *name, void *arg)
 {
-	/*
-	 * Create the directory object.
-	 */
-	VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
-	    DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
-	    DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
-
-	VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
-	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
-
-	VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
-	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
-}
-
-static int
-ztest_destroy_cb(char *name, void *arg)
-{
-	ztest_args_t *za = arg;
 	objset_t *os;
-	dmu_object_info_t *doi = &za->za_doi;
+	dmu_object_info_t doi;
 	int error;
 
 	/*
 	 * Verify that the dataset contains a directory object.
 	 */
-	error = dmu_objset_hold(name, FTAG, &os);
-	ASSERT3U(error, ==, 0);
-	error = dmu_object_info(os, ZTEST_DIROBJ, doi);
+	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
 		ASSERT3U(error, ==, 0);
-		ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
-		ASSERT3S(doi->doi_physical_blks, >=, 0);
+		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
 	}
 	dmu_objset_rele(os, FTAG);
 
 	/*
 	 * Destroy the dataset.
 	 */
-	error = dmu_objset_destroy(name, B_FALSE);
-	if (error) {
-		(void) dmu_objset_hold(name, FTAG, &os);
-		fatal(0, "dmu_objset_destroy(os=%p) = %d\n", os, error);
-	}
+	VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
 	return (0);
 }
 
-/*
- * Verify that dmu_objset_{create,destroy,open,close} work as expected.
- */
-static uint64_t
-ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
 {
-	itx_t *itx;
-	lr_create_t *lr;
-	size_t namesize;
-	char name[24];
-
-	(void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
-	namesize = strlen(name) + 1;
-
-	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
-	    ztest_random(ZIL_MAX_BLKSZ));
-	lr = (lr_create_t *)&itx->itx_lr;
-	bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
-	lr->lr_doid = object;
-	lr->lr_foid = 0;
-	lr->lr_mode = mode;
-	lr->lr_uid = 0;
-	lr->lr_gid = 0;
-	lr->lr_gen = dmu_tx_get_txg(tx);
-	lr->lr_crtime[0] = time(NULL);
-	lr->lr_crtime[1] = 0;
-	lr->lr_rdev = 0;
-	bcopy(name, (char *)(lr + 1), namesize);
-
-	return (zil_itx_assign(zilog, itx, tx));
+	char snapname[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+	    (u_longlong_t)id);
+
+	error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+	    NULL, B_FALSE);
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (B_FALSE);
+	}
+	if (error != 0 && error != EEXIST)
+		fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+	return (B_TRUE);
 }
 
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+	char snapname[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+	    (u_longlong_t)id);
+
+	error = dmu_objset_destroy(snapname, B_FALSE);
+	if (error != 0 && error != ENOENT)
+		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+	return (B_TRUE);
+}
+
+/* ARGSUSED */
 void
-ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
+	ztest_shared_t *zs = ztest_shared;
+	ztest_ds_t zdtmp;
+	int iters;
 	int error;
 	objset_t *os, *os2;
-	char name[100];
+	char name[MAXNAMELEN];
 	zilog_t *zilog;
-	uint64_t seq;
-	uint64_t objects;
-
-	(void) rw_rdlock(&ztest_shared->zs_name_lock);
-	(void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
-	    (u_longlong_t)za->za_instance);
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+	(void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+	    zs->zs_pool, (u_longlong_t)id);
 
 	/*
 	 * If this dataset exists from a previous run, process its replay log
 	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
-	 * (invoked from ztest_destroy_cb() below) should just throw it away.
+	 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
 	 */
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
-		zil_replay(os, os, ztest_replay_vector);
+		ztest_zd_init(&zdtmp, os);
+		zil_replay(os, &zdtmp, ztest_replay_vector);
+		ztest_zd_fini(&zdtmp);
 		dmu_objset_disown(os, FTAG);
 	}
 
@@ -1547,152 +2807,106 @@
 	 * create lying around from a previous run.  If so, destroy it
 	 * and all of its snapshots.
 	 */
-	(void) dmu_objset_find(name, ztest_destroy_cb, za,
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
 	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 */
-	error = dmu_objset_hold(name, FTAG, &os);
-	if (error != ENOENT)
-		fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
-		    name, os);
+	VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
 
 	/*
 	 * Verify that we can create a new dataset.
 	 */
 	error = dmu_objset_create(name, DMU_OST_OTHER, 0,
-	    ztest_create_cb, NULL);
+	    ztest_objset_create_cb, NULL);
 	if (error) {
 		if (error == ENOSPC) {
-			ztest_record_enospc("dmu_objset_create");
-			(void) rw_unlock(&ztest_shared->zs_name_lock);
+			ztest_record_enospc(FTAG);
+			(void) rw_unlock(&zs->zs_name_lock);
 			return;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
-	error = dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os);
-	if (error) {
-		fatal(0, "dmu_objset_open(%s) = %d", name, error);
-	}
+	VERIFY3U(0, ==,
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+
+	ztest_zd_init(&zdtmp, os);
 
 	/*
 	 * Open the intent log for it.
 	 */
-	zilog = zil_open(os, NULL);
+	zilog = zil_open(os, ztest_get_data);
 
 	/*
-	 * Put a random number of objects in there.
+	 * Put some objects in there, do a little I/O to them,
+	 * and randomly take a couple of snapshots along the way.
 	 */
-	objects = ztest_random(20);
-	seq = 0;
-	while (objects-- != 0) {
-		uint64_t object;
-		dmu_tx_t *tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(tx);
-		} else {
-			object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-			    DMU_OT_NONE, 0, tx);
-			ztest_set_random_blocksize(os, object, tx);
-			seq = ztest_log_create(zilog, tx, object,
-			    DMU_OT_UINT64_OTHER);
-			dmu_write(os, object, 0, sizeof (name), name, tx);
-			dmu_tx_commit(tx);
-		}
-		if (ztest_random(5) == 0) {
-			zil_commit(zilog, seq, object);
-		}
-		if (ztest_random(100) == 0) {
-			error = zil_suspend(zilog);
-			if (error == 0) {
-				zil_resume(zilog);
-			}
-		}
+	iters = ztest_random(5);
+	for (int i = 0; i < iters; i++) {
+		ztest_dmu_object_alloc_free(&zdtmp, id);
+		if (ztest_random(iters) == 0)
+			(void) ztest_snapshot_create(name, i);
 	}
 
 	/*
 	 * Verify that we cannot create an existing dataset.
 	 */
-	error = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL);
-	if (error != EEXIST)
-		fatal(0, "created existing dataset, error = %d", error);
+	VERIFY3U(EEXIST, ==,
+	    dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
 
 	/*
 	 * Verify that we can hold an objset that is also owned.
 	 */
-	error = dmu_objset_hold(name, FTAG, &os2);
-	if (error)
-		fatal(0, "dmu_objset_open('%s') = %d", name, error);
+	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
 	dmu_objset_rele(os2, FTAG);
 
 	/*
-	 * Verify that we can not own an objset that is already owned.
+	 * Verify that we cannot own an objset that is already owned.
 	 */
-	error = dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2);
-	if (error != EBUSY)
-		fatal(0, "dmu_objset_open('%s') = %d, expected EBUSY",
-		    name, error);
+	VERIFY3U(EBUSY, ==,
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
 
 	zil_close(zilog);
 	dmu_objset_disown(os, FTAG);
-
-	error = dmu_objset_destroy(name, B_FALSE);
-	if (error)
-		fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
-
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+	ztest_zd_fini(&zdtmp);
+
+	(void) rw_unlock(&zs->zs_name_lock);
 }
 
 /*
  * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
  */
 void
-ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
-	int error;
-	objset_t *os = za->za_os;
-	char snapname[100];
-	char osname[MAXNAMELEN];
-
-	(void) rw_rdlock(&ztest_shared->zs_name_lock);
-	dmu_objset_name(os, osname);
-	(void) snprintf(snapname, 100, "%s@%llu", osname,
-	    (u_longlong_t)za->za_instance);
-
-	error = dmu_objset_destroy(snapname, B_FALSE);
-	if (error != 0 && error != ENOENT)
-		fatal(0, "dmu_objset_destroy() = %d", error);
-	error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
-	    NULL, FALSE);
-	if (error == ENOSPC)
-		ztest_record_enospc("dmu_take_snapshot");
-	else if (error != 0 && error != EEXIST)
-		fatal(0, "dmu_take_snapshot() = %d", error);
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+	ztest_shared_t *zs = ztest_shared;
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) ztest_snapshot_destroy(zd->zd_name, id);
+	(void) ztest_snapshot_create(zd->zd_name, id);
+	(void) rw_unlock(&zs->zs_name_lock);
 }
 
 /*
  * Cleanup non-standard snapshots and clones.
  */
 void
-ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
 {
-	char snap1name[100];
-	char clone1name[100];
-	char snap2name[100];
-	char clone2name[100];
-	char snap3name[100];
+	char snap1name[MAXNAMELEN];
+	char clone1name[MAXNAMELEN];
+	char snap2name[MAXNAMELEN];
+	char clone2name[MAXNAMELEN];
+	char snap3name[MAXNAMELEN];
 	int error;
 
-	(void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
-	(void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
-	(void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
-	(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
-	(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
 	error = dmu_objset_destroy(clone2name, B_FALSE);
 	if (error && error != ENOENT)
@@ -1715,36 +2929,34 @@
  * Verify dsl_dataset_promote handles EBUSY
  */
 void
-ztest_dsl_dataset_promote_busy(ztest_args_t *za)
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
-	int error;
-	objset_t *os = za->za_os;
+	ztest_shared_t *zs = ztest_shared;
 	objset_t *clone;
 	dsl_dataset_t *ds;
-	char snap1name[100];
-	char clone1name[100];
-	char snap2name[100];
-	char clone2name[100];
-	char snap3name[100];
-	char osname[MAXNAMELEN];
-	uint64_t curval = za->za_instance;
-
-	(void) rw_rdlock(&ztest_shared->zs_name_lock);
-
-	dmu_objset_name(os, osname);
-	ztest_dsl_dataset_cleanup(osname, curval);
-
-	(void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
-	(void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
-	(void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
-	(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
-	(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+	char snap1name[MAXNAMELEN];
+	char clone1name[MAXNAMELEN];
+	char snap2name[MAXNAMELEN];
+	char clone2name[MAXNAMELEN];
+	char snap3name[MAXNAMELEN];
+	char *osname = zd->zd_name;
+	int error;
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
 	error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
-	    NULL, FALSE);
+	    NULL, B_FALSE);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
-			ztest_record_enospc("dmu_take_snapshot");
+			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
@@ -1758,27 +2970,27 @@
 	dmu_objset_rele(clone, FTAG);
 	if (error) {
 		if (error == ENOSPC) {
-			ztest_record_enospc("dmu_objset_create");
+			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
 	}
 
 	error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
-	    NULL, FALSE);
+	    NULL, B_FALSE);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
-			ztest_record_enospc("dmu_take_snapshot");
+			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
 	}
 
 	error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
-	    NULL, FALSE);
+	    NULL, B_FALSE);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
-			ztest_record_enospc("dmu_take_snapshot");
+			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
@@ -1792,7 +3004,7 @@
 	dmu_objset_rele(clone, FTAG);
 	if (error) {
 		if (error == ENOSPC) {
-			ztest_record_enospc("dmu_objset_create");
+			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
@@ -1808,246 +3020,49 @@
 	dsl_dataset_disown(ds, FTAG);
 
 out:
-	ztest_dsl_dataset_cleanup(osname, curval);
-
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) rw_unlock(&zs->zs_name_lock);
 }
 
 /*
  * Verify that dmu_object_{alloc,free} work as expected.
  */
 void
-ztest_dmu_object_alloc_free(ztest_args_t *za)
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	dmu_buf_t *db;
-	dmu_tx_t *tx;
-	uint64_t batchobj, object, batchsize, endoff, temp;
-	int b, c, error, bonuslen;
-	dmu_object_info_t *doi = &za->za_doi;
-	char osname[MAXNAMELEN];
-
-	dmu_objset_name(os, osname);
-
-	endoff = -8ULL;
-	batchsize = 2;
-
-	/*
-	 * Create a batch object if necessary, and record it in the directory.
-	 */
-	VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
-	if (batchobj == 0) {
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
-		    sizeof (uint64_t));
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create a batch object");
-			dmu_tx_abort(tx);
-			return;
-		}
-		batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_NONE, 0, tx);
-		ztest_set_random_blocksize(os, batchobj, tx);
-		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
-		    sizeof (uint64_t), &batchobj, tx);
-		dmu_tx_commit(tx);
-	}
+	ztest_od_t od[4];
+	int batchsize = sizeof (od) / sizeof (od[0]);
+
+	for (int b = 0; b < batchsize; b++)
+		ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
 
 	/*
-	 * Destroy the previous batch of objects.
-	 */
-	for (b = 0; b < batchsize; b++) {
-		VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
-		    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-		if (object == 0)
-			continue;
-		/*
-		 * Read and validate contents.
-		 * We expect the nth byte of the bonus buffer to be n.
-		 */
-		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
-		za->za_dbuf = db;
-
-		dmu_object_info_from_db(db, doi);
-		ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
-		ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
-		ASSERT3S(doi->doi_physical_blks, >=, 0);
-
-		bonuslen = doi->doi_bonus_size;
-
-		for (c = 0; c < bonuslen; c++) {
-			if (((uint8_t *)db->db_data)[c] !=
-			    (uint8_t)(c + bonuslen)) {
-				fatal(0,
-				    "bad bonus: %s, obj %llu, off %d: %u != %u",
-				    osname, object, c,
-				    ((uint8_t *)db->db_data)[c],
-				    (uint8_t)(c + bonuslen));
-			}
-		}
-
-		dmu_buf_rele(db, FTAG);
-		za->za_dbuf = NULL;
-
-		/*
-		 * We expect the word at endoff to be our object number.
-		 */
-		VERIFY(0 == dmu_read(os, object, endoff,
-		    sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
-
-		if (temp != object) {
-			fatal(0, "bad data in %s, got %llu, expected %llu",
-			    osname, temp, object);
-		}
-
-		/*
-		 * Destroy old object and clear batch entry.
-		 */
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, batchobj,
-		    b * sizeof (uint64_t), sizeof (uint64_t));
-		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("free object");
-			dmu_tx_abort(tx);
-			return;
-		}
-		error = dmu_object_free(os, object, tx);
-		if (error) {
-			fatal(0, "dmu_object_free('%s', %llu) = %d",
-			    osname, object, error);
-		}
-		object = 0;
-
-		dmu_object_set_checksum(os, batchobj,
-		    ztest_random_checksum(), tx);
-		dmu_object_set_compress(os, batchobj,
-		    ztest_random_compress(), tx);
-
-		dmu_write(os, batchobj, b * sizeof (uint64_t),
-		    sizeof (uint64_t), &object, tx);
-
-		dmu_tx_commit(tx);
-	}
-
-	/*
-	 * Before creating the new batch of objects, generate a bunch of churn.
+	 * Destroy the previous batch of objects, create a new batch,
+	 * and do some I/O on the new objects.
 	 */
-	for (b = ztest_random(100); b > 0; b--) {
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("churn objects");
-			dmu_tx_abort(tx);
-			return;
-		}
-		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_NONE, 0, tx);
-		ztest_set_random_blocksize(os, object, tx);
-		error = dmu_object_free(os, object, tx);
-		if (error) {
-			fatal(0, "dmu_object_free('%s', %llu) = %d",
-			    osname, object, error);
-		}
-		dmu_tx_commit(tx);
-	}
-
-	/*
-	 * Create a new batch of objects with randomly chosen
-	 * blocksizes and record them in the batch directory.
-	 */
-	for (b = 0; b < batchsize; b++) {
-		uint32_t va_blksize;
-		u_longlong_t va_nblocks;
-
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
-		    sizeof (uint64_t));
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
-		    sizeof (uint64_t));
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create batchobj");
-			dmu_tx_abort(tx);
-			return;
-		}
-		bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
-
-		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_PLAIN_OTHER, bonuslen, tx);
-
-		ztest_set_random_blocksize(os, object, tx);
-
-		dmu_object_set_checksum(os, object,
-		    ztest_random_checksum(), tx);
-		dmu_object_set_compress(os, object,
-		    ztest_random_compress(), tx);
-
-		dmu_write(os, batchobj, b * sizeof (uint64_t),
-		    sizeof (uint64_t), &object, tx);
-
-		/*
-		 * Write to both the bonus buffer and the regular data.
-		 */
-		VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
-		za->za_dbuf = db;
-		ASSERT3U(bonuslen, <=, db->db_size);
-
-		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
-		ASSERT3S(va_nblocks, >=, 0);
-
-		dmu_buf_will_dirty(db, tx);
-
-		/*
-		 * See comments above regarding the contents of
-		 * the bonus buffer and the word at endoff.
-		 */
-		for (c = 0; c < bonuslen; c++)
-			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
-
-		dmu_buf_rele(db, FTAG);
-		za->za_dbuf = NULL;
-
-		/*
-		 * Write to a large offset to increase indirection.
-		 */
-		dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
-
-		dmu_tx_commit(tx);
-	}
+	if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+		return;
+
+	while (ztest_random(4 * batchsize) != 0)
+		ztest_io(zd, od[ztest_random(batchsize)].od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Verify that dmu_{read,write} work as expected.
  */
-typedef struct bufwad {
-	uint64_t	bw_index;
-	uint64_t	bw_txg;
-	uint64_t	bw_data;
-} bufwad_t;
-
-typedef struct dmu_read_write_dir {
-	uint64_t	dd_packobj;
-	uint64_t	dd_bigobj;
-	uint64_t	dd_chunk;
-} dmu_read_write_dir_t;
-
 void
-ztest_dmu_read_write(ztest_args_t *za)
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	dmu_read_write_dir_t dd;
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[2];
 	dmu_tx_t *tx;
 	int i, freeit, error;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
-	uint64_t packoff, packsize, bigoff, bigsize;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 40;
@@ -2080,34 +3095,16 @@
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
-	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (dd), &dd, DMU_READ_PREFETCH));
-	if (dd.dd_chunk == 0) {
-		ASSERT(dd.dd_packobj == 0);
-		ASSERT(dd.dd_bigobj == 0);
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create r/w directory");
-			dmu_tx_abort(tx);
-			return;
-		}
-
-		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_NONE, 0, tx);
-		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_NONE, 0, tx);
-		dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
-
-		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
-		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
-
-		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
-		    tx);
-		dmu_tx_commit(tx);
-	}
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
+	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	chunksize = od[0].od_gen;
+	ASSERT(chunksize == od[1].od_gen);
 
 	/*
 	 * Prefetch a random chunk of the big object.
@@ -2117,7 +3114,7 @@
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
-	dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
+	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
@@ -2128,8 +3125,8 @@
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
-	bigoff = n * dd.dd_chunk;
-	bigsize = s * dd.dd_chunk;
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
 
 	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
@@ -2143,10 +3140,10 @@
 	/*
 	 * Read the current contents of our objects.
 	 */
-	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
+	error = dmu_read(os, packobj, packoff, packsize, packbuf,
 	    DMU_READ_PREFETCH);
 	ASSERT3U(error, ==, 0);
-	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
+	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
 	    DMU_READ_PREFETCH);
 	ASSERT3U(error, ==, 0);
 
@@ -2155,24 +3152,25 @@
 	 */
 	tx = dmu_tx_create(os);
 
-	dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+	dmu_tx_hold_write(tx, packobj, packoff, packsize);
 
 	if (freeit)
-		dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
+		dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
 	else
-		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
-	error = dmu_tx_assign(tx, TXG_WAIT);
-
-	if (error) {
-		ztest_record_enospc("dmu r/w range");
-		dmu_tx_abort(tx);
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0) {
 		umem_free(packbuf, packsize);
 		umem_free(bigbuf, bigsize);
 		return;
 	}
 
-	txg = dmu_tx_get_txg(tx);
+	dmu_object_set_checksum(os, bigobj,
+	    (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+
+	dmu_object_set_compress(os, bigobj,
+	    (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
@@ -2184,9 +3182,9 @@
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
-		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
 		/* LINTED */
-		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2220,27 +3218,26 @@
 	 * We've verified all the old bufwads, and made new ones.
 	 * Now write them out.
 	 */
-	dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 
 	if (freeit) {
-		if (zopt_verbose >= 6) {
+		if (zopt_verbose >= 7) {
 			(void) printf("freeing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
-		VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
-		    bigsize, tx));
+		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
 	} else {
-		if (zopt_verbose >= 6) {
+		if (zopt_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
-		dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
+		dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
 	}
 
 	dmu_tx_commit(tx);
@@ -2252,9 +3249,9 @@
 		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
-		VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+		VERIFY(0 == dmu_read(os, packobj, packoff,
 		    packsize, packcheck, DMU_READ_PREFETCH));
-		VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+		VERIFY(0 == dmu_read(os, bigobj, bigoff,
 		    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2270,7 +3267,7 @@
 
 void
 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
-    uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
+    uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
 {
 	uint64_t i;
 	bufwad_t *pack;
@@ -2287,9 +3284,9 @@
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
-		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
 		/* LINTED */
-		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2318,22 +3315,24 @@
 }
 
 void
-ztest_dmu_read_write_zcopy(ztest_args_t *za)
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	dmu_read_write_dir_t dd;
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[2];
 	dmu_tx_t *tx;
 	uint64_t i;
 	int error;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf;
-	uint64_t packoff, packsize, bigoff, bigsize;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t blocksize = ztest_random_blocksize();
+	uint64_t chunksize = blocksize;
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 9;
 	dmu_buf_t *bonus_db;
 	arc_buf_t **bigbuf_arcbufs;
-	dmu_object_info_t *doi = &za->za_doi;
+	dmu_object_info_t doi;
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
@@ -2354,42 +3353,22 @@
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
-	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (dd), &dd, DMU_READ_PREFETCH));
-	if (dd.dd_chunk == 0) {
-		ASSERT(dd.dd_packobj == 0);
-		ASSERT(dd.dd_bigobj == 0);
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create r/w directory");
-			dmu_tx_abort(tx);
-			return;
-		}
-
-		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_NONE, 0, tx);
-		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-		    DMU_OT_NONE, 0, tx);
-		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
-		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
-
-		VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
-		ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
-		ASSERT(ISP2(doi->doi_data_block_size));
-		dd.dd_chunk = doi->doi_data_block_size;
-
-		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
-		    tx);
-		dmu_tx_commit(tx);
-	} else {
-		VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
-		VERIFY(ISP2(doi->doi_data_block_size));
-		VERIFY(dd.dd_chunk == doi->doi_data_block_size);
-		VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
-	}
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	blocksize = od[0].od_blocksize;
+	chunksize = blocksize;
+	ASSERT(chunksize == od[1].od_gen);
+
+	VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+	VERIFY(ISP2(doi.doi_data_block_size));
+	VERIFY(chunksize == doi.doi_data_block_size);
+	VERIFY(chunksize >= 2 * sizeof (bufwad_t));
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
@@ -2400,13 +3379,13 @@
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
-	bigoff = n * dd.dd_chunk;
-	bigsize = s * dd.dd_chunk;
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
 
 	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
 
-	VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
+	VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
 
 	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
 
@@ -2432,15 +3411,12 @@
 		for (j = 0; j < s; j++) {
 			if (i != 5) {
 				bigbuf_arcbufs[j] =
-				    dmu_request_arcbuf(bonus_db,
-				    dd.dd_chunk);
+				    dmu_request_arcbuf(bonus_db, chunksize);
 			} else {
 				bigbuf_arcbufs[2 * j] =
-				    dmu_request_arcbuf(bonus_db,
-				    dd.dd_chunk / 2);
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
 				bigbuf_arcbufs[2 * j + 1] =
-				    dmu_request_arcbuf(bonus_db,
-				    dd.dd_chunk / 2);
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
 			}
 		}
 
@@ -2449,20 +3425,11 @@
 		 */
 		tx = dmu_tx_create(os);
 
-		dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
-		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
-		if (ztest_random(100) == 0) {
-			error = -1;
-		} else {
-			error = dmu_tx_assign(tx, TXG_WAIT);
-		}
-
-		if (error) {
-			if (error != -1) {
-				ztest_record_enospc("dmu r/w range");
-			}
-			dmu_tx_abort(tx);
+		dmu_tx_hold_write(tx, packobj, packoff, packsize);
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0) {
 			umem_free(packbuf, packsize);
 			umem_free(bigbuf, bigsize);
 			for (j = 0; j < s; j++) {
@@ -2480,53 +3447,51 @@
 			return;
 		}
 
-		txg = dmu_tx_get_txg(tx);
-
 		/*
 		 * 50% of the time don't read objects in the 1st iteration to
 		 * test dmu_assign_arcbuf() for the case when there're no
 		 * existing dbufs for the specified offsets.
 		 */
 		if (i != 0 || ztest_random(2) != 0) {
-			error = dmu_read(os, dd.dd_packobj, packoff,
+			error = dmu_read(os, packobj, packoff,
 			    packsize, packbuf, DMU_READ_PREFETCH);
 			ASSERT3U(error, ==, 0);
-			error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
+			error = dmu_read(os, bigobj, bigoff, bigsize,
 			    bigbuf, DMU_READ_PREFETCH);
 			ASSERT3U(error, ==, 0);
 		}
 		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
-		    n, dd, txg);
+		    n, chunksize, txg);
 
 		/*
 		 * We've verified all the old bufwads, and made new ones.
 		 * Now write them out.
 		 */
-		dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
-		if (zopt_verbose >= 6) {
+		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+		if (zopt_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
-		for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
+		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
 			dmu_buf_t *dbt;
 			if (i != 5) {
 				bcopy((caddr_t)bigbuf + (off - bigoff),
-				    bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
+				    bigbuf_arcbufs[j]->b_data, chunksize);
 			} else {
 				bcopy((caddr_t)bigbuf + (off - bigoff),
 				    bigbuf_arcbufs[2 * j]->b_data,
-				    dd.dd_chunk / 2);
+				    chunksize / 2);
 				bcopy((caddr_t)bigbuf + (off - bigoff) +
-				    dd.dd_chunk / 2,
+				    chunksize / 2,
 				    bigbuf_arcbufs[2 * j + 1]->b_data,
-				    dd.dd_chunk / 2);
+				    chunksize / 2);
 			}
 
 			if (i == 1) {
-				VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
+				VERIFY(dmu_buf_hold(os, bigobj, off,
 				    FTAG, &dbt) == 0);
 			}
 			if (i != 5) {
@@ -2536,7 +3501,7 @@
 				dmu_assign_arcbuf(bonus_db, off,
 				    bigbuf_arcbufs[2 * j], tx);
 				dmu_assign_arcbuf(bonus_db,
-				    off + dd.dd_chunk / 2,
+				    off + chunksize / 2,
 				    bigbuf_arcbufs[2 * j + 1], tx);
 			}
 			if (i == 1) {
@@ -2552,9 +3517,9 @@
 			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
-			VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+			VERIFY(0 == dmu_read(os, packobj, packoff,
 			    packsize, packcheck, DMU_READ_PREFETCH));
-			VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+			VERIFY(0 == dmu_read(os, bigobj, bigoff,
 			    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2576,256 +3541,60 @@
 	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
 }
 
+/* ARGSUSED */
 void
-ztest_dmu_check_future_leak(ztest_args_t *za)
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	dmu_buf_t *db;
-	ztest_block_tag_t *bt;
-	dmu_object_info_t *doi = &za->za_doi;
+	ztest_od_t od[1];
+	uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 
 	/*
-	 * Make sure that, if there is a write record in the bonus buffer
-	 * of the ZTEST_DIROBJ, that the txg for this record is <= the
-	 * last synced txg of the pool.
+	 * Have multiple threads write to large offsets in an object
+	 * to verify that parallel writes to an object -- even to the
+	 * same blocks within the object -- doesn't cause any trouble.
 	 */
-	VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
-	za->za_dbuf = db;
-	VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
-	ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
-	ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
-	ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
-	bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
-	if (bt->bt_objset != 0) {
-		ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
-		ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
-		ASSERT3U(bt->bt_offset, ==, -1ULL);
-		ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
-	}
-	dmu_buf_rele(db, FTAG);
-	za->za_dbuf = NULL;
+	ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	while (ztest_random(10) != 0)
+		ztest_io(zd, od[0].od_object, offset);
 }
 
 void
-ztest_dmu_write_parallel(ztest_args_t *za)
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	ztest_block_tag_t *rbt = &za->za_rbt;
-	ztest_block_tag_t *wbt = &za->za_wbt;
-	const size_t btsize = sizeof (ztest_block_tag_t);
-	dmu_buf_t *db;
-	int b, error;
-	int bs = ZTEST_DIROBJ_BLOCKSIZE;
-	int do_free = 0;
-	uint64_t off, txg, txg_how;
-	mutex_t *lp;
-	char osname[MAXNAMELEN];
-	char iobuf[SPA_MAXBLOCKSIZE];
-	blkptr_t blk = { 0 };
-	uint64_t blkoff;
-	zbookmark_t zb;
-	dmu_tx_t *tx = dmu_tx_create(os);
-	dmu_buf_t *bonus_db;
-	arc_buf_t *abuf = NULL;
-
-	dmu_objset_name(os, osname);
-
-	/*
-	 * Have multiple threads write to large offsets in ZTEST_DIROBJ
-	 * to verify that having multiple threads writing to the same object
-	 * in parallel doesn't cause any trouble.
-	 */
-	if (ztest_random(4) == 0) {
-		/*
-		 * Do the bonus buffer instead of a regular block.
-		 * We need a lock to serialize resize vs. others,
-		 * so we hash on the objset ID.
-		 */
-		b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
-		off = -1ULL;
-		dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
-	} else {
-		b = ztest_random(ZTEST_SYNC_LOCKS);
-		off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
-		if (ztest_random(4) == 0) {
-			do_free = 1;
-			dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
-		} else {
-			dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
-		}
-	}
-
-	if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
-	    ztest_random(8) == 0) {
-		VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
-		abuf = dmu_request_arcbuf(bonus_db, bs);
-	}
-
-	txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
-	error = dmu_tx_assign(tx, txg_how);
-	if (error) {
-		if (error == ERESTART) {
-			ASSERT(txg_how == TXG_NOWAIT);
-			dmu_tx_wait(tx);
-		} else {
-			ztest_record_enospc("dmu write parallel");
-		}
-		dmu_tx_abort(tx);
-		if (abuf != NULL) {
-			dmu_return_arcbuf(abuf);
-			dmu_buf_rele(bonus_db, FTAG);
-		}
+	ztest_od_t od[1];
+	uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+	uint64_t count = ztest_random(20) + 1;
+	uint64_t blocksize = ztest_random_blocksize();
+	void *data;
+
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
 		return;
+
+	if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
+		return;
+
+	ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
+
+	data = umem_zalloc(blocksize, UMEM_NOFAIL);
+
+	while (ztest_random(count) != 0) {
+		uint64_t randoff = offset + (ztest_random(count) * blocksize);
+		if (ztest_write(zd, od[0].od_object, randoff, blocksize,
+		    data) != 0)
+			break;
+		while (ztest_random(4) != 0)
+			ztest_io(zd, od[0].od_object, randoff);
 	}
-	txg = dmu_tx_get_txg(tx);
-
-	lp = &ztest_shared->zs_sync_lock[b];
-	(void) mutex_lock(lp);
-
-	wbt->bt_objset = dmu_objset_id(os);
-	wbt->bt_object = ZTEST_DIROBJ;
-	wbt->bt_offset = off;
-	wbt->bt_txg = txg;
-	wbt->bt_thread = za->za_instance;
-	wbt->bt_seq = ztest_shared->zs_seq[b]++;	/* protected by lp */
-
-	/*
-	 * Occasionally, write an all-zero block to test the behavior
-	 * of blocks that compress into holes.
-	 */
-	if (off != -1ULL && ztest_random(8) == 0)
-		bzero(wbt, btsize);
-
-	if (off == -1ULL) {
-		dmu_object_info_t *doi = &za->za_doi;
-		char *dboff;
-
-		VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
-		za->za_dbuf = db;
-		dmu_object_info_from_db(db, doi);
-		ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
-		ASSERT3U(doi->doi_bonus_size, >=, btsize);
-		ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
-		dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
-		bcopy(dboff, rbt, btsize);
-		if (rbt->bt_objset != 0) {
-			ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
-			ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
-			ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
-			ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
-		}
-		if (ztest_random(10) == 0) {
-			int newsize = (ztest_random(db->db_size /
-			    btsize) + 1) * btsize;
-
-			ASSERT3U(newsize, >=, btsize);
-			ASSERT3U(newsize, <=, db->db_size);
-			VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
-			dboff = (char *)db->db_data + newsize - btsize;
-		}
-		dmu_buf_will_dirty(db, tx);
-		bcopy(wbt, dboff, btsize);
-		dmu_buf_rele(db, FTAG);
-		za->za_dbuf = NULL;
-	} else if (do_free) {
-		VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
-	} else if (abuf == NULL) {
-		dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
-	} else {
-		bcopy(wbt, abuf->b_data, btsize);
-		dmu_assign_arcbuf(bonus_db, off, abuf, tx);
-		dmu_buf_rele(bonus_db, FTAG);
-	}
-
-	(void) mutex_unlock(lp);
-
-	if (ztest_random(1000) == 0)
-		(void) poll(NULL, 0, 1); /* open dn_notxholds window */
-
-	dmu_tx_commit(tx);
-
-	if (ztest_random(10000) == 0)
-		txg_wait_synced(dmu_objset_pool(os), txg);
-
-	if (off == -1ULL || do_free)
-		return;
-
-	if (ztest_random(2) != 0)
-		return;
-
-	/*
-	 * dmu_sync() the block we just wrote.
-	 */
-	(void) mutex_lock(lp);
-
-	blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
-	error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
-	za->za_dbuf = db;
-	if (error) {
-		(void) mutex_unlock(lp);
-		return;
-	}
-	blkoff = off - blkoff;
-	error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
-	dmu_buf_rele(db, FTAG);
-	za->za_dbuf = NULL;
-
-	if (error) {
-		(void) mutex_unlock(lp);
-		return;
-	}
-
-	if (blk.blk_birth == 0)	{	/* concurrent free */
-		(void) mutex_unlock(lp);
-		return;
-	}
-
-	txg_suspend(dmu_objset_pool(os));
-
-	(void) mutex_unlock(lp);
-
-	ASSERT(blk.blk_fill == 1);
-	ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
-	ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
-	ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
-
-	/*
-	 * Read the block that dmu_sync() returned to make sure its contents
-	 * match what we wrote.  We do this while still txg_suspend()ed
-	 * to ensure that the block can't be reused before we read it.
-	 */
-	zb.zb_objset = dmu_objset_id(os);
-	zb.zb_object = ZTEST_DIROBJ;
-	zb.zb_level = 0;
-	zb.zb_blkid = off / bs;
-	error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
-	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
-	ASSERT3U(error, ==, 0);
-
-	txg_resume(dmu_objset_pool(os));
-
-	bcopy(&iobuf[blkoff], rbt, btsize);
-
-	if (rbt->bt_objset == 0)		/* concurrent free */
-		return;
-
-	if (wbt->bt_objset == 0)		/* all-zero overwrite */
-		return;
-
-	ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
-	ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
-	ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
-
-	/*
-	 * The semantic of dmu_sync() is that we always push the most recent
-	 * version of the data, so in the face of concurrent updates we may
-	 * see a newer version of the block.  That's OK.
-	 */
-	ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
-	if (rbt->bt_thread == wbt->bt_thread)
-		ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
-	else
-		ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
+
+	umem_free(data, blocksize);
 }
 
 /*
@@ -2836,9 +3605,10 @@
 #define	ZTEST_ZAP_MAX_PROPS	1000
 
 void
-ztest_zap(ztest_args_t *za)
+ztest_zap(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[1];
 	uint64_t object;
 	uint64_t txg, last_txg;
 	uint64_t value[ZTEST_ZAP_MAX_INTS];
@@ -2847,64 +3617,45 @@
 	dmu_tx_t *tx;
 	char propname[100], txgname[100];
 	int error;
-	char osname[MAXNAMELEN];
 	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
 
-	dmu_objset_name(os, osname);
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+		return;
+
+	object = od[0].od_object;
 
 	/*
-	 * Create a new object if necessary, and record it in the directory.
+	 * Generate a known hash collision, and verify that
+	 * we can lookup and remove both entries.
 	 */
-	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-
-	if (object == 0) {
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
-		    sizeof (uint64_t));
-		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create zap test obj");
-			dmu_tx_abort(tx);
-			return;
-		}
-		object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
-		if (error) {
-			fatal(0, "zap_create('%s', %llu) = %d",
-			    osname, object, error);
-		}
-		ASSERT(object != 0);
-		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
-		    sizeof (uint64_t), &object, tx);
-		/*
-		 * Generate a known hash collision, and verify that
-		 * we can lookup and remove both entries.
-		 */
-		for (i = 0; i < 2; i++) {
-			value[i] = i;
-			error = zap_add(os, object, hc[i], sizeof (uint64_t),
-			    1, &value[i], tx);
-			ASSERT3U(error, ==, 0);
-		}
-		for (i = 0; i < 2; i++) {
-			error = zap_add(os, object, hc[i], sizeof (uint64_t),
-			    1, &value[i], tx);
-			ASSERT3U(error, ==, EEXIST);
-			error = zap_length(os, object, hc[i],
-			    &zl_intsize, &zl_ints);
-			ASSERT3U(error, ==, 0);
-			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
-			ASSERT3U(zl_ints, ==, 1);
-		}
-		for (i = 0; i < 2; i++) {
-			error = zap_remove(os, object, hc[i], tx);
-			ASSERT3U(error, ==, 0);
-		}
-
-		dmu_tx_commit(tx);
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		return;
+	for (i = 0; i < 2; i++) {
+		value[i] = i;
+		VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+		    1, &value[i], tx));
 	}
-
+	for (i = 0; i < 2; i++) {
+		VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+		    sizeof (uint64_t), 1, &value[i], tx));
+		VERIFY3U(0, ==,
+		    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, 1);
+	}
+	for (i = 0; i < 2; i++) {
+		VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
+	}
+	dmu_tx_commit(tx);
+
+	/*
+	 * Generate a buch of random entries.
+	 */
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
@@ -2948,14 +3699,10 @@
 	 * should be txg + object + n.
 	 */
 	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, object, TRUE, NULL);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		ztest_record_enospc("create zap entry");
-		dmu_tx_abort(tx);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
 		return;
-	}
-	txg = dmu_tx_get_txg(tx);
 
 	if (last_txg > txg)
 		fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
@@ -2963,16 +3710,10 @@
 	for (i = 0; i < ints; i++)
 		value[i] = txg + object + i;
 
-	error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
-	if (error)
-		fatal(0, "zap_update('%s', %llu, '%s') = %d",
-		    osname, object, txgname, error);
-
-	error = zap_update(os, object, propname, sizeof (uint64_t),
-	    ints, value, tx);
-	if (error)
-		fatal(0, "zap_update('%s', %llu, '%s') = %d",
-		    osname, object, propname, error);
+	VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+	    1, &txg, tx));
+	VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+	    ints, value, tx));
 
 	dmu_tx_commit(tx);
 
@@ -2991,47 +3732,12 @@
 	ASSERT3U(error, ==, 0);
 
 	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, object, TRUE, NULL);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		ztest_record_enospc("remove zap entry");
-		dmu_tx_abort(tx);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
 		return;
-	}
-	error = zap_remove(os, object, txgname, tx);
-	if (error)
-		fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-		    osname, object, txgname, error);
-
-	error = zap_remove(os, object, propname, tx);
-	if (error)
-		fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-		    osname, object, propname, error);
-
-	dmu_tx_commit(tx);
-
-	/*
-	 * Once in a while, destroy the object.
-	 */
-	if (ztest_random(1000) != 0)
-		return;
-
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
-	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		ztest_record_enospc("destroy zap object");
-		dmu_tx_abort(tx);
-		return;
-	}
-	error = zap_destroy(os, object, tx);
-	if (error)
-		fatal(0, "zap_destroy('%s', %llu) = %d",
-		    osname, object, error);
-	object = 0;
-	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
-	    &object, tx);
+	VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+	VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
 	dmu_tx_commit(tx);
 }
 
@@ -3039,108 +3745,65 @@
  * Testcase to test the upgrading of a microzap to fatzap.
  */
 void
-ztest_fzap(ztest_args_t *za)
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	uint64_t object;
-	uint64_t value;
-	dmu_tx_t *tx;
-	int i, error;
-	char osname[MAXNAMELEN];
-	char *name = "aaa";
-	char entname[MAXNAMELEN];
-
-	dmu_objset_name(os, osname);
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[1];
+	uint64_t object, txg;
+
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+		return;
+
+	object = od[0].od_object;
 
 	/*
-	 * Create a new object if necessary, and record it in the directory.
+	 * Add entries to this ZAP and make sure it spills over
+	 * and gets upgraded to a fatzap. Also, since we are adding
+	 * 2050 entries we should see ptrtbl growth and leaf-block split.
 	 */
-	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-
-	if (object == 0) {
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
-		    sizeof (uint64_t));
-		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("create zap test obj");
-			dmu_tx_abort(tx);
-			return;
-		}
-		object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
-		if (error) {
-			fatal(0, "zap_create('%s', %llu) = %d",
-			    osname, object, error);
-		}
-		ASSERT(object != 0);
-		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
-		    sizeof (uint64_t), &object, tx);
-		dmu_tx_commit(tx);
-	}
-
-	/*
-	 * Add entries to this ZAP amd make sure it spills over
-	 * and gets upgraded to a fatzap. Also, since we are adding
-	 * 2050 entries we should see ptrtbl growth and leaf-block
-	 * split.
-	 */
-	for (i = 0; i < 2050; i++) {
-		(void) snprintf(entname, sizeof (entname), "%s-%d", name, i);
-		value = i;
+	for (int i = 0; i < 2050; i++) {
+		char name[MAXNAMELEN];
+		uint64_t value = i;
+		dmu_tx_t *tx;
+		int error;
+
+		(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+		    id, value);
 
 		tx = dmu_tx_create(os);
-		dmu_tx_hold_zap(tx, object, TRUE, entname);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-
-		if (error) {
-			ztest_record_enospc("create zap entry");
-			dmu_tx_abort(tx);
+		dmu_tx_hold_zap(tx, object, B_TRUE, name);
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0)
 			return;
-		}
-		error = zap_add(os, object, entname, sizeof (uint64_t),
-		    1, &value, tx);
-
+		error = zap_add(os, object, name, sizeof (uint64_t), 1,
+		    &value, tx);
 		ASSERT(error == 0 || error == EEXIST);
 		dmu_tx_commit(tx);
 	}
-
-	/*
-	 * Once in a while, destroy the object.
-	 */
-	if (ztest_random(1000) != 0)
-		return;
-
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
-	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		ztest_record_enospc("destroy zap object");
-		dmu_tx_abort(tx);
-		return;
-	}
-	error = zap_destroy(os, object, tx);
-	if (error)
-		fatal(0, "zap_destroy('%s', %llu) = %d",
-		    osname, object, error);
-	object = 0;
-	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
-	    &object, tx);
-	dmu_tx_commit(tx);
 }
 
+/* ARGSUSED */
 void
-ztest_zap_parallel(ztest_args_t *za)
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[1];
 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
 	dmu_tx_t *tx;
 	int i, namelen, error;
+	int micro = ztest_random(2);
 	char name[20], string_value[20];
 	void *data;
 
+	ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	object = od[0].od_object;
+
 	/*
 	 * Generate a random name of the form 'xxx.....' where each
 	 * x is a random printable character and the dots are dots.
@@ -3155,12 +3818,7 @@
 		name[i] = '.';
 	name[i] = '\0';
 
-	if (ztest_random(2) == 0)
-		object = ZTEST_MICROZAP_OBJ;
-	else
-		object = ZTEST_FATZAP_OBJ;
-
-	if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+	if ((namelen & 1) || micro) {
 		wsize = sizeof (txg);
 		wc = 1;
 		data = &txg;
@@ -3181,14 +3839,10 @@
 
 	if (i >= 2) {
 		tx = dmu_tx_create(os);
-		dmu_tx_hold_zap(tx, object, TRUE, NULL);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			ztest_record_enospc("zap parallel");
-			dmu_tx_abort(tx);
+		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0)
 			return;
-		}
-		txg = dmu_tx_get_txg(tx);
 		bcopy(name, string_value, namelen);
 	} else {
 		tx = NULL;
@@ -3322,20 +3976,26 @@
  * Commit callback test.
  */
 void
-ztest_dmu_commit_callbacks(ztest_args_t *za)
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[1];
 	dmu_tx_t *tx;
 	ztest_cb_data_t *cb_data[3], *tmp_cb;
 	uint64_t old_txg, txg;
 	int i, error;
 
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
 	tx = dmu_tx_create(os);
 
 	cb_data[0] = ztest_create_cb_data(os, 0);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
 
-	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
+	dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
 
 	/* Every once in a while, abort the transaction on purpose */
 	if (ztest_random(100) == 0)
@@ -3378,14 +4038,14 @@
 	/*
 	 * Read existing data to make sure there isn't a future leak.
 	 */
-	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
+	VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
 	    &old_txg, DMU_READ_PREFETCH));
 
 	if (old_txg > txg)
 		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
 		    old_txg, txg);
 
-	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &txg, tx);
+	dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
 
 	(void) mutex_lock(&zcl.zcl_callbacks_lock);
 
@@ -3439,69 +4099,60 @@
 	dmu_tx_commit(tx);
 }
 
+/* ARGSUSED */
 void
-ztest_dsl_prop_get_set(ztest_args_t *za)
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
-	objset_t *os = za->za_os;
-	int i, inherit;
-	uint64_t value;
-	const char *prop, *valname;
-	char setpoint[MAXPATHLEN];
-	char osname[MAXNAMELEN];
-	int error;
-
-	(void) rw_rdlock(&ztest_shared->zs_name_lock);
-
-	dmu_objset_name(os, osname);
-
-	for (i = 0; i < 2; i++) {
-		if (i == 0) {
-			prop = "checksum";
-			value = ztest_random_checksum();
-			inherit = (value == ZIO_CHECKSUM_INHERIT);
-		} else {
-			prop = "compression";
-			value = ztest_random_compress();
-			inherit = (value == ZIO_COMPRESS_INHERIT);
-		}
-
-		error = dsl_prop_set(osname, prop, sizeof (value),
-		    !inherit, &value);
-
-		if (error == ENOSPC) {
-			ztest_record_enospc("dsl_prop_set");
-			break;
-		}
-
-		ASSERT3U(error, ==, 0);
-
-		VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
-		    1, &value, setpoint), ==, 0);
-
-		if (i == 0)
-			valname = zio_checksum_table[value].ci_name;
-		else
-			valname = zio_compress_table[value].ci_name;
-
-		if (zopt_verbose >= 6) {
-			(void) printf("%s %s = %s for '%s'\n",
-			    osname, prop, valname, setpoint);
-		}
-	}
-
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+	zfs_prop_t proplist[] = {
+		ZFS_PROP_CHECKSUM,
+		ZFS_PROP_COMPRESSION,
+		ZFS_PROP_COPIES,
+		ZFS_PROP_DEDUP
+	};
+	ztest_shared_t *zs = ztest_shared;
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+	for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+
+	(void) rw_unlock(&zs->zs_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	nvlist_t *props = NULL;
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+#if 0
+	(void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+#endif
+
+	VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+
+	if (zopt_verbose >= 6)
+		dump_nvlist(props, 4);
+
+	nvlist_free(props);
+
+	(void) rw_unlock(&zs->zs_name_lock);
 }
 
 /*
  * Test snapshot hold/release and deferred destroy.
  */
 void
-ztest_dmu_snapshot_hold(ztest_args_t *za)
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
 {
 	int error;
-	objset_t *os = za->za_os;
+	objset_t *os = zd->zd_os;
 	objset_t *origin;
-	uint64_t curval = za->za_instance;
 	char snapname[100];
 	char fullname[100];
 	char clonename[100];
@@ -3512,10 +4163,10 @@
 
 	dmu_objset_name(os, osname);
 
-	(void) snprintf(snapname, 100, "sh1_%llu", curval);
+	(void) snprintf(snapname, 100, "sh1_%llu", id);
 	(void) snprintf(fullname, 100, "%s@%s", osname, snapname);
-	(void) snprintf(clonename, 100, "%s/ch1_%llu", osname, curval);
-	(void) snprintf(tag, 100, "%tag_%llu", curval);
+	(void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
+	(void) snprintf(tag, 100, "%tag_%llu", id);
 
 	/*
 	 * Clean up from any previous run.
@@ -3608,9 +4259,12 @@
 /*
  * Inject random faults into the on-disk data.
  */
+/* ARGSUSED */
 void
-ztest_fault_inject(ztest_args_t *za)
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 {
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
 	int fd;
 	uint64_t offset;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
@@ -3619,7 +4273,6 @@
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
-	spa_t *spa = za->za_spa;
 	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	int maxfaults = zopt_maxfaults;
@@ -3636,9 +4289,9 @@
 
 	if (ztest_random(2) == 0) {
 		/*
-		 * Inject errors on a normal data device.
+		 * Inject errors on a normal data device or slog device.
 		 */
-		top = ztest_random(spa->spa_root_vdev->vdev_children);
+		top = ztest_random_vdev_top(spa, B_TRUE);
 		leaf = ztest_random(leaves);
 
 		/*
@@ -3751,7 +4404,7 @@
 		if (offset >= fsize)
 			continue;
 
-		if (zopt_verbose >= 6)
+		if (zopt_verbose >= 7)
 			(void) printf("injecting bad word into %s,"
 			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
 
@@ -3764,31 +4417,129 @@
 }
 
 /*
- * Scrub the pool.
+ * Verify that DDT repair works as expected.
  */
 void
-ztest_scrub(ztest_args_t *za)
+ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 {
-	spa_t *spa = za->za_spa;
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[1];
+	uint64_t object, blocksize, txg, pattern, psize;
+	enum zio_checksum checksum = spa_dedup_checksum(spa);
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	void *buf;
+	blkptr_t blk;
+	int copies = 2 * ZIO_DEDUPDITTO_MIN;
+
+	blocksize = ztest_random_blocksize();
+	blocksize = MIN(blocksize, 2048);	/* because we write so many */
+
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	/*
+	 * Take the name lock as writer to prevent anyone else from changing
+	 * the pool and dataset properies we need to maintain during this test.
+	 */
+	(void) rw_wrlock(&zs->zs_name_lock);
+
+	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+	    B_FALSE) != 0 ||
+	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+	    B_FALSE) != 0) {
+		(void) rw_unlock(&zs->zs_name_lock);
+		return;
+	}
+
+	object = od[0].od_object;
+	blocksize = od[0].od_blocksize;
+	pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+
+	ASSERT(object != 0);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		(void) rw_unlock(&zs->zs_name_lock);
+		return;
+	}
+
+	/*
+	 * Write all the copies of our block.
+	 */
+	for (int i = 0; i < copies; i++) {
+		uint64_t offset = i * blocksize;
+		VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0);
+		ASSERT(db->db_offset == offset);
+		ASSERT(db->db_size == blocksize);
+		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+		dmu_buf_will_fill(db, tx);
+		ztest_pattern_set(db->db_data, db->db_size, pattern);
+		dmu_buf_rele(db, FTAG);
+	}
+
+	dmu_tx_commit(tx);
+	txg_wait_synced(spa_get_dsl(spa), txg);
+
+	/*
+	 * Find out what block we got.
+	 */
+	VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0);
+	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+	dmu_buf_rele(db, FTAG);
+
+	/*
+	 * Damage the block.  Dedup-ditto will save us when we read it later.
+	 */
+	psize = BP_GET_PSIZE(&blk);
+	buf = zio_buf_alloc(psize);
+	ztest_pattern_set(buf, psize, ~pattern);
+
+	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
+
+	zio_buf_free(buf, psize);
+
+	(void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Scrub the pool.
+ */
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
 
 	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
-	(void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
+	(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
 	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
 }
 
 /*
  * Rename the pool to a different name and then rename it back.
  */
+/* ARGSUSED */
 void
-ztest_spa_rename(ztest_args_t *za)
+ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
 {
+	ztest_shared_t *zs = ztest_shared;
 	char *oldname, *newname;
-	int error;
 	spa_t *spa;
 
-	(void) rw_wrlock(&ztest_shared->zs_name_lock);
-
-	oldname = za->za_pool;
+	(void) rw_wrlock(&zs->zs_name_lock);
+
+	oldname = zs->zs_pool;
 	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
 	(void) strcpy(newname, oldname);
 	(void) strcat(newname, "_tmp");
@@ -3796,128 +4547,44 @@
 	/*
 	 * Do the rename
 	 */
-	error = spa_rename(oldname, newname);
-	if (error)
-		fatal(0, "spa_rename('%s', '%s') = %d", oldname,
-		    newname, error);
+	VERIFY3U(0, ==, spa_rename(oldname, newname));
 
 	/*
 	 * Try to open it under the old name, which shouldn't exist
 	 */
-	error = spa_open(oldname, &spa, FTAG);
-	if (error != ENOENT)
-		fatal(0, "spa_open('%s') = %d", oldname, error);
+	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Open it under the new name and make sure it's still the same spa_t.
 	 */
-	error = spa_open(newname, &spa, FTAG);
-	if (error != 0)
-		fatal(0, "spa_open('%s') = %d", newname, error);
-
-	ASSERT(spa == za->za_spa);
+	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
+
+	ASSERT(spa == zs->zs_spa);
 	spa_close(spa, FTAG);
 
 	/*
 	 * Rename it back to the original
 	 */
-	error = spa_rename(newname, oldname);
-	if (error)
-		fatal(0, "spa_rename('%s', '%s') = %d", newname,
-		    oldname, error);
+	VERIFY3U(0, ==, spa_rename(newname, oldname));
 
 	/*
 	 * Make sure it can still be opened
 	 */
-	error = spa_open(oldname, &spa, FTAG);
-	if (error != 0)
-		fatal(0, "spa_open('%s') = %d", oldname, error);
-
-	ASSERT(spa == za->za_spa);
+	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
+
+	ASSERT(spa == zs->zs_spa);
 	spa_close(spa, FTAG);
 
 	umem_free(newname, strlen(newname) + 1);
 
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+	(void) rw_unlock(&zs->zs_name_lock);
 }
 
-
 /*
- * Completely obliterate one disk.
+ * Verify pool integrity by running zdb.
  */
 static void
-ztest_obliterate_one_disk(uint64_t vdev)
-{
-	int fd;
-	char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
-	size_t fsize;
-
-	if (zopt_maxfaults < 2)
-		return;
-
-	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
-	(void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
-
-	fd = open(dev_name, O_RDWR);
-
-	if (fd == -1)
-		fatal(1, "can't open %s", dev_name);
-
-	/*
-	 * Determine the size.
-	 */
-	fsize = lseek(fd, 0, SEEK_END);
-
-	(void) close(fd);
-
-	/*
-	 * Rename the old device to dev_name.old (useful for debugging).
-	 */
-	VERIFY(rename(dev_name, copy_name) == 0);
-
-	/*
-	 * Create a new one.
-	 */
-	VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
-	VERIFY(ftruncate(fd, fsize) == 0);
-	(void) close(fd);
-}
-
-static void
-ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
-{
-	char dev_name[MAXPATHLEN];
-	nvlist_t *root;
-	int error;
-	uint64_t guid;
-	vdev_t *vd;
-
-	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
-
-	/*
-	 * Build the nvlist describing dev_name.
-	 */
-	root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
-
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
-		guid = 0;
-	else
-		guid = vd->vdev_guid;
-	spa_config_exit(spa, SCL_VDEV, FTAG);
-	error = spa_vdev_attach(spa, guid, root, B_TRUE);
-	if (error != 0 &&
-	    error != EBUSY &&
-	    error != ENOTSUP &&
-	    error != ENODEV &&
-	    error != EDOM)
-		fatal(0, "spa_vdev_attach(in-place) = %d", error);
-
-	nvlist_free(root);
-}
-
-static void
-ztest_verify_blocks(char *pool)
+ztest_run_zdb(char *pool)
 {
 	int status;
 	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
@@ -3988,7 +4655,6 @@
 	nvlist_t *config, *newconfig;
 	uint64_t pool_guid;
 	spa_t *spa;
-	int error;
 
 	if (zopt_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
@@ -4003,9 +4669,7 @@
 	/*
 	 * Get the pool's configuration and guid.
 	 */
-	error = spa_open(oldname, &spa, FTAG);
-	if (error)
-		fatal(0, "spa_open('%s') = %d", oldname, error);
+	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Kick off a scrub to tickle scrub/export races.
@@ -4021,9 +4685,7 @@
 	/*
 	 * Export it.
 	 */
-	error = spa_export(oldname, &config, B_FALSE, B_FALSE);
-	if (error)
-		fatal(0, "spa_export('%s') = %d", oldname, error);
+	VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
 
 	ztest_walk_pool_directory("pools after export");
 
@@ -4037,39 +4699,29 @@
 	/*
 	 * Import it under the new name.
 	 */
-	error = spa_import(newname, config, NULL);
-	if (error)
-		fatal(0, "spa_import('%s') = %d", newname, error);
+	VERIFY3U(0, ==, spa_import(newname, config, NULL));
 
 	ztest_walk_pool_directory("pools after import");
 
 	/*
 	 * Try to import it again -- should fail with EEXIST.
 	 */
-	error = spa_import(newname, config, NULL);
-	if (error != EEXIST)
-		fatal(0, "spa_import('%s') twice", newname);
+	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL));
 
 	/*
 	 * Try to import it under a different name -- should fail with EEXIST.
 	 */
-	error = spa_import(oldname, config, NULL);
-	if (error != EEXIST)
-		fatal(0, "spa_import('%s') under multiple names", newname);
+	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL));
 
 	/*
 	 * Verify that the pool is no longer visible under the old name.
 	 */
-	error = spa_open(oldname, &spa, FTAG);
-	if (error != ENOENT)
-		fatal(0, "spa_open('%s') = %d", newname, error);
+	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Verify that we can open and close the pool using the new name.
 	 */
-	error = spa_open(newname, &spa, FTAG);
-	if (error)
-		fatal(0, "spa_open('%s') = %d", newname, error);
+	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 	ASSERT(pool_guid == spa_guid(spa));
 	spa_close(spa, FTAG);
 
@@ -4079,12 +4731,12 @@
 static void
 ztest_resume(spa_t *spa)
 {
-	if (spa_suspended(spa)) {
-		spa_vdev_state_enter(spa, SCL_NONE);
-		vdev_clear(spa, NULL);
-		(void) spa_vdev_state_exit(spa, NULL, 0);
-		(void) zio_resume(spa);
-	}
+	if (spa_suspended(spa) && zopt_verbose >= 6)
+		(void) printf("resuming from suspended state\n");
+	spa_vdev_state_enter(spa, SCL_NONE);
+	vdev_clear(spa, NULL);
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	(void) zio_resume(spa);
 }
 
 static void *
@@ -4093,154 +4745,246 @@
 	spa_t *spa = arg;
 
 	while (!ztest_exiting) {
-		(void) poll(NULL, 0, 1000);
-		ztest_resume(spa);
+		if (spa_suspended(spa))
+			ztest_resume(spa);
+		(void) poll(NULL, 0, 100);
 	}
 	return (NULL);
 }
 
 static void *
+ztest_deadman_thread(void *arg)
+{
+	ztest_shared_t *zs = arg;
+	int grace = 300;
+	hrtime_t delta;
+
+	delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+
+	(void) poll(NULL, 0, (int)(1000 * delta));
+
+	fatal(0, "failed to complete within %d seconds of deadline", grace);
+
+	return (NULL);
+}
+
+static void
+ztest_execute(ztest_info_t *zi, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+	hrtime_t functime = gethrtime();
+
+	for (int i = 0; i < zi->zi_iters; i++)
+		zi->zi_func(zd, id);
+
+	functime = gethrtime() - functime;
+
+	atomic_add_64(&zi->zi_call_count, 1);
+	atomic_add_64(&zi->zi_call_time, functime);
+
+	if (zopt_verbose >= 4) {
+		Dl_info dli;
+		(void) dladdr((void *)zi->zi_func, &dli);
+		(void) printf("%6.2f sec in %s\n",
+		    (double)functime / NANOSEC, dli.dli_sname);
+	}
+}
+
+static void *
 ztest_thread(void *arg)
 {
-	ztest_args_t *za = arg;
+	uint64_t id = (uintptr_t)arg;
 	ztest_shared_t *zs = ztest_shared;
-	hrtime_t now, functime;
+	uint64_t call_next;
+	hrtime_t now;
 	ztest_info_t *zi;
-	int f, i;
-
-	while ((now = gethrtime()) < za->za_stop) {
+
+	while ((now = gethrtime()) < zs->zs_thread_stop) {
 		/*
 		 * See if it's time to force a crash.
 		 */
-		if (now > za->za_kill) {
-			zs->zs_alloc = spa_get_alloc(za->za_spa);
-			zs->zs_space = spa_get_space(za->za_spa);
-			(void) kill(getpid(), SIGKILL);
-		}
-
-		/*
-		 * Pick a random function.
-		 */
-		f = ztest_random(ZTEST_FUNCS);
-		zi = &zs->zs_info[f];
-
-		/*
-		 * Decide whether to call it, based on the requested frequency.
-		 */
-		if (zi->zi_call_target == 0 ||
-		    (double)zi->zi_call_total / zi->zi_call_target >
-		    (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
-			continue;
-
-		atomic_add_64(&zi->zi_calls, 1);
-		atomic_add_64(&zi->zi_call_total, 1);
-
-		za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
-		    ZTEST_DIRSIZE;
-		za->za_diroff_shared = (1ULL << 63);
-
-		for (i = 0; i < zi->zi_iters; i++)
-			zi->zi_func(za);
-
-		functime = gethrtime() - now;
-
-		atomic_add_64(&zi->zi_call_time, functime);
-
-		if (zopt_verbose >= 4) {
-			Dl_info dli;
-			(void) dladdr((void *)zi->zi_func, &dli);
-			(void) printf("%6.2f sec in %s\n",
-			    (double)functime / NANOSEC, dli.dli_sname);
-		}
+		if (now > zs->zs_thread_kill)
+			ztest_kill(zs);
 
 		/*
 		 * If we're getting ENOSPC with some regularity, stop.
 		 */
 		if (zs->zs_enospc_count > 10)
 			break;
+
+		/*
+		 * Pick a random function to execute.
+		 */
+		zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+		call_next = zi->zi_call_next;
+
+		if (now >= call_next &&
+		    atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+			ztest_execute(zi, id);
 	}
 
 	return (NULL);
 }
 
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+	(void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+}
+
+static void
+ztest_dataset_destroy(ztest_shared_t *zs, int d)
+{
+	char name[MAXNAMELEN];
+
+	ztest_dataset_name(name, zs->zs_pool, d);
+
+	if (zopt_verbose >= 3)
+		(void) printf("Destroying %s to free up space\n", name);
+
+	/*
+	 * Cleanup any non-standard clones and snapshots.  In general,
+	 * ztest thread t operates on dataset (t % zopt_datasets),
+	 * so there may be more than one thing to clean up.
+	 */
+	for (int t = d; t < zopt_threads; t += zopt_datasets)
+		ztest_dsl_dataset_cleanup(name, t);
+
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+	uint64_t usedobjs, dirobjs, scratch;
+
+	/*
+	 * ZTEST_DIROBJ is the object directory for the entire dataset.
+	 * Therefore, the number of objects in use should equal the
+	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+	 * If not, we have an object leak.
+	 *
+	 * Note that we can only check this in ztest_dataset_open(),
+	 * when the open-context and syncing-context values agree.
+	 * That's because zap_count() returns the open-context value,
+	 * while dmu_objset_space() returns the rootbp fill count.
+	 */
+	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+	ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+static int
+ztest_dataset_open(ztest_shared_t *zs, int d)
+{
+	ztest_ds_t *zd = &zs->zs_zd[d];
+	uint64_t committed_seq = zd->zd_seq;
+	objset_t *os;
+	zilog_t *zilog;
+	char name[MAXNAMELEN];
+	int error;
+
+	ztest_dataset_name(name, zs->zs_pool, d);
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+	error = dmu_objset_create(name, DMU_OST_OTHER, 0,
+	    ztest_objset_create_cb, NULL);
+	if (error == ENOSPC) {
+		(void) rw_unlock(&zs->zs_name_lock);
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	ASSERT(error == 0 || error == EEXIST);
+
+	VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+	(void) rw_unlock(&zs->zs_name_lock);
+
+	ztest_zd_init(zd, os);
+
+	zilog = zd->zd_zilog;
+
+	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
+		fatal(0, "missing log records: claimed %llu < committed %llu",
+		    zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+	ztest_dataset_dirobj_verify(zd);
+
+	zil_replay(os, zd, ztest_replay_vector);
+
+	ztest_dataset_dirobj_verify(zd);
+
+	if (zopt_verbose >= 6)
+		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+		    zd->zd_name,
+		    (u_longlong_t)zilog->zl_parse_blk_count,
+		    (u_longlong_t)zilog->zl_parse_lr_count,
+		    (u_longlong_t)zilog->zl_replaying_seq);
+
+	zilog = zil_open(os, ztest_get_data);
+
+	if (zilog->zl_replaying_seq != 0 &&
+	    zilog->zl_replaying_seq < committed_seq)
+		fatal(0, "missing log records: replayed %llu < committed %llu",
+		    zilog->zl_replaying_seq, committed_seq);
+
+	return (0);
+}
+
+static void
+ztest_dataset_close(ztest_shared_t *zs, int d)
+{
+	ztest_ds_t *zd = &zs->zs_zd[d];
+
+	zil_close(zd->zd_zilog);
+	dmu_objset_rele(zd->zd_os, zd);
+
+	ztest_zd_fini(zd);
+}
+
 /*
  * Kick off threads to run tests on all datasets in parallel.
  */
 static void
-ztest_run(char *pool)
+ztest_run(ztest_shared_t *zs)
 {
-	int t, d, error;
-	ztest_shared_t *zs = ztest_shared;
-	ztest_args_t *za;
+	thread_t *tid;
 	spa_t *spa;
-	char name[100];
 	thread_t resume_tid;
+	int error;
 
 	ztest_exiting = B_FALSE;
 
-	(void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
-	(void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
-
-	(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD,
-	    NULL);
+	/*
+	 * Initialize parent/child shared state.
+	 */
+	VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
+
+	zs->zs_thread_start = gethrtime();
+	zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
+	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+	zs->zs_thread_kill = zs->zs_thread_stop;
+	if (ztest_random(100) < zopt_killrate)
+		zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);
+
+	(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
 
 	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
 	    offsetof(ztest_cb_data_t, zcd_node));
 
-	for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
-		(void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
-
-	/*
-	 * Destroy one disk before we even start.
-	 * It's mirrored, so everything should work just fine.
-	 * This makes us exercise fault handling very early in spa_load().
-	 */
-	ztest_obliterate_one_disk(0);
-
-	/*
-	 * Verify that the sum of the sizes of all blocks in the pool
-	 * equals the SPA's allocated space total.
-	 */
-	ztest_verify_blocks(pool);
-
-	/*
-	 * Kick off a replacement of the disk we just obliterated.
-	 */
-	kernel_init(FREAD | FWRITE);
-	VERIFY(spa_open(pool, &spa, FTAG) == 0);
-	ztest_replace_one_disk(spa, 0);
-	if (zopt_verbose >= 5)
-		show_pool_stats(spa);
-	spa_close(spa, FTAG);
-	kernel_fini();
-
-	kernel_init(FREAD | FWRITE);
-
-	/*
-	 * Verify that we can export the pool and reimport it under a
-	 * different name.
-	 */
-	if (ztest_random(2) == 0) {
-		(void) snprintf(name, 100, "%s_import", pool);
-		ztest_spa_import_export(pool, name);
-		ztest_spa_import_export(name, pool);
-	}
-
-	/*
-	 * Verify that we can loop over all pools.
-	 */
-	mutex_enter(&spa_namespace_lock);
-	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
-		if (zopt_verbose > 3) {
-			(void) printf("spa_next: found %s\n", spa_name(spa));
-		}
-	}
-	mutex_exit(&spa_namespace_lock);
-
 	/*
 	 * Open our pool.
 	 */
-	VERIFY(spa_open(pool, &spa, FTAG) == 0);
+	kernel_init(FREAD | FWRITE);
+	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+	zs->zs_spa = spa;
+
+	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
 	/*
 	 * We don't expect the pool to suspend unless maxfaults == 0,
@@ -4259,13 +5003,19 @@
 	    &resume_tid) == 0);
 
 	/*
+	 * Create a deadman thread to abort() if we hang.
+	 */
+	VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
+	    NULL) == 0);
+
+	/*
 	 * Verify that we can safely inquire about about any object,
 	 * whether it's allocated or not.  To make it interesting,
 	 * we probe a 5-wide window around each power of two.
 	 * This hits all edge cases, including zero and the max.
 	 */
-	for (t = 0; t < 64; t++) {
-		for (d = -5; d <= 5; d++) {
+	for (int t = 0; t < 64; t++) {
+		for (int d = -5; d <= 5; d++) {
 			error = dmu_object_info(spa->spa_meta_objset,
 			    (1ULL << t) + d, NULL);
 			ASSERT(error == 0 || error == ENOENT ||
@@ -4274,104 +5024,45 @@
 	}
 
 	/*
-	 * Now kick off all the tests that run in parallel.
+	 * If we got any ENOSPC errors on the previous run, destroy something.
 	 */
+	if (zs->zs_enospc_count != 0) {
+		int d = ztest_random(zopt_datasets);
+		ztest_dataset_destroy(zs, d);
+	}
 	zs->zs_enospc_count = 0;
 
-	za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
+	tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL);
 
 	if (zopt_verbose >= 4)
 		(void) printf("starting main threads...\n");
 
-	za[0].za_start = gethrtime();
-	za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
-	za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
-	za[0].za_kill = za[0].za_stop;
-	if (ztest_random(100) < zopt_killrate)
-		za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
-
-	for (t = 0; t < zopt_threads; t++) {
-		d = t % zopt_datasets;
-
-		(void) strcpy(za[t].za_pool, pool);
-		za[t].za_os = za[d].za_os;
-		za[t].za_spa = spa;
-		za[t].za_zilog = za[d].za_zilog;
-		za[t].za_instance = t;
-		za[t].za_random = ztest_random(-1ULL);
-		za[t].za_start = za[0].za_start;
-		za[t].za_stop = za[0].za_stop;
-		za[t].za_kill = za[0].za_kill;
-
-		if (t < zopt_datasets) {
-			int test_future = FALSE;
-			(void) rw_rdlock(&ztest_shared->zs_name_lock);
-			(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-			error = dmu_objset_create(name, DMU_OST_OTHER, 0,
-			    ztest_create_cb, NULL);
-			if (error == EEXIST) {
-				test_future = TRUE;
-			} else if (error == ENOSPC) {
-				zs->zs_enospc_count++;
-				(void) rw_unlock(&ztest_shared->zs_name_lock);
-				break;
-			} else if (error != 0) {
-				fatal(0, "dmu_objset_create(%s) = %d",
-				    name, error);
-			}
-			error = dmu_objset_hold(name, FTAG, &za[d].za_os);
-			if (error)
-				fatal(0, "dmu_objset_open('%s') = %d",
-				    name, error);
-			(void) rw_unlock(&ztest_shared->zs_name_lock);
-			if (test_future)
-				ztest_dmu_check_future_leak(&za[t]);
-			zil_replay(za[d].za_os, za[d].za_os,
-			    ztest_replay_vector);
-			za[d].za_zilog = zil_open(za[d].za_os, NULL);
-		}
-
-		VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
-		    &za[t].za_thread) == 0);
+	/*
+	 * Kick off all the tests that run in parallel.
+	 */
+	for (int t = 0; t < zopt_threads; t++) {
+		if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+			return;
+		VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+		    THR_BOUND, &tid[t]) == 0);
 	}
 
-	while (--t >= 0) {
-		VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
-		if (t < zopt_datasets) {
-			zil_close(za[t].za_zilog);
-			dmu_objset_rele(za[t].za_os, FTAG);
-		}
-	}
-
-	if (zopt_verbose >= 3)
-		show_pool_stats(spa);
-
-	txg_wait_synced(spa_get_dsl(spa), 0);
-
-	zs->zs_alloc = spa_get_alloc(spa);
-	zs->zs_space = spa_get_space(spa);
-
 	/*
-	 * If we had out-of-space errors, destroy a random objset.
+	 * Wait for all of the tests to complete.  We go in reverse order
+	 * so we don't close datasets while threads are still using them.
 	 */
-	if (zs->zs_enospc_count != 0) {
-		(void) rw_rdlock(&ztest_shared->zs_name_lock);
-		d = (int)ztest_random(zopt_datasets);
-		(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-		if (zopt_verbose >= 3)
-			(void) printf("Destroying %s to free up space\n", name);
-
-		/* Cleanup any non-standard clones and snapshots */
-		ztest_dsl_dataset_cleanup(name, za[d].za_instance);
-
-		(void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
-		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
-		(void) rw_unlock(&ztest_shared->zs_name_lock);
+	for (int t = zopt_threads - 1; t >= 0; t--) {
+		VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+		if (t < zopt_datasets)
+			ztest_dataset_close(zs, t);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
-	umem_free(za, zopt_threads * sizeof (ztest_args_t));
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	umem_free(tid, zopt_threads * sizeof (thread_t));
 
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
@@ -4382,11 +5073,99 @@
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
-	for (t = 1; t < 50; t++)
-		dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
+	for (uint64_t object = 1; object < 50; object++)
+		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
 
 	spa_close(spa, FTAG);
 
+	/*
+	 * Verify that we can loop over all pools.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+		if (zopt_verbose > 3)
+			(void) printf("spa_next: found %s\n", spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Verify that we can export the pool and reimport it under a
+	 * different name.
+	 */
+	if (ztest_random(2) == 0) {
+		char name[MAXNAMELEN];
+		(void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+		ztest_spa_import_export(zs->zs_pool, name);
+		ztest_spa_import_export(name, zs->zs_pool);
+	}
+
+	kernel_fini();
+}
+
+static void
+ztest_freeze(ztest_shared_t *zs)
+{
+	ztest_ds_t *zd = &zs->zs_zd[0];
+	spa_t *spa;
+
+	if (zopt_verbose >= 3)
+		(void) printf("testing spa_freeze()...\n");
+
+	kernel_init(FREAD | FWRITE);
+	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+
+	/*
+	 * Force the first log block to be transactionally allocated.
+	 * We have to do this before we freeze the pool -- otherwise
+	 * the log chain won't be anchored.
+	 */
+	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+		ztest_dmu_object_alloc_free(zd, 0);
+		zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+	}
+
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Freeze the pool.  This stops spa_sync() from doing anything,
+	 * so that the only way to record changes from now on is the ZIL.
+	 */
+	spa_freeze(spa);
+
+	/*
+	 * Run tests that generate log records but don't alter the pool config
+	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+	 * We do a txg_wait_synced() after each iteration to force the txg
+	 * to increase well beyond the last synced value in the uberblock.
+	 * The ZIL should be OK with that.
+	 */
+	while (ztest_random(20) != 0) {
+		ztest_dmu_write_parallel(zd, 0);
+		ztest_dmu_object_alloc_free(zd, 0);
+		txg_wait_synced(spa_get_dsl(spa), 0);
+	}
+
+	/*
+	 * Commit all of the changes we just generated.
+	 */
+	zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Close our dataset and close the pool.
+	 */
+	ztest_dataset_close(zs, 0);
+	spa_close(spa, FTAG);
+	kernel_fini();
+
+	/*
+	 * Open and close the pool and dataset to induce log replay.
+	 */
+	kernel_init(FREAD | FWRITE);
+	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+	ztest_dataset_close(zs, 0);
+	spa_close(spa, FTAG);
 	kernel_fini();
 
 	list_destroy(&zcl.zcl_callbacks);
@@ -4424,41 +5203,40 @@
 
 /*
  * Create a storage pool with the given name and initial vdev size.
- * Then create the specified number of datasets in the pool.
+ * Then test spa_freeze() functionality.
  */
 static void
-ztest_init(char *pool)
+ztest_init(ztest_shared_t *zs)
 {
 	spa_t *spa;
-	int error;
 	nvlist_t *nvroot;
 
+	VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
+
 	kernel_init(FREAD | FWRITE);
 
 	/*
 	 * Create the storage pool.
 	 */
-	(void) spa_destroy(pool);
+	(void) spa_destroy(zs->zs_pool);
 	ztest_shared->zs_vdev_next_leaf = 0;
 	nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
 	    0, zopt_raidz, zopt_mirrors, 1);
-	error = spa_create(pool, nvroot, NULL, NULL, NULL);
+	VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
 
-	if (error)
-		fatal(0, "spa_create() = %d", error);
-	error = spa_open(pool, &spa, FTAG);
-	if (error)
-		fatal(0, "spa_open() = %d", error);
-
+	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
 	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
-
-	if (zopt_verbose >= 3)
-		show_pool_stats(spa);
-
 	spa_close(spa, FTAG);
 
 	kernel_fini();
+
+	ztest_run_zdb(zs->zs_pool);
+
+	ztest_freeze(zs);
+
+	ztest_run_zdb(zs->zs_pool);
 }
 
 int
@@ -4466,11 +5244,12 @@
 {
 	int kills = 0;
 	int iters = 0;
-	int i, f;
 	ztest_shared_t *zs;
+	size_t shared_size;
 	ztest_info_t *zi;
 	char timebuf[100];
 	char numbuf[6];
+	spa_t *spa;
 
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
@@ -4487,8 +5266,10 @@
 	if (zopt_init != 0)
 		(void) remove("/tmp/zpool.cache");
 
+	shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);
+
 	zs = ztest_shared = (void *)mmap(0,
-	    P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
+	    P2ROUNDUP(shared_size, getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
 
 	if (zopt_verbose >= 1) {
@@ -4501,46 +5282,43 @@
 	/*
 	 * Create and initialize our storage pool.
 	 */
-	for (i = 1; i <= zopt_init; i++) {
+	for (int i = 1; i <= zopt_init; i++) {
 		bzero(zs, sizeof (ztest_shared_t));
 		if (zopt_verbose >= 3 && zopt_init != 1)
 			(void) printf("ztest_init(), pass %d\n", i);
-		ztest_init(zopt_pool);
+		zs->zs_pool = zopt_pool;
+		ztest_init(zs);
 	}
 
-	/*
-	 * Initialize the call targets for each function.
-	 */
-	for (f = 0; f < ZTEST_FUNCS; f++) {
+	zs->zs_pool = zopt_pool;
+	zs->zs_proc_start = gethrtime();
+	zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
+
+	for (int f = 0; f < ZTEST_FUNCS; f++) {
 		zi = &zs->zs_info[f];
-
 		*zi = ztest_info[f];
-
-		if (*zi->zi_interval == 0)
-			zi->zi_call_target = UINT64_MAX;
+		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+			zi->zi_call_next = UINT64_MAX;
 		else
-			zi->zi_call_target = zopt_time / *zi->zi_interval;
+			zi->zi_call_next = zs->zs_proc_start +
+			    ztest_random(2 * zi->zi_interval[0] + 1);
 	}
 
-	zs->zs_start_time = gethrtime();
-	zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
-
 	/*
 	 * Run the tests in a loop.  These tests include fault injection
 	 * to verify that self-healing data works, and forced crashes
 	 * to verify that we never lose on-disk consistency.
 	 */
-	while (gethrtime() < zs->zs_stop_time) {
+	while (gethrtime() < zs->zs_proc_stop) {
 		int status;
 		pid_t pid;
-		char *tmp;
 
 		/*
 		 * Initialize the workload counters for each function.
 		 */
-		for (f = 0; f < ZTEST_FUNCS; f++) {
+		for (int f = 0; f < ZTEST_FUNCS; f++) {
 			zi = &zs->zs_info[f];
-			zi->zi_calls = 0;
+			zi->zi_call_count = 0;
 			zi->zi_call_time = 0;
 		}
 
@@ -4556,7 +5334,7 @@
 			struct rlimit rl = { 1024, 1024 };
 			(void) setrlimit(RLIMIT_NOFILE, &rl);
 			(void) enable_extended_FILE_stdio(-1, -1);
-			ztest_run(zopt_pool);
+			ztest_run(zs);
 			exit(0);
 		}
 
@@ -4589,8 +5367,8 @@
 		if (zopt_verbose >= 1) {
 			hrtime_t now = gethrtime();
 
-			now = MIN(now, zs->zs_stop_time);
-			print_time(zs->zs_stop_time - now, timebuf);
+			now = MIN(now, zs->zs_proc_stop);
+			print_time(zs->zs_proc_stop - now, timebuf);
 			nicenum(zs->zs_space, numbuf);
 
 			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
@@ -4600,7 +5378,7 @@
 			    (u_longlong_t)zs->zs_enospc_count,
 			    100.0 * zs->zs_alloc / zs->zs_space,
 			    numbuf,
-			    100.0 * (now - zs->zs_start_time) /
+			    100.0 * (now - zs->zs_proc_start) /
 			    (zopt_time * NANOSEC), timebuf);
 		}
 
@@ -4610,35 +5388,40 @@
 			    "Calls", "Time", "Function");
 			(void) printf("%7s %9s   %s\n",
 			    "-----", "----", "--------");
-			for (f = 0; f < ZTEST_FUNCS; f++) {
+			for (int f = 0; f < ZTEST_FUNCS; f++) {
 				Dl_info dli;
 
 				zi = &zs->zs_info[f];
 				print_time(zi->zi_call_time, timebuf);
 				(void) dladdr((void *)zi->zi_func, &dli);
 				(void) printf("%7llu %9s   %s\n",
-				    (u_longlong_t)zi->zi_calls, timebuf,
+				    (u_longlong_t)zi->zi_call_count, timebuf,
 				    dli.dli_sname);
 			}
 			(void) printf("\n");
 		}
 
 		/*
-		 * It's possible that we killed a child during a rename test, in
-		 * which case we'll have a 'ztest_tmp' pool lying around instead
-		 * of 'ztest'.  Do a blind rename in case this happened.
+		 * It's possible that we killed a child during a rename test,
+		 * in which case we'll have a 'ztest_tmp' pool lying around
+		 * instead of 'ztest'.  Do a blind rename in case this happened.
 		 */
-		tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
-		(void) strcpy(tmp, zopt_pool);
-		(void) strcat(tmp, "_tmp");
-		kernel_init(FREAD | FWRITE);
-		(void) spa_rename(tmp, zopt_pool);
+		kernel_init(FREAD);
+		if (spa_open(zopt_pool, &spa, FTAG) == 0) {
+			spa_close(spa, FTAG);
+		} else {
+			char tmpname[MAXNAMELEN];
+			kernel_fini();
+			kernel_init(FREAD | FWRITE);
+			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
+			    zopt_pool);
+			(void) spa_rename(tmpname, zopt_pool);
+		}
 		kernel_fini();
-		umem_free(tmp, strlen(tmp) + 1);
+
+		ztest_run_zdb(zopt_pool);
 	}
 
-	ztest_verify_blocks(zopt_pool);
-
 	if (zopt_verbose >= 1) {
 		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
diff -r 8aac17999e4d -r e2081f502306 usr/src/common/avl/avl.c
--- a/usr/src/common/avl/avl.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/avl/avl.c	Sun Nov 01 14:14:46 2009 -0800
@@ -19,13 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-
 /*
  * AVL - generic AVL tree implementation for kernel use
  *
@@ -243,7 +240,7 @@
  *	"void *"  of the found tree node
  */
 void *
-avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
 {
 	avl_node_t *node;
 	avl_node_t *prev = NULL;
diff -r 8aac17999e4d -r e2081f502306 usr/src/common/zfs/zfs_fletcher.c
--- a/usr/src/common/zfs/zfs_fletcher.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zfs_fletcher.c	Sun Nov 01 14:14:46 2009 -0800
@@ -128,6 +128,7 @@
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/byteorder.h>
+#include <sys/zio.h>
 #include <sys/spa.h>
 
 void
diff -r 8aac17999e4d -r e2081f502306 usr/src/common/zfs/zfs_prop.c
--- a/usr/src/common/zfs/zfs_prop.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zfs_prop.c	Sun Nov 01 14:14:46 2009 -0800
@@ -69,6 +69,18 @@
 		{ NULL }
 	};
 
+	static zprop_index_t dedup_table[] = {
+		{ "on",		ZIO_CHECKSUM_ON },
+		{ "off",	ZIO_CHECKSUM_OFF },
+		{ "verify",	ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+		{ "sha256",	ZIO_CHECKSUM_SHA256 },
+		{ "sha256,verify",
+				ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+		{ "fletcher4,verify",
+				ZIO_CHECKSUM_FLETCHER_4 | ZIO_CHECKSUM_VERIFY },
+		{ NULL }
+	};
+
 	static zprop_index_t compress_table[] = {
 		{ "on",		ZIO_COMPRESS_ON },
 		{ "off",	ZIO_COMPRESS_OFF },
@@ -83,6 +95,7 @@
 		{ "gzip-7",	ZIO_COMPRESS_GZIP_7 },
 		{ "gzip-8",	ZIO_COMPRESS_GZIP_8 },
 		{ "gzip-9",	ZIO_COMPRESS_GZIP_9 },
+		{ "zle",	ZIO_COMPRESS_ZLE },
 		{ NULL }
 	};
 
@@ -177,10 +190,15 @@
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
 	    checksum_table);
+	register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "on | off | verify | sha256[,verify] | fletcher4,verify", "DEDUP",
+	    dedup_table);
 	register_index(ZFS_PROP_COMPRESSION, "compression",
 	    ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
+	    "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS",
+	    compress_table);
 	register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "hidden | visible", "SNAPDIR", snapdir_table);
@@ -321,9 +339,9 @@
 
 	/* hidden properties */
 	register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+	    PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG");
 	register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
+	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
 	register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
 	register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
@@ -334,9 +352,10 @@
 	register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "GUID");
 	register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
-	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+	    "USERACCOUNTING");
 	register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+	    PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
 	register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
 
@@ -434,6 +453,12 @@
 	return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
 }
 
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+	return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
+
 /*
  * Returns TRUE if the property applies to any of the given dataset types.
  */
diff -r 8aac17999e4d -r e2081f502306 usr/src/common/zfs/zfs_prop.h
--- a/usr/src/common/zfs/zfs_prop.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zfs_prop.h	Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_ZFS_PROP_H
 #define	_ZFS_PROP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 
@@ -79,6 +77,7 @@
 					/* "zfs get" help message */
 	const zprop_index_t *pd_table;	/* for index properties, a table */
 					/* defining the possible values */
+	size_t pd_table_size;		/* number of entries in pd_table[] */
 } zprop_desc_t;
 
 /*
@@ -118,6 +117,7 @@
 int zprop_name_to_prop(const char *, zfs_type_t);
 int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
 int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
 const char *zprop_values(int, zfs_type_t);
 size_t zprop_width(int, boolean_t *, zfs_type_t);
 boolean_t zprop_valid_for_type(int, zfs_type_t);
diff -r 8aac17999e4d -r e2081f502306 usr/src/common/zfs/zpool_prop.c
--- a/usr/src/common/zfs/zpool_prop.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zpool_prop.c	Sun Nov 01 14:14:46 2009 -0800
@@ -84,6 +84,8 @@
 	    ZFS_TYPE_POOL, "<guid>", "GUID");
 	register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<state>", "HEALTH");
+	register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY,
+	    ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP");
 
 	/* default number properties */
 	register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
@@ -107,6 +109,8 @@
 	/* hidden properties */
 	register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+	register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_POOL, "DEDUPDITTO");
 }
 
 /*
@@ -166,6 +170,12 @@
 	return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
 }
 
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+	return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
 #ifndef _KERNEL
 
 const char *
diff -r 8aac17999e4d -r e2081f502306 usr/src/common/zfs/zprop_common.c
--- a/usr/src/common/zfs/zprop_common.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zprop_common.c	Sun Nov 01 14:14:46 2009 -0800
@@ -76,6 +76,8 @@
 	pd = &prop_tbl[prop];
 
 	ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+	ASSERT(name != NULL);
+	ASSERT(colname != NULL);
 
 	pd->pd_name = name;
 	pd->pd_propnum = prop;
@@ -89,6 +91,9 @@
 	pd->pd_rightalign = rightalign;
 	pd->pd_visible = visible;
 	pd->pd_table = idx_tbl;
+	pd->pd_table_size = 0;
+	while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+		pd->pd_table_size++;
 }
 
 void
@@ -307,6 +312,25 @@
 	return (-1);
 }
 
+/*
+ * Return a random valid property value.  Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+	zprop_desc_t *prop_tbl;
+	const zprop_index_t *idx_tbl;
+
+	ASSERT((uint_t)prop < zprop_get_numprops(type));
+	prop_tbl = zprop_get_proptable(type);
+	idx_tbl = prop_tbl[prop].pd_table;
+
+	if (idx_tbl == NULL)
+		return (seed);
+
+	return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
 const char *
 zprop_values(int prop, zfs_type_t type)
 {
diff -r 8aac17999e4d -r e2081f502306 usr/src/grub/capability
--- a/usr/src/grub/capability	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/capability	Sun Nov 01 14:14:46 2009 -0800
@@ -40,7 +40,7 @@
 # This file and the associated version are Solaris specific and are
 # not a part of the open source distribution of GRUB.
 #
-VERSION=12
+VERSION=13
 dboot
 xVM
 zfs
diff -r 8aac17999e4d -r e2081f502306 usr/src/grub/grub-0.97/stage2/fsys_zfs.c
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c	Sun Nov 01 14:14:46 2009 -0800
@@ -513,7 +513,7 @@
 	 * those are the onces that we first pay attention to when
 	 * chosing the bucket.
 	 */
-	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+	crc &= ~((1ULL << (64 - 28)) - 1);
 
 	return (crc);
 }
@@ -623,7 +623,8 @@
 	int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
 
 	/* Verify if this is a fat zap header block */
-	if (zap->zap_magic != (uint64_t)ZAP_MAGIC)
+	if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
+	    zap->zap_flags != 0)
 		return (ERR_FSYS_CORRUPT);
 
 	hash = zap_hash(zap->zap_salt, name);
diff -r 8aac17999e4d -r e2081f502306 usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h	Sun Nov 01 14:14:46 2009 -0800
@@ -17,15 +17,13 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_SPA_H
 #define	_SYS_SPA_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * General-purpose 32-bit and 64-bit bitfield encodings.
  */
@@ -65,6 +63,11 @@
 #define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
 
 /*
+ * Size of block to hold the configuration data (a packed nvlist)
+ */
+#define	SPA_CONFIG_BLOCKSIZE	(1 << 14)
+
+/*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
  * to support up to 4-way RAID-Z mirror mode with worst-case gang block
@@ -108,15 +111,15 @@
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 6	|E| lvl | type	| cksum | comp	|     PSIZE	|     LSIZE	|
+ * 6	|BDX|lvl| type	| cksum | comp	|     PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 9	|			padding					|
+ * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * a	|			birth txg				|
+ * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -140,25 +143,29 @@
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
- * E		endianness
+ * B		byteorder (endianness)
+ * D		dedup
+ * X		unused
+ * lvl		level of indirection
  * type		DMU object type
- * lvl		level of indirection
- * birth txg	transaction group in which the block was born
+ * phys birth	txg of block allocation; zero if same as logical birth txg
+ * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
-typedef struct blkptr {
-	dva_t		blk_dva[3];	/* 128-bit Data Virtual Address	*/
-	uint64_t	blk_prop;	/* size, compression, type, etc	*/
-	uint64_t	blk_pad[3];	/* Extra space for the future	*/
-	uint64_t	blk_birth;	/* transaction group at birth	*/
-	uint64_t	blk_fill;	/* fill count			*/
-	zio_cksum_t	blk_cksum;	/* 256-bit checksum		*/
-} blkptr_t;
-
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 
+typedef struct blkptr {
+	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+	uint64_t	blk_prop;	/* size, compression, type, etc	    */
+	uint64_t	blk_pad[2];	/* Extra space for the future	    */
+	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
+	uint64_t	blk_birth;	/* transaction group at birth	    */
+	uint64_t	blk_fill;	/* fill count			    */
+	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
+} blkptr_t;
+
 /*
  * Macros to get and set fields in a bp or DVA.
  */
@@ -182,8 +189,7 @@
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
-	(BP_IS_HOLE(bp) ? 0 : \
-	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
 #define	BP_SET_LSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
 
@@ -192,20 +198,35 @@
 #define	BP_SET_PSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
 
-#define	BP_GET_COMPRESS(bp)	BF64_GET((bp)->blk_prop, 32, 8)
-#define	BP_SET_COMPRESS(bp, x)	BF64_SET((bp)->blk_prop, 32, 8, x)
+#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 8)
+#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define	BP_GET_CHECKSUM(bp)		BF64_GET((bp)->blk_prop, 40, 8)
+#define	BP_SET_CHECKSUM(bp, x)		BF64_SET((bp)->blk_prop, 40, 8, x)
 
-#define	BP_GET_CHECKSUM(bp)	BF64_GET((bp)->blk_prop, 40, 8)
-#define	BP_SET_CHECKSUM(bp, x)	BF64_SET((bp)->blk_prop, 40, 8, x)
+#define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
+#define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
+#define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define	BP_GET_PROP_BIT_61(bp)		BF64_GET((bp)->blk_prop, 61, 1)
+#define	BP_SET_PROP_BIT_61(bp, x)	BF64_SET((bp)->blk_prop, 61, 1, x)
 
-#define	BP_GET_TYPE(bp)		BF64_GET((bp)->blk_prop, 48, 8)
-#define	BP_SET_TYPE(bp, x)	BF64_SET((bp)->blk_prop, 48, 8, x)
+#define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
+#define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define	BP_GET_BYTEORDER(bp)		(0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
-#define	BP_GET_LEVEL(bp)	BF64_GET((bp)->blk_prop, 56, 5)
-#define	BP_SET_LEVEL(bp, x)	BF64_SET((bp)->blk_prop, 56, 5, x)
+#define	BP_PHYSICAL_BIRTH(bp)		\
+	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
-#define	BP_GET_BYTEORDER(bp)	(0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define	BP_SET_BYTEORDER(bp, x)	BF64_SET((bp)->blk_prop, 63, 1, x)
+#define	BP_SET_BIRTH(bp, logical, physical)	\
+{						\
+	(bp)->blk_birth = (logical);		\
+	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
 
 #define	BP_GET_ASIZE(bp)	\
 	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -229,13 +250,18 @@
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
+#define	BP_EQUAL(bp1, bp2)	\
+	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
 #define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
 	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
 	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
 	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
 	((zc1).zc_word[3] - (zc2).zc_word[3])))
 
-
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
@@ -249,7 +275,6 @@
 #define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
-#define	BP_IS_OLDER(bp, txg)	(!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
 
 #define	BP_ZERO(bp)				\
 {						\
@@ -262,7 +287,7 @@
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
-	(bp)->blk_pad[2] = 0;			\
+	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
diff -r 8aac17999e4d -r e2081f502306 usr/src/grub/grub-0.97/stage2/zfs-include/zap_impl.h
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zap_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zap_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -17,18 +17,15 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
 #define	_SYS_ZAP_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #define	ZAP_MAGIC 0x2F52AB2ABULL
 
-#define	ZAP_HASHBITS		28
 #define	MZAP_ENT_LEN		64
 #define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
 #define	MZAP_MAX_BLKSHIFT	SPA_MAXBLOCKSHIFT
@@ -101,6 +98,8 @@
 	uint64_t zap_num_leafs;		/* number of leafs */
 	uint64_t zap_num_entries;	/* number of entries */
 	uint64_t zap_salt;		/* salt to stir into hash function */
+	uint64_t zap_normflags;		/* flags for u8_textprep_str() */
+	uint64_t zap_flags;		/* zap_flag_t */
 	/*
 	 * This structure is followed by padding, and then the embedded
 	 * pointer table.  The embedded pointer table takes up second
diff -r 8aac17999e4d -r e2081f502306 usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Sun Nov 01 14:14:46 2009 -0800
@@ -27,7 +27,7 @@
 /*
  * On-disk version number.
  */
-#define	SPA_VERSION			19ULL
+#define	SPA_VERSION			21ULL
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
diff -r 8aac17999e4d -r e2081f502306 usr/src/lib/libzfs/common/libzfs_dataset.c
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c	Sun Nov 01 14:14:46 2009 -0800
@@ -1814,8 +1814,9 @@
 	case ZFS_PROP_COMPRESSRATIO:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
-		(void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t)
-		    val / 100, (longlong_t)val % 100);
+		(void) snprintf(propbuf, proplen, "%llu.%02llux",
+		    (u_longlong_t)(val / 100),
+		    (u_longlong_t)(val % 100));
 		break;
 
 	case ZFS_PROP_TYPE:
diff -r 8aac17999e4d -r e2081f502306 usr/src/lib/libzfs/common/libzfs_pool.c
--- a/usr/src/lib/libzfs/common/libzfs_pool.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c	Sun Nov 01 14:14:46 2009 -0800
@@ -321,6 +321,12 @@
 			    (u_longlong_t)intval);
 			break;
 
+		case ZPOOL_PROP_DEDUPRATIO:
+			(void) snprintf(buf, len, "%llu.%02llux",
+			    (u_longlong_t)(intval / 100),
+			    (u_longlong_t)(intval % 100));
+			break;
+
 		case ZPOOL_PROP_HEALTH:
 			verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
 			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
diff -r 8aac17999e4d -r e2081f502306 usr/src/lib/libzpool/common/llib-lzpool
--- a/usr/src/lib/libzpool/common/llib-lzpool	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/lib/libzpool/common/llib-lzpool	Sun Nov 01 14:14:46 2009 -0800
@@ -47,6 +47,9 @@
 #include <sys/bplist.h>
 #include <sys/zfs_znode.h>
 #include <sys/arc.h>
+#include <sys/dbuf.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/Makefile.files
--- a/usr/src/uts/common/Makefile.files	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/Makefile.files	Sun Nov 01 14:14:46 2009 -0800
@@ -1296,6 +1296,8 @@
 	arc.o			\
 	bplist.o		\
 	dbuf.o			\
+	ddt.o			\
+	ddt_zap.o		\
 	dmu.o			\
 	dmu_send.o		\
 	dmu_object.o		\
@@ -1346,7 +1348,8 @@
 	zio.o			\
 	zio_checksum.o		\
 	zio_compress.o		\
-	zio_inject.o
+	zio_inject.o		\
+	zle.o
 
 ZFS_SHARED_OBJS +=		\
 	zfs_namecheck.o		\
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/vfs.c
--- a/usr/src/uts/common/fs/vfs.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/vfs.c	Sun Nov 01 14:14:46 2009 -0800
@@ -81,6 +81,7 @@
 #include <sys/console.h>
 #include <sys/reboot.h>
 #include <sys/attr.h>
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/lofi.h>
 #include <sys/bootprops.h>
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/arc.c
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/arc.c	Sun Nov 01 14:14:46 2009 -0800
@@ -119,7 +119,6 @@
 
 #include <sys/spa.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
@@ -133,6 +132,7 @@
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
+#include <zfs_fletcher.h>
 
 static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
@@ -178,7 +178,6 @@
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
@@ -347,7 +346,7 @@
 #define	ARCSTAT_INCR(stat, val) \
 	atomic_add_64(&arc_stats.stat.value.ui64, (val));
 
-#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
+#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
 
 #define	ARCSTAT_MAX(stat, val) {					\
@@ -381,7 +380,7 @@
 	}
 
 kstat_t			*arc_ksp;
-static arc_state_t 	*arc_anon;
+static arc_state_t	*arc_anon;
 static arc_state_t	*arc_mru;
 static arc_state_t	*arc_mru_ghost;
 static arc_state_t	*arc_mfu;
@@ -498,7 +497,6 @@
 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
-#define	ARC_STORED		(1 << 19)	/* has been store()d to */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -815,6 +813,7 @@
 {
 	arc_buf_hdr_t *buf = vbuf;
 
+	ASSERT(BUF_EMPTY(buf));
 	refcount_destroy(&buf->b_refcnt);
 	cv_destroy(&buf->b_cv);
 	mutex_destroy(&buf->b_freeze_lock);
@@ -1038,6 +1037,8 @@
 	ASSERT(new_state != old_state);
 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+	ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
+	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
@@ -1255,6 +1256,8 @@
 	arc_buf_hdr_t *hdr = from->b_hdr;
 	uint64_t size = hdr->b_size;
 
+	ASSERT(hdr->b_state != arc_anon);
+
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
@@ -1336,6 +1339,7 @@
 		arc_buf_contents_t type = buf->b_hdr->b_type;
 
 		arc_cksum_verify(buf);
+
 		if (!recycle) {
 			if (type == ARC_BUFC_METADATA) {
 				arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1387,34 +1391,36 @@
 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
 	ASSERT3P(hdr->b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-	ASSERT(!(hdr->b_flags & ARC_STORED));
-
-	if (hdr->b_l2hdr != NULL) {
-		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
-			/*
-			 * To prevent arc_free() and l2arc_evict() from
-			 * attempting to free the same buffer at the same time,
-			 * a FREE_IN_PROGRESS flag is given to arc_free() to
-			 * give it priority.  l2arc_evict() can't destroy this
-			 * header while we are waiting on l2arc_buflist_mtx.
-			 *
-			 * The hdr may be removed from l2ad_buflist before we
-			 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
-			 */
+	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
+	if (l2hdr != NULL) {
+		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+		/*
+		 * To prevent arc_free() and l2arc_evict() from
+		 * attempting to free the same buffer at the same time,
+		 * a FREE_IN_PROGRESS flag is given to arc_free() to
+		 * give it priority.  l2arc_evict() can't destroy this
+		 * header while we are waiting on l2arc_buflist_mtx.
+		 *
+		 * The hdr may be removed from l2ad_buflist before we
+		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+		 */
+		if (!buflist_held) {
 			mutex_enter(&l2arc_buflist_mtx);
-			if (hdr->b_l2hdr != NULL) {
-				list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
-				    hdr);
-			}
+			l2hdr = hdr->b_l2hdr;
+		}
+
+		if (l2hdr != NULL) {
+			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+			if (hdr->b_state == arc_l2c_only)
+				l2arc_hdr_stat_remove();
+			hdr->b_l2hdr = NULL;
+		}
+
+		if (!buflist_held)
 			mutex_exit(&l2arc_buflist_mtx);
-		} else {
-			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
-		}
-		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-		kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
-		if (hdr->b_state == arc_l2c_only)
-			l2arc_hdr_stat_remove();
-		hdr->b_l2hdr = NULL;
 	}
 
 	if (!BUF_EMPTY(hdr)) {
@@ -1466,10 +1472,13 @@
 
 		mutex_enter(hash_lock);
 		(void) remove_reference(hdr, hash_lock, tag);
-		if (hdr->b_datacnt > 1)
+		if (hdr->b_datacnt > 1) {
 			arc_buf_destroy(buf, FALSE, TRUE);
-		else
+		} else {
+			ASSERT(buf == hdr->b_buf);
+			ASSERT(buf->b_efunc == NULL);
 			hdr->b_flags |= ARC_BUF_AVAILABLE;
+		}
 		mutex_exit(hash_lock);
 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
 		int destroy_hdr;
@@ -1503,6 +1512,7 @@
 	int no_callback = (buf->b_efunc == NULL);
 
 	if (hdr->b_state == arc_anon) {
+		ASSERT(hdr->b_datacnt == 1);
 		arc_buf_free(buf, tag);
 		return (no_callback);
 	}
@@ -1517,6 +1527,7 @@
 			arc_buf_destroy(buf, FALSE, TRUE);
 	} else if (no_callback) {
 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+		ASSERT(buf->b_efunc == NULL);
 		hdr->b_flags |= ARC_BUF_AVAILABLE;
 	}
 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -2463,6 +2474,16 @@
 
 	arc_cksum_compute(buf, B_FALSE);
 
+	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+		/*
+		 * Only call arc_access on anonymous buffers.  This is because
+		 * if we've issued an I/O for an evicted buffer, we've already
+		 * called arc_access (to prevent any simultaneous readers from
+		 * getting confused).
+		 */
+		arc_access(hdr, hash_lock);
+	}
+
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2476,8 +2497,11 @@
 	hdr->b_acb = NULL;
 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
-	if (abuf == buf)
+	if (abuf == buf) {
+		ASSERT(buf->b_efunc == NULL);
+		ASSERT(hdr->b_datacnt == 1);
 		hdr->b_flags |= ARC_BUF_AVAILABLE;
+	}
 
 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
@@ -2498,14 +2522,6 @@
 	cv_broadcast(&hdr->b_cv);
 
 	if (hash_lock) {
-		/*
-		 * Only call arc_access on anonymous buffers.  This is because
-		 * if we've issued an I/O for an evicted buffer, we've already
-		 * called arc_access (to prevent any simultaneous readers from
-		 * getting confused).
-		 */
-		if (zio->io_error == 0 && hdr->b_state == arc_anon)
-			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
 		/*
@@ -2559,7 +2575,7 @@
  * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
@@ -2577,7 +2593,7 @@
 }
 
 int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
@@ -2588,7 +2604,8 @@
 	uint64_t guid = spa_guid(spa);
 
 top:
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+	    &hash_lock);
 	if (hdr && hdr->b_datacnt > 0) {
 
 		*arc_flags |= ARC_CACHED;
@@ -2642,6 +2659,7 @@
 			} else {
 				buf = arc_buf_clone(buf);
 			}
+
 		} else if (*arc_flags & ARC_PREFETCH &&
 		    refcount_count(&hdr->b_refcnt) == 0) {
 			hdr->b_flags |= ARC_PREFETCH;
@@ -2672,7 +2690,7 @@
 			buf = arc_buf_alloc(spa, size, private, type);
 			hdr = buf->b_hdr;
 			hdr->b_dva = *BP_IDENTITY(bp);
-			hdr->b_birth = bp->blk_birth;
+			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
 			exists = buf_hash_insert(hdr, &hash_lock);
 			if (exists) {
@@ -2718,7 +2736,6 @@
 			arc_get_data_buf(buf);
 			ASSERT(hdr->b_datacnt == 0);
 			hdr->b_datacnt = 1;
-
 		}
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -2848,6 +2865,9 @@
 	ASSERT(buf->b_hdr != NULL);
 	ASSERT(buf->b_hdr->b_state != arc_anon);
 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+	ASSERT(buf->b_efunc == NULL);
+	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
 	buf->b_efunc = func;
 	buf->b_private = private;
 }
@@ -2953,7 +2973,6 @@
 
 	/* this buffer is not on any list */
 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-	ASSERT(!(hdr->b_flags & ARC_STORED));
 
 	if (hdr->b_state == arc_anon) {
 		/* this buffer is already released */
@@ -3124,11 +3143,16 @@
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	hdr->b_acb = NULL;
-
-	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-	hdr->b_birth = zio->io_bp->blk_birth;
-	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+	ASSERT(hdr->b_acb == NULL);
+
+	if (zio->io_error == 0) {
+		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+	} else {
+		ASSERT(BUF_EMPTY(hdr));
+	}
+
 	/*
 	 * If the block to be written was all-zero, we may have
 	 * compressed it away.  In this case no write was performed
@@ -3139,6 +3163,8 @@
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
+		ASSERT(zio->io_error == 0);
+
 		arc_cksum_verify(buf);
 
 		exists = buf_hash_insert(hdr, &hash_lock);
@@ -3148,109 +3174,54 @@
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
-			if (!(zio->io_flags & ZIO_FLAG_IO_REWRITE) ||
-			    !DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
-			    BP_IDENTITY(zio->io_bp)) ||
-			    zio->io_bp_orig.blk_birth !=
-			    zio->io_bp->blk_birth) {
-				panic("bad overwrite, hdr=%p exists=%p",
-				    (void *)hdr, (void *)exists);
+			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+					panic("bad overwrite, hdr=%p exists=%p",
+					    (void *)hdr, (void *)exists);
+				ASSERT(refcount_is_zero(&exists->b_refcnt));
+				arc_change_state(arc_anon, exists, hash_lock);
+				mutex_exit(hash_lock);
+				arc_hdr_destroy(exists);
+				exists = buf_hash_insert(hdr, &hash_lock);
+				ASSERT3P(exists, ==, NULL);
+			} else {
+				/* Dedup */
+				ASSERT(hdr->b_datacnt == 1);
+				ASSERT(hdr->b_state == arc_anon);
+				ASSERT(BP_GET_DEDUP(zio->io_bp));
+				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
-
-			ASSERT(refcount_is_zero(&exists->b_refcnt));
-			arc_change_state(arc_anon, exists, hash_lock);
-			mutex_exit(hash_lock);
-			arc_hdr_destroy(exists);
-			exists = buf_hash_insert(hdr, &hash_lock);
-			ASSERT3P(exists, ==, NULL);
 		}
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 		/* if it's not anon, we are doing a scrub */
-		if (hdr->b_state == arc_anon)
+		if (!exists && hdr->b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
-	} else if (callback->awcb_done == NULL) {
-		int destroy_hdr;
-		/*
-		 * This is an anonymous buffer with no user callback,
-		 * destroy it if there are no active references.
-		 */
-		mutex_enter(&arc_eviction_mtx);
-		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
-		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-		mutex_exit(&arc_eviction_mtx);
-		if (destroy_hdr)
-			arc_hdr_destroy(hdr);
 	} else {
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	}
-	hdr->b_flags &= ~ARC_STORED;
-
-	if (callback->awcb_done) {
-		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
-		callback->awcb_done(zio, buf, callback->awcb_private);
-	}
+
+	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
-	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
-	/* Determine checksum setting */
-	if (ismd) {
-		/*
-		 * Metadata always gets checksummed.  If the data
-		 * checksum is multi-bit correctable, and it's not a
-		 * ZBT-style checksum, then it's suitable for metadata
-		 * as well.  Otherwise, the metadata checksum defaults
-		 * to fletcher4.
-		 */
-		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
-		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
-			zp->zp_checksum = wp->wp_oschecksum;
-		else
-			zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
-	} else {
-		zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
-		    wp->wp_oschecksum);
-	}
-
-	/* Determine compression setting */
-	if (ismd) {
-		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
-		 */
-		zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-		    ZIO_COMPRESS_LZJB;
-	} else {
-		zp->zp_compress = zio_compress_select(wp->wp_dncompress,
-		    wp->wp_oscompress);
-	}
-
-	zp->zp_type = wp->wp_type;
-	zp->zp_level = wp->wp_level;
-	zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
-	zio_prop_t zp;
 
 	ASSERT(ready != NULL);
+	ASSERT(done != NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-	ASSERT(hdr->b_acb == 0);
+	ASSERT(hdr->b_acb == NULL);
 	if (l2arc)
 		hdr->b_flags |= ARC_L2CACHE;
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3259,37 +3230,25 @@
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
-	write_policy(spa, wp, &zp);
-	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
 	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
 
 	return (zio);
 }
 
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
+void
+arc_free(spa_t *spa, const blkptr_t *bp)
 {
 	arc_buf_hdr_t *ab;
 	kmutex_t *hash_lock;
-	zio_t	*zio;
 	uint64_t guid = spa_guid(spa);
 
 	/*
-	 * If this buffer is in the cache, release it, so it
-	 * can be re-used.
+	 * If this buffer is in the cache, release it, so it can be re-used.
 	 */
-	ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+	    &hash_lock);
 	if (ab != NULL) {
-		/*
-		 * The checksum of blocks to free is not always
-		 * preserved (eg. on the deadlist).  However, if it is
-		 * nonzero, it should match what we have in the cache.
-		 */
-		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
-		    bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
-		    bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
 		if (ab->b_state != arc_anon)
 			arc_change_state(arc_anon, ab, hash_lock);
 		if (HDR_IO_IN_PROGRESS(ab)) {
@@ -3308,37 +3267,14 @@
 			ab->b_buf->b_efunc = NULL;
 			ab->b_buf->b_private = NULL;
 			mutex_exit(hash_lock);
-		} else if (refcount_is_zero(&ab->b_refcnt)) {
+		} else {
+			ASSERT(refcount_is_zero(&ab->b_refcnt));
 			ab->b_flags |= ARC_FREE_IN_PROGRESS;
 			mutex_exit(hash_lock);
 			arc_hdr_destroy(ab);
 			ARCSTAT_BUMP(arcstat_deleted);
-		} else {
-			/*
-			 * We still have an active reference on this
-			 * buffer.  This can happen, e.g., from
-			 * dbuf_unoverride().
-			 */
-			ASSERT(!HDR_IN_HASH_TABLE(ab));
-			ab->b_arc_access = 0;
-			bzero(&ab->b_dva, sizeof (dva_t));
-			ab->b_birth = 0;
-			ab->b_cksum0 = 0;
-			ab->b_buf->b_efunc = NULL;
-			ab->b_buf->b_private = NULL;
-			mutex_exit(hash_lock);
 		}
 	}
-
-	zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
-	if (arc_flags & ARC_WAIT)
-		return (zio_wait(zio));
-
-	ASSERT(arc_flags & ARC_NOWAIT);
-	zio_nowait(zio);
-
-	return (0);
 }
 
 static int
@@ -4237,7 +4173,7 @@
 	}
 	mutex_exit(&l2arc_buflist_mtx);
 
-	spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
 	dev->l2ad_evict = taddr;
 }
 
@@ -4397,15 +4333,15 @@
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
-	spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-		spa_l2cache_space_update(dev->l2ad_vdev, 0,
-		    dev->l2ad_end - dev->l2ad_hand);
+		vdev_space_update(dev->l2ad_vdev,
+		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
@@ -4556,7 +4492,7 @@
 	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2node));
 
-	spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 
 	/*
 	 * Add device to global list
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/bplist.c
--- a/usr/src/uts/common/fs/zfs/bplist.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/bplist.c	Sun Nov 01 14:14:46 2009 -0800
@@ -19,13 +19,27 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #include <sys/bplist.h>
 #include <sys/zfs_context.h>
 
+void
+bplist_init(bplist_t *bpl)
+{
+	bzero(bpl, sizeof (*bpl));
+	mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+bplist_fini(bplist_t *bpl)
+{
+	ASSERT(bpl->bpl_queue == NULL);
+	mutex_destroy(&bpl->bpl_lock);
+}
+
 static int
 bplist_hold(bplist_t *bpl)
 {
@@ -208,12 +222,13 @@
 	bparray[off].blk_fill = 0;
 
 	/* The bplist will compress better if we can leave off the checksum */
-	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+	if (!BP_GET_DEDUP(&bparray[off]))
+		bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
 
 	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
 	bpl->bpl_phys->bpl_entries++;
 	bpl->bpl_phys->bpl_bytes +=
-	    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
+	    bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
 	if (bpl->bpl_havecomp) {
 		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
 		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
@@ -223,8 +238,14 @@
 	return (0);
 }
 
+void
+bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	VERIFY(bplist_enqueue(bpl, bp, tx) == 0);
+}
+
 /*
- * Deferred entry; will be written later by bplist_sync().
+ * Deferred entry; will be processed later by bplist_sync().
  */
 void
 bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
@@ -240,7 +261,7 @@
 }
 
 void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx)
 {
 	bplist_q_t *bpq;
 
@@ -248,7 +269,7 @@
 	while ((bpq = bpl->bpl_queue) != NULL) {
 		bpl->bpl_queue = bpq->bpq_next;
 		mutex_exit(&bpl->bpl_lock);
-		VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
+		func(arg, &bpq->bpq_blk, tx);
 		kmem_free(bpq, sizeof (*bpq));
 		mutex_enter(&bpl->bpl_lock);
 	}
@@ -311,12 +332,12 @@
 }
 
 /*
- * Return (in *dasizep) the amount of space on the deadlist which is:
+ * Return (in *dsizep) the amount of space on the deadlist which is:
  * mintxg < blk_birth <= maxtxg
  */
 int
 bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t *dasizep)
+    uint64_t *dsizep)
 {
 	uint64_t size = 0;
 	uint64_t itor = 0;
@@ -331,19 +352,18 @@
 		mutex_enter(&bpl->bpl_lock);
 		err = bplist_hold(bpl);
 		if (err == 0)
-			*dasizep = bpl->bpl_phys->bpl_bytes;
+			*dsizep = bpl->bpl_phys->bpl_bytes;
 		mutex_exit(&bpl->bpl_lock);
 		return (err);
 	}
 
 	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
 		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
-			size +=
-			    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
+			size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp);
 		}
 	}
 	if (err == ENOENT)
 		err = 0;
-	*dasizep = size;
+	*dsizep = size;
 	return (err);
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dbuf.c
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Sun Nov 01 14:14:46 2009 -0800
@@ -38,10 +38,6 @@
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
-static zio_done_func_t dbuf_skip_write_ready;
-static zio_done_func_t dbuf_skip_write_done;
 
 /*
  * Global data structures and functions for the dbuf cache.
@@ -285,6 +281,7 @@
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn = db->db_dnode;
+	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
@@ -310,13 +307,19 @@
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
 
+	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+		ASSERT(dr->dr_dbuf == db);
+
+	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+		ASSERT(dr->dr_dbuf == db);
+
 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
-		dbuf_dirty_record_t *dr = db->db_data_pending;
+		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
@@ -505,11 +508,9 @@
 	if (DBUF_IS_L2CACHEABLE(db))
 		aflags |= ARC_L2CACHE;
 
-	zb.zb_objset = db->db_objset->os_dsl_dataset ?
-	    db->db_objset->os_dsl_dataset->ds_object : 0;
-	zb.zb_object = db->db.db_object;
-	zb.zb_level = db->db_level;
-	zb.zb_blkid = db->db_blkid;
+	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+	    db->db.db_object, db->db_level, db->db_blkid);
 
 	dbuf_add_ref(db, NULL);
 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
@@ -682,6 +683,7 @@
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
+	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -692,13 +694,12 @@
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
+	ASSERT(db->db_data_pending != dr);
+
 	/* free this block */
-	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
-		/* XXX can get silent EIO here */
-		(void) dsl_free(NULL,
-		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
-		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
-	}
+	if (!BP_IS_HOLE(bp))
+		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
+
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	/*
 	 * Release the already-written buffer, so we leave it in
@@ -961,7 +962,8 @@
 			 * we now need to reset its state.
 			 */
 			dbuf_unoverride(dr);
-			if (db->db.db_object != DMU_META_DNODE_OBJECT)
+			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+			    db->db_state != DB_NOFILL)
 				arc_buf_thaw(db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
@@ -1000,7 +1002,7 @@
 		 * Update the accounting.
 		 * Note: we delay "free accounting" until after we drop
 		 * the db_mtx.  This keeps us from grabbing other locks
-		 * (and possibly deadlocking) in bp_get_dasize() while
+		 * (and possibly deadlocking) in bp_get_dsize() while
 		 * also holding the db_mtx.
 		 */
 		dnode_willuse_space(dn, db->db.db_size, tx);
@@ -1079,7 +1081,7 @@
 	} else if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
-		    bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
 		/*
 		 * This is only a guess -- if the dbuf is dirty
 		 * in a previous txg, we don't know how much
@@ -1172,6 +1174,7 @@
 		return (0);
 	}
 	ASSERT(dr->dr_txg == txg);
+	ASSERT(dr->dr_dbuf == db);
 
 	/*
 	 * If this buffer is currently held, we cannot undirty
@@ -1231,7 +1234,7 @@
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		arc_buf_t *buf = db->db_buf;
 
-		ASSERT(arc_released(buf));
+		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
 		dbuf_set_data(db, NULL);
 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
 		dbuf_evict(db);
@@ -1649,13 +1652,12 @@
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
 			arc_buf_t *pbuf;
+			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
-			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
-			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
-			zb.zb_object = dn->dn_object;
-			zb.zb_level = 0;
-			zb.zb_blkid = blkid;
+
+			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+			    dn->dn_object, 0, blkid);
 
 			if (db)
 				pbuf = db->db_buf;
@@ -1801,9 +1803,20 @@
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
+	mutex_enter(&db->db_mtx);
+	dbuf_rele_and_unlock(db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
 	int64_t holds;
 
-	mutex_enter(&db->db_mtx);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	holds = refcount_remove(&db->db_holds, tag);
@@ -2056,12 +2069,12 @@
 		while (*drp != dr)
 			drp = &(*drp)->dr_next;
 		ASSERT(dr->dr_next == NULL);
+		ASSERT(dr->dr_dbuf == db);
 		*drp = dr->dr_next;
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_rele(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
@@ -2083,44 +2096,10 @@
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
 
-	/*
-	 * If this dbuf has already been written out via an immediate write,
-	 * just complete the write by copying over the new block pointer and
-	 * updating the accounting via the write-completion functions.
-	 */
-	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-		zio_t zio_fake;
-
-		zio_fake.io_private = &db;
-		zio_fake.io_error = 0;
-		zio_fake.io_bp = db->db_blkptr;
-		zio_fake.io_bp_orig = *db->db_blkptr;
-		zio_fake.io_txg = txg;
-		zio_fake.io_flags = 0;
-
-		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
-		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
-		db->db_data_pending = dr;
-		dr->dr_zio = &zio_fake;
-		mutex_exit(&db->db_mtx);
-
-		ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
-		    BP_IDENTITY(&zio_fake.io_bp_orig)) ||
-		    BP_IS_HOLE(zio_fake.io_bp));
-
-		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
-			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
-			    &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
-		dbuf_write_ready(&zio_fake, db->db_buf, db);
-		dbuf_write_done(&zio_fake, db->db_buf, db);
-
-		return;
-	}
-
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    refcount_count(&db->db_holds) > 1 &&
+	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
 	    *datap == db->db_buf) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
@@ -2177,130 +2156,27 @@
 	}
 }
 
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
-	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
-	dmu_buf_impl_t *parent = db->db_parent;
-	uint64_t txg = tx->tx_txg;
-	zbookmark_t zb;
-	writeprops_t wp = { 0 };
-	zio_t *zio;
-
-	if (!BP_IS_HOLE(db->db_blkptr) &&
-	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
-		/*
-		 * Private object buffers are released here rather
-		 * than in dbuf_dirty() since they are only modified
-		 * in the syncing context and we don't want the
-		 * overhead of making multiple copies of the data.
-		 */
-		arc_release(data, db);
-	} else if (db->db_state != DB_NOFILL) {
-		ASSERT(arc_released(data));
-		/* XXX why do we need to thaw here? */
-		arc_buf_thaw(data);
-	}
-
-	if (parent != dn->dn_dbuf) {
-		ASSERT(parent && parent->db_data_pending);
-		ASSERT(db->db_level == parent->db_level-1);
-		ASSERT(arc_released(parent->db_buf));
-		zio = parent->db_data_pending->dr_zio;
-	} else {
-		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
-		ASSERT3P(db->db_blkptr, ==,
-		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
-		zio = dn->dn_zio;
-	}
-
-	ASSERT(db->db_level == 0 || data == db->db_buf);
-	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
-	ASSERT(zio);
-
-	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
-	zb.zb_object = db->db.db_object;
-	zb.zb_level = db->db_level;
-	zb.zb_blkid = db->db_blkid;
-
-	wp.wp_type = dn->dn_type;
-	wp.wp_level = db->db_level;
-	wp.wp_copies = os->os_copies;
-	wp.wp_dncompress = dn->dn_compress;
-	wp.wp_oscompress = os->os_compress;
-	wp.wp_dnchecksum = dn->dn_checksum;
-	wp.wp_oschecksum = os->os_checksum;
-
-	if (BP_IS_OLDER(db->db_blkptr, txg))
-		(void) dsl_dataset_block_kill(
-		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
-	if (db->db_state == DB_NOFILL) {
-		zio_prop_t zp = { 0 };
-
-		write_policy(os->os_spa, &wp, &zp);
-		dr->dr_zio = zio_write(zio, os->os_spa,
-		    txg, db->db_blkptr, NULL,
-		    db->db.db_size, &zp, dbuf_skip_write_ready,
-		    dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-		    ZIO_FLAG_MUSTSUCCEED, &zb);
-	} else {
-		dr->dr_zio = arc_write(zio, os->os_spa, &wp,
-		    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
-		    data, dbuf_write_ready, dbuf_write_done, db,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-	}
-}
-
-/* wrapper function for dbuf_write_ready bypassing ARC */
-static void
-dbuf_skip_write_ready(zio_t *zio)
-{
-	blkptr_t *bp = zio->io_bp;
-
-	if (!BP_IS_GANG(bp))
-		zio_skip_write(zio);
-
-	dbuf_write_ready(zio, NULL, zio->io_private);
-}
-
-/* wrapper function for dbuf_write_done bypassing ARC */
-static void
-dbuf_skip_write_done(zio_t *zio)
-{
-	dbuf_write_done(zio, NULL, zio->io_private);
-}
-
 /* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
+	dnode_t *dn = db->db_dnode;
+	spa_t *spa = zio->io_spa;
+	int64_t delta;
 	uint64_t fill = 0;
-	int old_size, new_size, i;
+	int i;
 
 	ASSERT(db->db_blkptr == bp);
 
-	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
-	old_size = bp_get_dasize(os->os_spa, bp_orig);
-	new_size = bp_get_dasize(os->os_spa, bp);
-
-	dnode_diduse_space(dn, new_size - old_size);
+	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+	zio->io_prev_space_delta = delta;
 
 	if (BP_IS_HOLE(bp)) {
-		dsl_dataset_t *ds = os->os_dsl_dataset;
-		dmu_tx_t *tx = os->os_synctx;
-
-		if (bp_orig->blk_birth == tx->tx_txg)
-			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
-		ASSERT3U(bp->blk_fill, ==, 0);
+		ASSERT(bp->blk_fill == 0);
 		return;
 	}
 
@@ -2341,17 +2217,6 @@
 	bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
-
-	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
-		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
-	} else {
-		dsl_dataset_t *ds = os->os_dsl_dataset;
-		dmu_tx_t *tx = os->os_synctx;
-
-		if (bp_orig->blk_birth == tx->tx_txg)
-			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
-		dsl_dataset_block_born(ds, bp, tx);
-	}
 }
 
 /* ARGSUSED */
@@ -2359,37 +2224,50 @@
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	dnode_t *dn = db->db_dnode;
+	objset_t *os = dn->dn_objset;
 	uint64_t txg = zio->io_txg;
 	dbuf_dirty_record_t **drp, *dr;
 
 	ASSERT3U(zio->io_error, ==, 0);
+	ASSERT(db->db_blkptr == bp);
+
+	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+		ASSERT(BP_EQUAL(bp, bp_orig));
+	} else {
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		dmu_tx_t *tx = os->os_synctx;
+
+		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+		dsl_dataset_block_born(ds, bp, tx);
+	}
 
 	mutex_enter(&db->db_mtx);
 
+	DBUF_VERIFY(db);
+
 	drp = &db->db_last_dirty;
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_txg == txg);
+	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
 		if (db->db_state != DB_NOFILL) {
 			if (dr->dt.dl.dr_data != db->db_buf)
 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
 				    db) == 1);
-			else if (!BP_IS_HOLE(db->db_blkptr))
+			else if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
-			else
-				ASSERT(arc_released(db->db_buf));
 		}
 	} else {
-		dnode_t *dn = db->db_dnode;
-
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2410,9 +2288,122 @@
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+	dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+	dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+	dbuf_dirty_record_t *dr = zio->io_private;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+
+	dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+	dbuf_dirty_record_t *dr = zio->io_private;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+	mutex_enter(&db->db_mtx);
+	if (!BP_EQUAL(zio->io_bp, obp)) {
+		if (!BP_IS_HOLE(obp))
+			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+		arc_release(dr->dt.dl.dr_data, db);
+	}
 	mutex_exit(&db->db_mtx);
 
-	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+	dbuf_write_done(zio, NULL, db);
+}
+
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn = db->db_dnode;
+	objset_t *os = dn->dn_objset;
+	dmu_buf_impl_t *parent = db->db_parent;
+	uint64_t txg = tx->tx_txg;
+	zbookmark_t zb;
+	zio_prop_t zp;
+	zio_t *zio;
+
+	if (db->db_state != DB_NOFILL) {
+		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+			/*
+			 * Private object buffers are released here rather
+			 * than in dbuf_dirty() since they are only modified
+			 * in the syncing context and we don't want the
+			 * overhead of making multiple copies of the data.
+			 */
+			if (BP_IS_HOLE(db->db_blkptr)) {
+				arc_buf_thaw(data);
+			} else {
+				arc_release(data, db);
+			}
+		}
+	}
 
-	dbuf_rele(db, (void *)(uintptr_t)txg);
+	if (parent != dn->dn_dbuf) {
+		ASSERT(parent && parent->db_data_pending);
+		ASSERT(db->db_level == parent->db_level-1);
+		ASSERT(arc_released(parent->db_buf));
+		zio = parent->db_data_pending->dr_zio;
+	} else {
+		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+		ASSERT3P(db->db_blkptr, ==,
+		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		zio = dn->dn_zio;
+	}
+
+	ASSERT(db->db_level == 0 || data == db->db_buf);
+	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+	ASSERT(zio);
+
+	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	dmu_write_policy(os, dn, db->db_level,
+	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
+
+	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		ASSERT(db->db_state != DB_NOFILL);
+		dr->dr_zio = zio_write(zio, os->os_spa, txg,
+		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+		    dbuf_write_override_ready, dbuf_write_override_done, dr,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		mutex_enter(&db->db_mtx);
+		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+		    dr->dt.dl.dr_copies);
+		mutex_exit(&db->db_mtx);
+	} else if (db->db_state == DB_NOFILL) {
+		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+		dr->dr_zio = zio_write(zio, os->os_spa, txg,
+		    db->db_blkptr, NULL, db->db.db_size, &zp,
+		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE,
+		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+	} else {
+		ASSERT(arc_released(data));
+		dr->dr_zio = arc_write(zio, os->os_spa, txg,
+		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
+		    dbuf_write_ready, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	}
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/ddt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/ddt.c	Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,955 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+	&ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+	"ditto",
+	"duplicate",
+	"unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+	objset_t *os = ddt->ddt_os;
+	uint64_t *objectp = &ddt->ddt_object[type][class];
+	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+	char name[DDT_NAMELEN];
+
+	ddt_object_name(ddt, type, class, name);
+
+	ASSERT(*objectp == 0);
+	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+	ASSERT(*objectp != 0);
+
+	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+	    sizeof (uint64_t), 1, objectp, tx) == 0);
+
+	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+	    &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+	objset_t *os = ddt->ddt_os;
+	uint64_t *objectp = &ddt->ddt_object[type][class];
+	char name[DDT_NAMELEN];
+
+	ddt_object_name(ddt, type, class, name);
+
+	ASSERT(*objectp != 0);
+	ASSERT(ddt_object_count(ddt, type, class) == 0);
+	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+
+	*objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char name[DDT_NAMELEN];
+	int error;
+
+	ddt_object_name(ddt, type, class, name);
+
+	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+	if (error)
+		return (error);
+
+	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+	    &ddt->ddt_histogram[type][class]);
+
+	ASSERT(error == 0);
+	return (error);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+	char name[DDT_NAMELEN];
+
+	ddt_object_name(ddt, type, class, name);
+
+	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+	    &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde)
+{
+	if (!ddt_object_exists(ddt, type, class))
+		return (ENOENT);
+
+	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+	    ddt->ddt_object[type][class], dde));
+}
+
+static int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	ASSERT(ddt_object_exists(ddt, type, class));
+
+	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+	    ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	ASSERT(ddt_object_exists(ddt, type, class));
+
+	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+	    ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde, uint64_t *walk)
+{
+	ASSERT(ddt_object_exists(ddt, type, class));
+
+	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+	    ddt->ddt_object[type][class], dde, walk));
+}
+
+uint64_t
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	ASSERT(ddt_object_exists(ddt, type, class));
+
+	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+	    ddt->ddt_object[type][class]));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_object_info_t *doi)
+{
+	if (!ddt_object_exists(ddt, type, class))
+		return (ENOENT);
+
+	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+	    doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    char *name)
+{
+	(void) sprintf(name, DMU_POOL_DDT,
+	    zio_checksum_table[ddt->ddt_checksum].ci_name,
+	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+	ASSERT(txg != 0);
+
+	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+		bp->blk_dva[d] = ddp->ddp_dva[d];
+	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+void
+ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp,
+    blkptr_t *bp)
+{
+	BP_ZERO(bp);
+
+	if (ddp != NULL)
+		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+	bp->blk_cksum = ddk->ddk_cksum;
+
+	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+	BP_SET_CHECKSUM(bp, ddt->ddt_checksum);
+	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+	ddk->ddk_cksum = bp->blk_cksum;
+	ddk->ddk_prop = 0;
+
+	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+	ASSERT(ddp->ddp_phys_birth == 0);
+
+	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+		ddp->ddp_dva[d] = bp->blk_dva[d];
+	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+	bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+	ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+	ASSERT((int64_t)ddp->ddp_refcnt > 0);
+	ddp->ddp_refcnt--;
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+	blkptr_t blk;
+
+	ddt_bp_create(ddt, ddk, ddp, &blk);
+	ddt_phys_clear(ddp);
+	zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+			return (ddp);
+	}
+	return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+	uint64_t refcnt = 0;
+
+	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+		refcnt += dde->dde_phys[p].ddp_refcnt;
+
+	return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+	spa_t *spa = ddt->ddt_spa;
+	ddt_phys_t *ddp = dde->dde_phys;
+	ddt_key_t *ddk = &dde->dde_key;
+	uint64_t lsize = DDK_GET_LSIZE(ddk);
+	uint64_t psize = DDK_GET_PSIZE(ddk);
+
+	bzero(dds, sizeof (*dds));
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		uint64_t dsize = 0;
+		uint64_t refcnt = ddp->ddp_refcnt;
+
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+
+		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+		dds->dds_blocks += 1;
+		dds->dds_lsize += lsize;
+		dds->dds_psize += psize;
+		dds->dds_dsize += dsize;
+
+		dds->dds_ref_blocks += refcnt;
+		dds->dds_ref_lsize += lsize * refcnt;
+		dds->dds_ref_psize += psize * refcnt;
+		dds->dds_ref_dsize += dsize * refcnt;
+	}
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+	const uint64_t *s = (const uint64_t *)src;
+	uint64_t *d = (uint64_t *)dst;
+	uint64_t *d_end = (uint64_t *)(dst + 1);
+
+	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */
+
+	while (d < d_end)
+		*d++ += (*s++ ^ neg) - neg;
+}
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+	ddt_stat_t dds;
+	ddt_histogram_t *ddh;
+	int bucket;
+
+	ddt_stat_generate(ddt, dde, &dds);
+
+	bucket = highbit(dds.dds_ref_blocks) - 1;
+	ASSERT(bucket >= 0);
+
+	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+	for (int h = 0; h < 64; h++)
+		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+	bzero(dds, sizeof (*dds));
+
+	for (int h = 0; h < 64; h++)
+		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+	const uint64_t *s = (const uint64_t *)ddh;
+	const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+	while (s < s_end)
+		if (*s++ != 0)
+			return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+	ddt_histogram_t ddh_total = { 0 };
+	ddt_stat_t dds_total = { 0 };
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++) {
+				ddt_histogram_add(&ddh_total,
+				    &ddt->ddt_histogram[type][class]);
+			}
+		}
+	}
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	if (dds_total.dds_dsize == 0)
+		return (100);
+
+	return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+	spa_t *spa = ddt->ddt_spa;
+	uint64_t total_refcnt = 0;
+	uint64_t ditto = spa->spa_dedup_ditto;
+	int total_copies = 0;
+	int desired_copies = 0;
+
+	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+		ddt_phys_t *ddp = &dde->dde_phys[p];
+		zio_t *zio = dde->dde_lead_zio[p];
+		uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
+		if (zio != NULL)
+			refcnt += zio->io_parent_count;	/* pending refs */
+		if (ddp == ddp_willref)
+			refcnt++;			/* caller's ref */
+		if (refcnt != 0) {
+			total_refcnt += refcnt;
+			total_copies += p;
+		}
+	}
+
+	if (ditto == 0 || ditto > UINT32_MAX)
+		ditto = UINT32_MAX;
+
+	if (total_refcnt >= 1)
+		desired_copies++;
+	if (total_refcnt >= ditto)
+		desired_copies++;
+	if (total_refcnt >= ditto * ditto)
+		desired_copies++;
+
+	return (MAX(desired_copies, total_copies) - total_copies);
+}
+
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+	ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+	dva_t *dva = ddp->ddp_dva;
+	int copies = 0 - DVA_GET_GANG(dva);
+
+	for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+		if (DVA_IS_VALID(dva))
+			copies++;
+
+	ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+	return (copies);
+}
+
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+	uchar_t *version = dst++;
+	int cpfunc = ZIO_COMPRESS_ZLE;
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	size_t c_len;
+
+	ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */
+
+	c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+	if (c_len == s_len) {
+		cpfunc = ZIO_COMPRESS_OFF;
+		bcopy(src, dst, s_len);
+	}
+
+	*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+	return (c_len + 1);
+}
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+	uchar_t version = *src++;
+	int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+	if (ci->ci_decompress != NULL)
+		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+	else
+		bcopy(src, dst, d_len);
+
+	if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+		byteswap_uint64_array(dst, d_len);
+}
+
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+	return (spa->spa_ddt[c]);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+	mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+	mutex_exit(&ddt->ddt_lock);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+	ddt_entry_t *dde;
+
+	dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+	dde->dde_key = *ddk;
+
+	return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+	ASSERT(!dde->dde_loading);
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++)
+		ASSERT(dde->dde_lead_zio[p] == NULL);
+
+	if (dde->dde_repair_data != NULL)
+		zio_buf_free(dde->dde_repair_data,
+		    DDK_GET_PSIZE(&dde->dde_key));
+
+	cv_destroy(&dde->dde_cv);
+	kmem_free(dde, sizeof (*dde));
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+	avl_remove(&ddt->ddt_tree, dde);
+	ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+	ddt_entry_t *dde, dde_search;
+	enum ddt_type type;
+	enum ddt_class class;
+	avl_index_t where;
+	int error;
+
+	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+	ddt_key_fill(&dde_search.dde_key, bp);
+
+	dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+	if (dde == NULL) {
+		if (!add)
+			return (NULL);
+		dde = ddt_alloc(&dde_search.dde_key);
+		avl_insert(&ddt->ddt_tree, dde, where);
+	}
+
+	while (dde->dde_loading)
+		cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+	if (dde->dde_loaded)
+		return (dde);
+
+	dde->dde_loading = B_TRUE;
+
+	ddt_exit(ddt);
+
+	error = ENOENT;
+
+	for (type = 0; type < DDT_TYPES; type++) {
+		for (class = 0; class < DDT_CLASSES; class++) {
+			error = ddt_object_lookup(ddt, type, class, dde);
+			if (error != ENOENT)
+				break;
+		}
+		if (error != ENOENT)
+			break;
+	}
+
+	ASSERT(error == 0 || error == ENOENT);
+
+	ddt_enter(ddt);
+
+	ASSERT(dde->dde_loaded == B_FALSE);
+	ASSERT(dde->dde_loading == B_TRUE);
+
+	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
+	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
+	dde->dde_loaded = B_TRUE;
+	dde->dde_loading = B_FALSE;
+
+	if (error == 0)
+		ddt_stat_update(ddt, dde, -1ULL);
+
+	cv_broadcast(&dde->dde_cv);
+
+	return (dde);
+}
+
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+	const ddt_entry_t *dde1 = x1;
+	const ddt_entry_t *dde2 = x2;
+	const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+	const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
+
+	for (int i = 0; i < DDT_KEY_WORDS; i++) {
+		if (u1[i] < u2[i])
+			return (-1);
+		if (u1[i] > u2[i])
+			return (1);
+	}
+
+	return (0);
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+	ddt_t *ddt;
+
+	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&ddt->ddt_tree, ddt_entry_compare,
+	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+	avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+	ddt->ddt_checksum = c;
+	ddt->ddt_spa = spa;
+	ddt->ddt_os = spa->spa_meta_objset;
+
+	return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+	ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+	ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+	avl_destroy(&ddt->ddt_tree);
+	avl_destroy(&ddt->ddt_repair_tree);
+	mutex_destroy(&ddt->ddt_lock);
+	kmem_free(ddt, sizeof (*ddt));
+}
+
+void
+ddt_create(spa_t *spa)
+{
+	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+		spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+	int error;
+
+	ddt_create(spa);
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+	    &spa->spa_ddt_stat_object);
+
+	if (error)
+		return (error == ENOENT ? 0 : error);
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++) {
+				ddt_t *ddt = spa->spa_ddt[c];
+				error = ddt_object_load(ddt, type, class);
+				if (error != 0 && error != ENOENT)
+					return (error);
+			}
+		}
+	}
+
+	return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		if (spa->spa_ddt[c]) {
+			ddt_table_free(spa->spa_ddt[c]);
+			spa->spa_ddt[c] = NULL;
+		}
+	}
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+	ddt_key_t ddk;
+	ddt_entry_t *dde;
+
+	ddt_key_fill(&ddk, bp);
+
+	dde = ddt_alloc(&ddk);
+
+	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+			/*
+			 * We can only do repair if there are multiple copies
+			 * of the block.  For anything in the UNIQUE class,
+			 * there's definitely only one copy, so don't even try.
+			 */
+			if (class != DDT_CLASS_UNIQUE &&
+			    ddt_object_lookup(ddt, type, class, dde) == 0)
+				return (dde);
+		}
+	}
+
+	bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+	return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+	avl_index_t where;
+
+	ddt_enter(ddt);
+
+	if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+		avl_insert(&ddt->ddt_repair_tree, dde, where);
+	else
+		ddt_free(dde);
+
+	ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+	ddt_entry_t *rdde = zio->io_private;
+
+	ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+	ddt_phys_t *ddp = dde->dde_phys;
+	ddt_phys_t *rddp = rdde->dde_phys;
+	ddt_key_t *ddk = &dde->dde_key;
+	ddt_key_t *rddk = &rdde->dde_key;
+	zio_t *zio;
+	blkptr_t blk;
+
+	zio = zio_null(rio, rio->io_spa, NULL,
+	    ddt_repair_entry_done, rdde, rio->io_flags);
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+		if (ddp->ddp_phys_birth == 0 ||
+		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+		    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+			continue;
+		ddt_bp_create(ddt, ddk, ddp, &blk);
+		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+		    rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+	}
+
+	zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+	spa_t *spa = ddt->ddt_spa;
+	ddt_entry_t *dde, *rdde_next, *rdde;
+	avl_tree_t *t = &ddt->ddt_repair_tree;
+	blkptr_t blk;
+
+	if (spa_sync_pass(spa) > 1)
+		return;
+
+	ddt_enter(ddt);
+	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+		rdde_next = AVL_NEXT(t, rdde);
+		avl_remove(&ddt->ddt_repair_tree, rdde);
+		ddt_exit(ddt);
+		ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk);
+		dde = ddt_repair_start(ddt, &blk);
+		ddt_repair_entry(ddt, dde, rdde, rio);
+		ddt_repair_done(ddt, dde);
+		ddt_enter(ddt);
+	}
+	ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+	ddt_phys_t *ddp = dde->dde_phys;
+	ddt_key_t *ddk = &dde->dde_key;
+	enum ddt_type otype = dde->dde_type;
+	enum ddt_type ntype = DDT_TYPE_CURRENT;
+	enum ddt_class oclass = dde->dde_class;
+	enum ddt_class nclass;
+	uint64_t total_refcnt = 0;
+
+	ASSERT(dde->dde_loaded);
+	ASSERT(!dde->dde_loading);
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		ASSERT(dde->dde_lead_zio[p] == NULL);
+		ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+		if (ddp->ddp_phys_birth == 0) {
+			ASSERT(ddp->ddp_refcnt == 0);
+			continue;
+		}
+		if (p == DDT_PHYS_DITTO) {
+			if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+				ddt_phys_free(ddt, ddk, ddp, txg);
+			continue;
+		}
+		if (ddp->ddp_refcnt == 0)
+			ddt_phys_free(ddt, ddk, ddp, txg);
+		total_refcnt += ddp->ddp_refcnt;
+	}
+
+	if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+		nclass = DDT_CLASS_DITTO;
+	else if (total_refcnt > 1)
+		nclass = DDT_CLASS_DUPLICATE;
+	else
+		nclass = DDT_CLASS_UNIQUE;
+
+	if (otype != DDT_TYPES &&
+	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+		VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+		ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+	}
+
+	if (total_refcnt != 0) {
+		dde->dde_type = ntype;
+		dde->dde_class = nclass;
+		ddt_stat_update(ddt, dde, 0);
+		if (!ddt_object_exists(ddt, ntype, nclass))
+			ddt_object_create(ddt, ntype, nclass, tx);
+		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+	}
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+	spa_t *spa = ddt->ddt_spa;
+	ddt_entry_t *dde;
+	void *cookie = NULL;
+
+	if (avl_numnodes(&ddt->ddt_tree) == 0)
+		return;
+
+	ASSERT(spa_sync_pass(spa) == 1);
+	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+	if (spa->spa_ddt_stat_object == 0) {
+		spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+		    DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+		VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+		    &spa->spa_ddt_stat_object, tx) == 0);
+	}
+
+	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+		ddt_sync_entry(ddt, dde, tx, txg);
+		ddt_free(dde);
+	}
+
+	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+			if (!ddt_object_exists(ddt, type, class))
+				continue;
+			ddt_object_sync(ddt, type, class, tx);
+			if (ddt_object_count(ddt, type, class) == 0)
+				ddt_object_destroy(ddt, type, class, tx);
+		}
+	}
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+	dmu_tx_t *tx;
+	zio_t *rio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+	ASSERT(spa_syncing_txg(spa) == txg);
+
+	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (ddt == NULL)
+			continue;
+		ddt_sync_table(ddt, tx, txg);
+		ddt_repair_table(ddt, rio);
+	}
+
+	(void) zio_wait(rio);
+
+	dmu_tx_commit(tx);
+}
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/ddt_zap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/ddt_zap.c	Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,150 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <util/sscanf.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+	zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+	if (prehash)
+		flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+	*objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+	    ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+	    DMU_OT_NONE, 0, tx);
+
+	return (*objectp == 0 ? ENOTSUP : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+	uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+	uint64_t one, csize;
+	int error;
+
+	error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, &one, &csize);
+	if (error)
+		return (error);
+
+	ASSERT(one == 1);
+	ASSERT(csize <= sizeof (cbuf));
+
+	error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, 1, csize, cbuf);
+	if (error)
+		return (error);
+
+	ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+	return (0);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+	uint64_t csize;
+
+	csize = ddt_compress(dde->dde_phys, cbuf,
+	    sizeof (dde->dde_phys), sizeof (cbuf));
+
+	return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int error;
+
+	zap_cursor_init_serialized(&zc, os, object, *walk);
+	if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+		uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+		uint64_t csize = za.za_num_integers;
+		ASSERT(za.za_integer_length == 1);
+		error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+		    DDT_KEY_WORDS, 1, csize, cbuf);
+		ASSERT(error == 0);
+		if (error == 0) {
+			ddt_decompress(cbuf, dde->dde_phys, csize,
+			    sizeof (dde->dde_phys));
+			dde->dde_key = *(ddt_key_t *)za.za_name;
+		}
+		zap_cursor_advance(&zc);
+		*walk = zap_cursor_serialize(&zc);
+	}
+	zap_cursor_fini(&zc);
+	return (error);
+}
+
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+	uint64_t count = 0;
+
+	VERIFY(zap_count(os, object, &count) == 0);
+
+	return (count);
+}
+
+const ddt_ops_t ddt_zap_ops = {
+	"zap",
+	ddt_zap_create,
+	ddt_zap_destroy,
+	ddt_zap_lookup,
+	ddt_zap_update,
+	ddt_zap_remove,
+	ddt_zap_walk,
+	ddt_zap_count,
+};
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dmu.c
--- a/usr/src/uts/common/fs/zfs/dmu.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Sun Nov 01 14:14:46 2009 -0800
@@ -88,6 +88,8 @@
 	{	zap_byteswap,		TRUE,	"ZFS user/group used"	},
 	{	zap_byteswap,		TRUE,	"ZFS user/group quota"	},
 	{	zap_byteswap,		TRUE,	"snapshot refcount tags"},
+	{	zap_byteswap,		TRUE,	"DDT ZAP algorithm"	},
+	{	zap_byteswap,		TRUE,	"DDT statistics"	},
 };
 
 int
@@ -859,55 +861,114 @@
 }
 
 typedef struct {
-	dbuf_dirty_record_t	*dr;
-	dmu_sync_cb_t		*done;
-	void			*arg;
+	dbuf_dirty_record_t	*dsa_dr;
+	dmu_sync_cb_t		*dsa_done;
+	zgd_t			*dsa_zgd;
+	dmu_tx_t		*dsa_tx;
 } dmu_sync_arg_t;
 
 /* ARGSUSED */
 static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
+	dmu_sync_arg_t *dsa = varg;
+	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
 	blkptr_t *bp = zio->io_bp;
-	dmu_sync_arg_t *in = varg;
-	dbuf_dirty_record_t *dr = in->dr;
-	dmu_buf_impl_t *db = dr->dr_dbuf;
 
-	if (!BP_IS_HOLE(bp)) {
-		ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
-		ASSERT(BP_GET_LEVEL(bp) == 0);
-		bp->blk_fill = 1;
-	} else {
-		/*
-		 * dmu_sync() can compress a block of zeros to a null blkptr
-		 * but the block size still needs to be passed through to replay
-		 */
-		BP_SET_LSIZE(bp, db->db.db_size);
+	if (zio->io_error == 0) {
+		if (BP_IS_HOLE(bp)) {
+			/*
+			 * A block of zeros may compress to a hole, but the
+			 * block size still needs to be known for replay.
+			 */
+			BP_SET_LSIZE(bp, db->db_size);
+		} else {
+			ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+			ASSERT(BP_GET_LEVEL(bp) == 0);
+			bp->blk_fill = 1;
+		}
 	}
 }
 
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+	dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
 /* ARGSUSED */
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
-	dmu_sync_arg_t *in = varg;
-	dbuf_dirty_record_t *dr = in->dr;
+	dmu_sync_arg_t *dsa = varg;
+	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dmu_sync_cb_t *done = in->done;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
-	dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
-	if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
-		BP_ZERO(&dr->dt.dl.dr_overridden_by);
-	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+	if (zio->io_error == 0) {
+		dr->dt.dl.dr_overridden_by = *zio->io_bp;
+		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+			BP_ZERO(&dr->dt.dl.dr_overridden_by);
+	} else {
+		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
-	if (done)
-		done(&(db->db), in->arg);
+	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+	kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dmu_sync_arg_t *dsa = zio->io_private;
+
+	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+	}
+
+	dmu_tx_commit(dsa->dsa_tx);
+
+	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+	kmem_free(dsa, sizeof (*dsa));
+}
 
-	kmem_free(in, sizeof (dmu_sync_arg_t));
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+    zio_prop_t *zp, zbookmark_t *zb)
+{
+	dmu_sync_arg_t *dsa;
+	dmu_tx_t *tx;
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+	if (dmu_tx_assign(tx, TXG_NOWAIT) != 0) {
+		dmu_tx_abort(tx);
+		return (EIO);	/* Make zl_get_data do txg_waited_synced() */
+	}
+
+	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+	dsa->dsa_dr = NULL;
+	dsa->dsa_done = done;
+	dsa->dsa_zgd = zgd;
+	dsa->dsa_tx = tx;
+
+	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
+	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+	return (0);
 }
 
 /*
@@ -926,156 +987,108 @@
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
- *	EINPROGRESS: the IO has been initiated.
- *		The caller should log this blkptr in the callback.
+ *	EIO: could not do the I/O.
+ *		The caller should do a txg_wait_synced().
  *
- *	0: completed.  Sets *bp to the blkptr just written.
- *		The caller should log this blkptr immediately.
+ *	0: the I/O has been initiated.
+ *		The caller should log this blkptr in the done callback.
+ *		It is possible that the I/O will fail, in which case
+ *		the error will be reported to the done callback and
+ *		propagated to pio from zio_done().
  */
 int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
-    blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
-	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	blkptr_t *bp = zgd->zgd_bp;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
-	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
-	tx_state_t *tx = &dp->dp_tx;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr;
-	dmu_sync_arg_t *in;
+	dmu_sync_arg_t *dsa;
 	zbookmark_t zb;
-	writeprops_t wp = { 0 };
-	zio_t *zio;
-	int err;
+	zio_prop_t zp;
 
+	ASSERT(pio != NULL);
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(txg != 0);
 
-	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
-	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+	SET_BOOKMARK(&zb, ds->ds_object,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
 
 	/*
-	 * XXX - would be nice if we could do this without suspending...
+	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
-	txg_suspend(dp);
+	if (txg > spa_freeze_txg(os->os_spa))
+		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
-	 * If this txg already synced, there's nothing to do.
+	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+	 * and us.  If we determine that this txg is not yet syncing,
+	 * but it begins to sync a moment later, that's OK because the
+	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
-	if (txg <= tx->tx_synced_txg) {
-		txg_resume(dp);
+	mutex_enter(&db->db_mtx);
+
+	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
-		 * If we're running ziltest, we need the blkptr regardless.
+		 * This txg has already synced.  There's nothing to do.
 		 */
-		if (txg > spa_freeze_txg(dp->dp_spa)) {
-			/* if db_blkptr == NULL, this was an empty write */
-			if (db->db_blkptr)
-				*bp = *db->db_blkptr; /* structure assignment */
-			return (0);
-		}
+		mutex_exit(&db->db_mtx);
 		return (EEXIST);
 	}
 
-	mutex_enter(&db->db_mtx);
-
-	if (txg == tx->tx_syncing_txg) {
-		while (db->db_data_pending) {
-			/*
-			 * IO is in-progress.  Wait for it to finish.
-			 * XXX - would be nice to be able to somehow "attach"
-			 * this zio to the parent zio passed in.
-			 */
-			cv_wait(&db->db_changed, &db->db_mtx);
-			if (!db->db_data_pending &&
-			    db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
-				/*
-				 * IO was compressed away
-				 */
-				*bp = *db->db_blkptr; /* structure assignment */
-				mutex_exit(&db->db_mtx);
-				txg_resume(dp);
-				return (0);
-			}
-			ASSERT(db->db_data_pending ||
-			    (db->db_blkptr && db->db_blkptr->blk_birth == txg));
-		}
-
-		if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
-			/*
-			 * IO is already completed.
-			 */
-			*bp = *db->db_blkptr; /* structure assignment */
-			mutex_exit(&db->db_mtx);
-			txg_resume(dp);
-			return (0);
-		}
+	if (txg <= spa_syncing_txg(os->os_spa)) {
+		/*
+		 * This txg is currently syncing, so we can't mess with
+		 * the dirty record anymore; just write a new log block.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	dr = db->db_last_dirty;
-	while (dr && dr->dr_txg > txg)
+	while (dr && dr->dr_txg != txg)
 		dr = dr->dr_next;
-	if (dr == NULL || dr->dr_txg < txg) {
+
+	if (dr == NULL) {
 		/*
-		 * This dbuf isn't dirty, must have been free_range'd.
+		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
-		txg_resume(dp);
 		return (ENOENT);
 	}
 
 	ASSERT(dr->dr_txg == txg);
-	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
-		 * We have already issued a sync write for this buffer.
+		 * We have already issued a sync write for this buffer,
+		 * or this buffer has already been synced.  It could not
+		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
-		txg_resume(dp);
 		return (EALREADY);
-	} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-		/*
-		 * This buffer has already been synced.  It could not
-		 * have been dirtied since, or we would have cleared the state.
-		 */
-		*bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
-		mutex_exit(&db->db_mtx);
-		txg_resume(dp);
-		return (0);
 	}
 
+	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
-	in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
-	in->dr = dr;
-	in->done = done;
-	in->arg = arg;
 	mutex_exit(&db->db_mtx);
-	txg_resume(dp);
-
-	zb.zb_objset = os->os_dsl_dataset->ds_object;
-	zb.zb_object = db->db.db_object;
-	zb.zb_level = db->db_level;
-	zb.zb_blkid = db->db_blkid;
 
-	wp.wp_type = db->db_dnode->dn_type;
-	wp.wp_level = db->db_level;
-	wp.wp_copies = os->os_copies;
-	wp.wp_dnchecksum = db->db_dnode->dn_checksum;
-	wp.wp_oschecksum = os->os_checksum;
-	wp.wp_dncompress = db->db_dnode->dn_compress;
-	wp.wp_oscompress = os->os_compress;
-
-	ASSERT(BP_IS_HOLE(bp));
+	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+	dsa->dsa_dr = dr;
+	dsa->dsa_done = done;
+	dsa->dsa_zgd = zgd;
+	dsa->dsa_tx = NULL;
 
-	zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
-	    txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-	if (pio) {
-		zio_nowait(zio);
-		err = EINPROGRESS;
-	} else {
-		err = zio_wait(zio);
-		ASSERT(err == 0);
-	}
-	return (err);
+	zio_nowait(arc_write(pio, os->os_spa, txg,
+	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
+	    dmu_sync_ready, dmu_sync_done, dsa,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+
+	return (0);
 }
 
 int
@@ -1121,6 +1134,84 @@
 	dnode_rele(dn, FTAG);
 }
 
+int zfs_mdcomp_disable = 0;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+	boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata);
+	enum zio_checksum checksum = os->os_checksum;
+	enum zio_compress compress = os->os_compress;
+	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+	boolean_t dedup;
+	boolean_t dedup_verify = os->os_dedup_verify;
+	int copies = os->os_copies;
+
+	/*
+	 * Determine checksum setting.
+	 */
+	if (ismd) {
+		/*
+		 * Metadata always gets checksummed.  If the data
+		 * checksum is multi-bit correctable, and it's not a
+		 * ZBT-style checksum, then it's suitable for metadata
+		 * as well.  Otherwise, the metadata checksum defaults
+		 * to fletcher4.
+		 */
+		if (zio_checksum_table[checksum].ci_correctable < 1 ||
+		    zio_checksum_table[checksum].ci_zbt)
+			checksum = ZIO_CHECKSUM_FLETCHER_4;
+	} else {
+		checksum = zio_checksum_select(dn->dn_checksum, checksum);
+	}
+
+	/*
+	 * Determine compression setting.
+	 */
+	if (ismd) {
+		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+		    ZIO_COMPRESS_LZJB;
+	} else {
+		compress = zio_compress_select(dn->dn_compress, compress);
+	}
+
+	/*
+	 * Determine dedup setting.  If we are in dmu_sync(), we won't
+	 * actually dedup now because that's all done in syncing context;
+	 * but we do want to use the dedup checkum.  If the checksum is not
+	 * strong enough to ensure unique signatures, force dedup_verify.
+	 */
+	dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
+	if (dedup) {
+		checksum = dedup_checksum;
+		if (!zio_checksum_table[checksum].ci_dedup)
+			dedup_verify = 1;
+	}
+
+	if (wp & WP_DMU_SYNC)
+		dedup = 0;
+
+	if (wp & WP_NOFILL) {
+		ASSERT(!ismd && level == 0);
+		checksum = ZIO_CHECKSUM_OFF;
+		compress = ZIO_COMPRESS_OFF;
+		dedup = B_FALSE;
+	}
+
+	zp->zp_checksum = checksum;
+	zp->zp_compress = compress;
+	zp->zp_type = type;
+	zp->zp_level = level;
+	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+	zp->zp_dedup = dedup;
+	zp->zp_dedup_verify = dedup && dedup_verify;
+}
+
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
@@ -1155,21 +1246,27 @@
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
+	dnode_phys_t *dnp;
+
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
+	dnp = dn->dn_phys;
+
 	doi->doi_data_block_size = dn->dn_datablksz;
 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
 	    1ULL << dn->dn_indblkshift : 0;
+	doi->doi_type = dn->dn_type;
+	doi->doi_bonus_type = dn->dn_bonustype;
+	doi->doi_bonus_size = dn->dn_bonuslen;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
-	doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
-	    SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
-	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
-	doi->doi_type = dn->dn_type;
-	doi->doi_bonus_size = dn->dn_bonuslen;
-	doi->doi_bonus_type = dn->dn_bonustype;
+	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+	doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+	doi->doi_fill_count = 0;
+	for (int i = 0; i < dnp->dn_nblkptr; i++)
+		doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dmu_objset.c
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Sun Nov 01 14:14:46 2009 -0800
@@ -36,7 +36,6 @@
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
@@ -138,6 +137,24 @@
 }
 
 static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+	spa_t *spa = os->os_spa;
+	enum zio_checksum checksum;
+
+	/*
+	 * Inheritance should have been done by now.
+	 */
+	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
+}
+
+static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
@@ -209,10 +226,9 @@
 	if (!BP_IS_HOLE(os->os_rootbp)) {
 		uint32_t aflags = ARC_WAIT;
 		zbookmark_t zb;
-		zb.zb_objset = ds ? ds->ds_object : 0;
-		zb.zb_object = 0;
-		zb.zb_level = -1;
-		zb.zb_blkid = 0;
+		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
 		if (DMU_OS_IS_L2CACHEABLE(os))
 			aflags |= ARC_L2CACHE;
 
@@ -281,6 +297,9 @@
 				err = dsl_prop_register(ds, "copies",
 				    copies_changed_cb, os);
 			if (err == 0)
+				err = dsl_prop_register(ds, "dedup",
+				    dedup_changed_cb, os);
+			if (err == 0)
 				err = dsl_prop_register(ds, "logbias",
 				    logbias_changed_cb, os);
 		}
@@ -295,6 +314,9 @@
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 		os->os_compress = ZIO_COMPRESS_LZJB;
 		os->os_copies = spa_max_replication(spa);
+		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+		os->os_dedup_verify = 0;
+		os->os_logbias = 0;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
@@ -454,12 +476,9 @@
 dmu_objset_evict(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
-	int i;
 
-	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT(list_head(&os->os_dirty_dnodes[i]) == NULL);
-		ASSERT(list_head(&os->os_free_dnodes[i]) == NULL);
-	}
+	for (int t = 0; t < TXG_SIZE; t++)
+		ASSERT(!dmu_objset_is_dirty(os, t));
 
 	if (ds) {
 		if (!dsl_dataset_is_snapshot(ds)) {
@@ -469,6 +488,8 @@
 			    compression_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "copies",
 			    copies_changed_cb, os));
+			VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+			    dedup_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "logbias",
 			    logbias_changed_cb, os));
 		}
@@ -873,10 +894,9 @@
 
 /* ARGSUSED */
 static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	blkptr_t *bp = zio->io_bp;
-	blkptr_t *bp_orig = &zio->io_bp_orig;
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
@@ -893,14 +913,24 @@
 	bp->blk_fill = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	objset_t *os = arg;
 
 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
-		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
-		if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
-			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
-			    &zio->io_bp_orig, zio, os->os_synctx);
-		dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		dmu_tx_t *tx = os->os_synctx;
+
+		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+		dsl_dataset_block_born(ds, bp, tx);
 	}
 }
 
@@ -910,7 +940,7 @@
 {
 	int txgoff;
 	zbookmark_t zb;
-	writeprops_t wp = { 0 };
+	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	list_t *newlist = NULL;
@@ -934,26 +964,17 @@
 	/*
 	 * Create the root block IO
 	 */
-	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
-	zb.zb_object = 0;
-	zb.zb_level = -1;	/* for block ordering; it's level 0 on disk */
-	zb.zb_blkid = 0;
-
-	wp.wp_type = DMU_OT_OBJSET;
-	wp.wp_level = 0;	/* on-disk BP level; see above */
-	wp.wp_copies = os->os_copies;
-	wp.wp_oschecksum = os->os_checksum;
-	wp.wp_oscompress = os->os_compress;
-
-	if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
-		(void) dsl_dataset_block_kill(os->os_dsl_dataset,
-		    os->os_rootbp, pio, tx);
-	}
-
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
-	zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
-	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+	dmu_write_policy(os, NULL, 0, 0, &zp);
+
+	zio = arc_write(pio, os->os_spa, tx->tx_txg,
+	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
+	    dmu_objset_write_ready, dmu_objset_write_done, os,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
@@ -1002,6 +1023,13 @@
 	zio_nowait(zio);
 }
 
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+	return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
+	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
+}
+
 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 void
@@ -1431,10 +1459,8 @@
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
 
-			zb.zb_objset = ds->ds_object;
-			zb.zb_object = 0;
-			zb.zb_level = -1;
-			zb.zb_blkid = 0;
+			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 			(void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
 			    &ds->ds_phys->ds_bp, NULL, NULL,
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dmu_send.c
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Sun Nov 01 14:14:46 2009 -0800
@@ -150,9 +150,10 @@
 	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
 	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
 
+/* ARGSUSED */
 static int
-backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct backuparg *ba = arg;
 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
@@ -161,9 +162,10 @@
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (EINTR);
 
-	if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		return (0);
-	} else if (bp == NULL && zb->zb_object == 0) {
+	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
 		uint64_t span = BP_SPAN(dnp, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dmu_traverse.c
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Sun Nov 01 14:14:46 2009 -0800
@@ -35,14 +35,6 @@
 #include <sys/dmu_impl.h>
 #include <sys/callb.h>
 
-#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-	(zb)->zb_objset = objset;                       \
-	(zb)->zb_object = object;                       \
-	(zb)->zb_level = level;                         \
-	(zb)->zb_blkid = blkid;                         \
-}
-
 struct prefetch_data {
 	kmutex_t pd_mtx;
 	kcondvar_t pd_cv;
@@ -68,27 +60,28 @@
     arc_buf_t *buf, uint64_t objset, uint64_t object);
 
 /* ARGSUSED */
-static void
+static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	struct traverse_data *td = arg;
 	zbookmark_t zb;
 
 	if (bp->blk_birth == 0)
-		return;
+		return (0);
 
 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
-		return;
+		return (0);
 
-	zb.zb_objset = td->td_objset;
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-	(void) td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg);
+	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+	return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	struct traverse_data *td = arg;
@@ -99,17 +92,18 @@
 		zbookmark_t zb;
 
 		if (bp->blk_birth == 0)
-			return;
+			return (0);
 
 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
-			return;
+			return (0);
+
+		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
 
-		zb.zb_objset = td->td_objset;
-		zb.zb_object = lr->lr_foid;
-		zb.zb_level = BP_GET_LEVEL(bp);
-		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
-		(void) td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg);
+		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+		    td->td_arg);
 	}
+	return (0);
 }
 
 static void
@@ -120,7 +114,7 @@
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
-	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 * replayed; plus, in read-only mode, blocks that are already stable.
 	 */
 	if (claim_txg == 0 && spa_writeable(td->td_spa))
 		return;
@@ -143,7 +137,7 @@
 	struct prefetch_data *pd = td->td_pfd;
 
 	if (bp->blk_birth == 0) {
-		err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
 		return (err);
 	}
 
@@ -163,7 +157,7 @@
 	}
 
 	if (td->td_flags & TRAVERSE_PRE) {
-		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 		if (err)
 			return (err);
 	}
@@ -224,7 +218,8 @@
 		traverse_zil(td, &osp->os_zil_header);
 
 		dnp = &osp->os_meta_dnode;
-		err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+		    DMU_META_DNODE_OBJECT);
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			dnp = &osp->os_userused_dnode;
 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
@@ -241,7 +236,7 @@
 		(void) arc_buf_remove_ref(buf, &buf);
 
 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
-		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 
 	return (err);
 }
@@ -265,8 +260,8 @@
 
 /* ARGSUSED */
 static int
-traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct prefetch_data *pfd = arg;
 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
@@ -305,7 +300,8 @@
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
 
-	SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+	SET_BOOKMARK(&czb, td.td_objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
 
 	mutex_enter(&td_main->td_pfd->pd_mtx);
@@ -346,7 +342,8 @@
 	    &td, TQ_NOQUEUE))
 		pd.pd_exited = B_TRUE;
 
-	SET_BOOKMARK(&czb, objset, 0, -1, 0);
+	SET_BOOKMARK(&czb, objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
 
 	mutex_enter(&pd.pd_mtx);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dmu_tx.c
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Sun Nov 01 14:14:46 2009 -0800
@@ -163,38 +163,47 @@
 }
 
 static void
-dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
-    boolean_t freeable, dmu_buf_impl_t **history)
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 {
-	int i = db->db_level + 1;
-	dnode_t *dn = db->db_dnode;
+	objset_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	dmu_buf_impl_t *parent = NULL;
+	blkptr_t *bp = NULL;
+	uint64_t space;
 
-	if (i >= dn->dn_nlevels)
+	if (level >= dn->dn_nlevels || history[level] == blkid)
 		return;
 
-	db = db->db_parent;
-	if (db == NULL) {
-		uint64_t lvls = dn->dn_nlevels - i;
+	history[level] = blkid;
+
+	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 
-		txh->txh_space_towrite += lvls << dn->dn_indblkshift;
-		return;
+	if (db == NULL || db == dn->dn_dbuf) {
+		ASSERT(level != 0);
+		db = NULL;
+	} else {
+		ASSERT(db->db_dnode == dn);
+		ASSERT(db->db_level == level);
+		ASSERT(db->db.db_size == space);
+		ASSERT(db->db_blkid == blkid);
+		bp = db->db_blkptr;
+		parent = db->db_parent;
 	}
 
-	if (db != history[i]) {
-		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-		uint64_t space = 1ULL << dn->dn_indblkshift;
+	freeable = (bp && (freeable ||
+	    dsl_dataset_block_freeable(ds, bp->blk_birth)));
 
-		freeable = (db->db_blkptr && (freeable ||
-		    dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
-		if (freeable)
-			txh->txh_space_tooverwrite += space;
-		else
-			txh->txh_space_towrite += space;
-		if (db->db_blkptr)
-			txh->txh_space_tounref += space;
-		history[i] = db;
-		dmu_tx_count_indirects(txh, db, freeable, history);
-	}
+	if (freeable)
+		txh->txh_space_tooverwrite += space;
+	else
+		txh->txh_space_towrite += space;
+	if (bp)
+		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+	dmu_tx_count_twig(txh, dn, parent, level + 1,
+	    blkid >> epbs, freeable, history);
 }
 
 /* ARGSUSED */
@@ -215,7 +224,7 @@
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
 	if (dn) {
-		dmu_buf_impl_t *last[DN_MAX_LEVELS];
+		uint64_t history[DN_MAX_LEVELS];
 		int nlvls = dn->dn_nlevels;
 		int delta;
 
@@ -291,29 +300,18 @@
 		 * If this write is not off the end of the file
 		 * we need to account for overwrites/unref.
 		 */
-		if (start <= dn->dn_maxblkid)
-			bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+		if (start <= dn->dn_maxblkid) {
+			for (int l = 0; l < DN_MAX_LEVELS; l++)
+				history[l] = -1ULL;
+		}
 		while (start <= dn->dn_maxblkid) {
-			spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
-			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			db = dbuf_hold_level(dn, 0, start, FTAG);
 			rw_exit(&dn->dn_struct_rwlock);
-			if (db->db_blkptr && dsl_dataset_block_freeable(ds,
-			    db->db_blkptr->blk_birth)) {
-				dprintf_bp(db->db_blkptr, "can free old%s", "");
-				txh->txh_space_tooverwrite += dn->dn_datablksz;
-				txh->txh_space_tounref += dn->dn_datablksz;
-				dmu_tx_count_indirects(txh, db, TRUE, last);
-			} else {
-				txh->txh_space_towrite += dn->dn_datablksz;
-				if (db->db_blkptr)
-					txh->txh_space_tounref +=
-					    bp_get_dasize(spa, db->db_blkptr);
-				dmu_tx_count_indirects(txh, db, FALSE, last);
-			}
+			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+			    history);
 			dbuf_rele(db, FTAG);
 			if (++start > end) {
 				/*
@@ -461,7 +459,7 @@
 			bp += blkid + i;
 			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
 				dprintf_bp(bp, "can free old%s", "");
-				space += bp_get_dasize(spa, bp);
+				space += bp_get_dsize(spa, bp);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
@@ -538,7 +536,7 @@
 		for (i = 0; i < tochk; i++) {
 			if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
 				dprintf_bp(&bp[i], "can free old%s", "");
-				space += bp_get_dasize(spa, &bp[i]);
+				space += bp_get_dsize(spa, &bp[i]);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
@@ -583,6 +581,8 @@
 	if (len != DMU_OBJECT_END)
 		dmu_tx_count_write(txh, off+len, 1);
 
+	dmu_tx_count_dnode(txh);
+
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
@@ -625,7 +625,6 @@
 		}
 	}
 
-	dmu_tx_count_dnode(txh);
 	dmu_tx_count_free(txh, off, len);
 }
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dnode_sync.c
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Sun Nov 01 14:14:46 2009 -0800
@@ -120,7 +120,7 @@
 		if (BP_IS_HOLE(bp))
 			continue;
 
-		bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
+		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
 		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
 		bzero(bp, sizeof (blkptr_t));
 		blocks_freed += 1;
@@ -424,6 +424,9 @@
 		dmu_buf_impl_t *db = dr->dr_dbuf;
 		uint64_t txg = dr->dr_txg;
 
+		if (db->db_level != 0)
+			dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
 		mutex_enter(&db->db_mtx);
 		/* XXX - use dbuf_undirty()? */
 		list_remove(list, dr);
@@ -434,13 +437,9 @@
 			ASSERT(db->db_blkid == DB_BONUS_BLKID ||
 			    dr->dt.dl.dr_data == db->db_buf);
 			dbuf_unoverride(dr);
-			mutex_exit(&db->db_mtx);
-		} else {
-			mutex_exit(&db->db_mtx);
-			dnode_undirty_dbufs(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
-		dbuf_rele(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 	}
 }
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_dataset.c
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Sun Nov 01 14:14:46 2009 -0800
@@ -75,9 +75,9 @@
 }
 
 void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
-	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;
@@ -118,29 +118,26 @@
 }
 
 int
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
-    dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+    boolean_t async)
 {
-	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+	if (BP_IS_HOLE(bp))
+		return (0);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(bp->blk_birth <= tx->tx_txg);
+
+	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
-	ASSERT(pio != NULL);
-	ASSERT(dmu_tx_is_syncing(tx));
-	/* No block pointer => nothing to free */
-	if (BP_IS_HOLE(bp))
-		return (0);
-
 	ASSERT(used > 0);
 	if (ds == NULL) {
-		int err;
 		/*
 		 * Account for the meta-objset space in its placeholder
 		 * dataset.
 		 */
-		err = dsl_free(pio, tx->tx_pool,
-		    tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
-		ASSERT(err == 0);
+		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 		    -used, -compressed, -uncompressed, tx);
@@ -153,13 +150,10 @@
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
-		int err;
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing: %s", "");
-		err = dsl_free(pio, tx->tx_pool,
-		    tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
-		ASSERT(err == 0);
+		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		mutex_enter(&ds->ds_dir->dd_lock);
 		mutex_enter(&ds->ds_lock);
@@ -175,7 +169,18 @@
 		mutex_exit(&ds->ds_dir->dd_lock);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
-		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+		if (async) {
+			/*
+			 * We are here as part of zio's write done callback,
+			 * which means we're a zio interrupt thread.  We can't
+			 * call bplist_enqueue() now because it may block
+			 * waiting for I/O.  Instead, put bp on the deferred
+			 * queue and let dsl_pool_sync() finish the job.
+			 */
+			bplist_enqueue_deferred(&ds->ds_deadlist, bp);
+		} else {
+			VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    ds->ds_phys->ds_prev_snap_obj);
 		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
@@ -261,9 +266,9 @@
 	mutex_destroy(&ds->ds_lock);
 	mutex_destroy(&ds->ds_recvlock);
 	mutex_destroy(&ds->ds_opening_lock);
-	mutex_destroy(&ds->ds_deadlist.bpl_lock);
 	rw_destroy(&ds->ds_rwlock);
 	cv_destroy(&ds->ds_exclusive_cv);
+	bplist_fini(&ds->ds_deadlist);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
@@ -361,10 +366,9 @@
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
-		    NULL);
 		rw_init(&ds->ds_rwlock, 0, 0, 0);
 		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
+		bplist_init(&ds->ds_deadlist);
 
 		err = bplist_open(&ds->ds_deadlist,
 		    mos, ds->ds_phys->ds_deadlist_obj);
@@ -380,9 +384,9 @@
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
-			mutex_destroy(&ds->ds_deadlist.bpl_lock);
 			rw_destroy(&ds->ds_rwlock);
 			cv_destroy(&ds->ds_exclusive_cv);
+			bplist_fini(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
@@ -459,9 +463,9 @@
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
-			mutex_destroy(&ds->ds_deadlist.bpl_lock);
 			rw_destroy(&ds->ds_rwlock);
 			cv_destroy(&ds->ds_exclusive_cv);
+			bplist_fini(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err) {
 				dmu_buf_rele(dbuf, tag);
@@ -1238,31 +1242,31 @@
 
 struct killarg {
 	dsl_dataset_t *ds;
-	zio_t *zio;
 	dmu_tx_t *tx;
 };
 
 /* ARGSUSED */
 static int
-kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
+	dmu_tx_t *tx = ka->tx;
 
 	if (bp == NULL)
 		return (0);
 
-	if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
-	    (zb->zb_object != 0 && dnp == NULL)) {
+	if (zb->zb_level == ZB_ZIL_LEVEL) {
+		ASSERT(zilog != NULL);
 		/*
 		 * It's a block in the intent log.  It has no
 		 * accounting, so just free it.
 		 */
-		VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
-		    ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
+		ASSERT(zilog == NULL);
 		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
-		(void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
 
 	return (0);
@@ -1490,7 +1494,6 @@
 {
 	struct dsl_ds_destroyarg *dsda = arg1;
 	dsl_dataset_t *ds = dsda->ds;
-	zio_t *zio;
 	int err;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1577,8 +1580,6 @@
 		}
 	}
 
-	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
 	if (ds->ds_phys->ds_next_snap_obj != 0) {
 		blkptr_t bp;
 		dsl_dataset_t *ds_next;
@@ -1616,15 +1617,13 @@
 				    bp.blk_birth >
 				    ds_prev->ds_phys->ds_prev_snap_txg) {
 					ds_prev->ds_phys->ds_unique_bytes +=
-					    bp_get_dasize(dp->dp_spa, &bp);
+					    bp_get_dsize_sync(dp->dp_spa, &bp);
 				}
 			} else {
-				used += bp_get_dasize(dp->dp_spa, &bp);
+				used += bp_get_dsize_sync(dp->dp_spa, &bp);
 				compressed += BP_GET_PSIZE(&bp);
 				uncompressed += BP_GET_UCSIZE(&bp);
-				/* XXX check return value? */
-				(void) dsl_free(zio, dp, tx->tx_txg,
-				    &bp, NULL, NULL, ARC_NOWAIT);
+				dsl_free(dp, tx->tx_txg, &bp);
 			}
 		}
 
@@ -1726,7 +1725,6 @@
 		 * freed all the objects in open context.
 		 */
 		ka.ds = ds;
-		ka.zio = zio;
 		ka.tx = tx;
 		err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 		    TRAVERSE_POST, kill_blkptr, &ka);
@@ -1740,9 +1738,6 @@
 		}
 	}
 
-	err = zio_wait(zio);
-	ASSERT3U(err, ==, 0);
-
 	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
 		/* Erase the link in the dir */
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
@@ -2785,7 +2780,7 @@
 	if (err == 0) {
 		err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
 		    dsl_dataset_promote_sync, ds, &pa,
-		    2 + 2 * doi.doi_physical_blks);
+		    2 + 2 * doi.doi_physical_blocks_512);
 		if (err && pa.err_ds && conflsnap)
 			(void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
 	}
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_deleg.c
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c	Sun Nov 01 14:14:46 2009 -0800
@@ -75,8 +75,6 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/cred.h>
@@ -739,5 +737,5 @@
 boolean_t
 dsl_delegation_on(objset_t *os)
 {
-	return (os->os_spa->spa_delegation);
+	return (!!spa_delegation(os->os_spa));
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_dir.c
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Sun Nov 01 14:14:46 2009 -0800
@@ -32,6 +32,7 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
+#include <sys/metaslab.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
@@ -650,7 +651,8 @@
 		 * dsl_pool_adjustedsize()), something is very
 		 * wrong.
 		 */
-		ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
+		ASSERT3U(used, <=, metaslab_class_get_space(
+		    spa_normal_class(dd->dd_pool->dp_spa)));
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
@@ -736,8 +738,9 @@
 	 * removes to get through.
 	 */
 	if (dd->dd_parent == NULL) {
+		spa_t *spa = dd->dd_pool->dp_spa;
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
-		deferred = spa_get_defers(dd->dd_pool->dp_spa);
+		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
 		if (poolsize - deferred < quota) {
 			quota = poolsize - deferred;
 			retval = ENOSPC;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_pool.c
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Sun Nov 01 14:14:46 2009 -0800
@@ -346,6 +346,15 @@
 	}
 	err = zio_wait(zio);
 
+	/*
+	 * If anything was added to a deadlist during a zio done callback,
+	 * it had to be put on the deferred queue.  Enqueue it for real now.
+	 */
+	for (ds = list_head(&dp->dp_synced_datasets); ds;
+	    ds = list_next(&dp->dp_synced_datasets, ds))
+		bplist_sync(&ds->ds_deadlist,
+		    bplist_enqueue_cb, &ds->ds_deadlist, tx);
+
 	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
 		/*
 		 * No more sync tasks should have been added while we
@@ -422,16 +431,19 @@
 }
 
 void
-dsl_pool_zil_clean(dsl_pool_t *dp)
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	dsl_dataset_t *ds;
+	objset_t *os;
 
 	while (ds = list_head(&dp->dp_synced_datasets)) {
 		list_remove(&dp->dp_synced_datasets, ds);
-		ASSERT(ds->ds_objset != NULL);
-		zil_clean(ds->ds_objset->os_zil);
+		os = ds->ds_objset;
+		zil_clean(os->os_zil);
+		ASSERT(!dmu_objset_is_dirty(os, txg));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
+	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
 /*
@@ -460,7 +472,7 @@
 	 * cut the reservation in half to allow forward progress
 	 * (e.g. make it possible to rm(1) files from a full pool).
 	 */
-	space = spa_get_dspace(dp->dp_spa);
+	space = metaslab_class_get_dspace(spa_normal_class(dp->dp_spa));
 	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 	if (netfree)
 		resv >>= 1;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_prop.c
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c	Sun Nov 01 14:14:46 2009 -0800
@@ -31,7 +31,6 @@
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_scrub.c
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c	Sun Nov 01 14:14:46 2009 -0800
@@ -40,6 +40,8 @@
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
 
 typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
 
@@ -59,14 +61,6 @@
 	dsl_pool_scrub_clean_cb
 };
 
-#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-	(zb)->zb_objset = objset;                       \
-	(zb)->zb_object = object;                       \
-	(zb)->zb_level = level;                         \
-	(zb)->zb_blkid = blkid;                         \
-}
-
 /* ARGSUSED */
 static void
 dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
@@ -126,6 +120,7 @@
 	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
 	dp->dp_scrub_restart = B_FALSE;
+	dp->dp_scrub_ditto = B_FALSE;
 	dp->dp_spa->spa_scrub_errors = 0;
 
 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -241,15 +236,13 @@
 	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
 }
 
-int
-dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 {
 	/*
 	 * This function will be used by bp-rewrite wad to intercept frees.
 	 */
-	return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
-	    done, private, arc_flags));
+	zio_free(dp->dp_spa, txg, bpp);
 }
 
 static boolean_t
@@ -267,14 +260,14 @@
 	uint64_t zb1nextL0, zb2thisobj;
 
 	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb1->zb_object != -1ULL);
+	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
 	ASSERT(zb2->zb_level == 0);
 
 	/*
 	 * A bookmark in the deadlist is considered to be after
 	 * everything else.
 	 */
-	if (zb2->zb_object == -1ULL)
+	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
 		return (B_TRUE);
 
 	/* The objset_phys_t isn't before anything. */
@@ -287,7 +280,7 @@
 	zb2thisobj = zb2->zb_object ? zb2->zb_object :
 	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
 
-	if (zb1->zb_object == 0) {
+	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
 		uint64_t nextobj = zb1nextL0 *
 		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
 		return (nextobj <= zb2thisobj);
@@ -297,7 +290,7 @@
 		return (B_TRUE);
 	if (zb1->zb_object > zb2thisobj)
 		return (B_FALSE);
-	if (zb2->zb_object == 0)
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
 		return (B_FALSE);
 	return (zb1nextL0 <= zb2->zb_blkid);
 }
@@ -339,7 +332,7 @@
 } zil_traverse_arg_t;
 
 /* ARGSUSED */
-static void
+static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zil_traverse_arg_t *zta = arg;
@@ -348,7 +341,7 @@
 	zbookmark_t zb;
 
 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
-		return;
+		return (0);
 
 	/*
 	 * One block ("stubby") can be allocated a long time ago; we
@@ -357,17 +350,17 @@
 	 * plain scrub there's nothing to do to it).
 	 */
 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
-		return;
+		return (0);
 
-	zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
 	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+	return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	if (lrc->lrc_txtype == TX_WRITE) {
@@ -379,7 +372,7 @@
 		zbookmark_t zb;
 
 		if (bp->blk_birth <= dp->dp_scrub_min_txg)
-			return;
+			return (0);
 
 		/*
 		 * birth can be < claim_txg if this record's txg is
@@ -387,14 +380,15 @@
 		 * other records that are not synced)
 		 */
 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
-			return;
+			return (0);
 
-		zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-		zb.zb_object = lr->lr_foid;
-		zb.zb_level = BP_GET_LEVEL(bp);
-		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
 		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
 	}
+	return (0);
 }
 
 static void
@@ -522,12 +516,12 @@
 		traverse_zil(dp, &osp->os_zil_header);
 
 		scrub_visitdnode(dp, &osp->os_meta_dnode,
-		    buf, zb->zb_objset, 0);
+		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			scrub_visitdnode(dp, &osp->os_userused_dnode,
-			    buf, zb->zb_objset, 0);
+			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
 			scrub_visitdnode(dp, &osp->os_groupused_dnode,
-			    buf, zb->zb_objset, 0);
+			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
 		}
 	}
 
@@ -556,7 +550,8 @@
 {
 	zbookmark_t zb;
 
-	SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
+	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	scrub_visitbp(dp, NULL, NULL, bp, &zb);
 }
 
@@ -569,7 +564,8 @@
 		return;
 
 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-		SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
+		SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET,
+		    0, 0, 0);
 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
 	    ds->ds_object, tx) != 0) {
 		return;
@@ -775,6 +771,36 @@
 	return (0);
 }
 
+static void
+dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type,
+    enum ddt_class class)
+{
+	ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c);
+	ddt_entry_t dde;
+	blkptr_t blk;
+	zbookmark_t zb = { 0 };
+	uint64_t walk = 0;
+	int error;
+
+	if (!ddt_object_exists(ddt, type, class))
+		return;
+
+	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
+		int p = DDT_PHYS_DITTO;
+		ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk);
+		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+	}
+	ASSERT(error == ENOENT);
+}
+
+static void
+dsl_pool_scrub_ditto(dsl_pool_t *dp)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+			dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO);
+}
+
 void
 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
@@ -814,7 +840,12 @@
 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
 	spa->spa_scrub_active = B_TRUE;
 
-	if (dp->dp_scrub_bookmark.zb_objset == 0) {
+	if (!dp->dp_scrub_ditto) {
+		dsl_pool_scrub_ditto(dp);
+		dp->dp_scrub_ditto = B_TRUE;
+	}
+
+	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
 		/* First do the MOS & ORIGIN */
 		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
 		if (dp->dp_scrub_pausing)
@@ -827,12 +858,12 @@
 			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
 		}
 		ASSERT(!dp->dp_scrub_pausing);
-	} else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
+	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
 		/*
-		 * If we were paused, continue from here.  Note if the
-		 * ds we were paused on was deleted, the zb_objset will
-		 * be -1, so we will skip this and find a new objset
-		 * below.
+		 * If we were paused, continue from here.  Note if the ds
+		 * we were paused on was destroyed, the zb_objset will be
+		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
+		 * objset below.
 		 */
 		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
 		if (dp->dp_scrub_pausing)
@@ -961,13 +992,13 @@
 {
 	size_t size = BP_GET_PSIZE(bp);
 	spa_t *spa = dp->dp_spa;
+	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
 	boolean_t needs_io;
 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	int zio_priority;
 
-	ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
-
-	if (bp->blk_birth >= dp->dp_scrub_max_txg)
+	if (phys_birth <= dp->dp_scrub_min_txg ||
+	    phys_birth >= dp->dp_scrub_max_txg)
 		return (0);
 
 	count_block(dp->dp_blkstats, bp);
@@ -985,7 +1016,7 @@
 	}
 
 	/* If it's an intent log block, failure is expected. */
-	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+	if (zb->zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
@@ -1015,7 +1046,7 @@
 				needs_io = B_TRUE;
 			} else {
 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
-				    bp->blk_birth, 1);
+				    phys_birth, 1);
 			}
 		}
 	}
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/dsl_synctask.c
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c	Sun Nov 01 14:14:46 2009 -0800
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
@@ -118,8 +116,10 @@
 
 	txg_wait_synced(dstg->dstg_pool, txg);
 
-	if (dstg->dstg_err == EAGAIN)
+	if (dstg->dstg_err == EAGAIN) {
+		txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
 		goto top;
+	}
 
 	return (dstg->dstg_err);
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/lzjb.c
--- a/usr/src/uts/common/fs/zfs/lzjb.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/lzjb.c	Sun Nov 01 14:14:46 2009 -0800
@@ -20,18 +20,18 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * We keep our own copy of this algorithm for 2 main reasons:
- * 	1. If we didn't, anyone modifying common/os/compress.c would
+ *	1. If we didn't, anyone modifying common/os/compress.c would
  *         directly break our on disk format
- * 	2. Our version of lzjb does not have a number of checks that the
+ *	2. Our version of lzjb does not have a number of checks that the
  *         common/os version needs and uses
+ *	3. We initialize the lempel to ensure deterministic results,
+ *	   so that identical blocks can always be deduplicated.
  * In particular, we are adding the "feature" that compress() can
  * take a destination buffer size and return -1 if the data will not
  * compress to d_len or less.
@@ -43,7 +43,7 @@
 #define	MATCH_MIN	3
 #define	MATCH_MAX	((1 << MATCH_BITS) + (MATCH_MIN - 1))
 #define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
-#define	LEMPEL_SIZE	256
+#define	LEMPEL_SIZE	1024
 
 /*ARGSUSED*/
 size_t
@@ -53,20 +53,14 @@
 	uchar_t *dst = d_start;
 	uchar_t *cpy, *copymap;
 	int copymask = 1 << (NBBY - 1);
-	int mlen, offset;
+	int mlen, offset, hash;
 	uint16_t *hp;
-	uint16_t lempel[LEMPEL_SIZE];	/* uninitialized; see above */
+	uint16_t lempel[LEMPEL_SIZE] = { 0 };
 
 	while (src < (uchar_t *)s_start + s_len) {
 		if ((copymask <<= 1) == (1 << NBBY)) {
-			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
-				if (d_len != s_len)
-					return (s_len);
-				mlen = s_len;
-				for (src = s_start, dst = d_start; mlen; mlen--)
-					*dst++ = *src++;
+			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
 				return (s_len);
-			}
 			copymask = 1;
 			copymap = dst;
 			*dst++ = 0;
@@ -75,8 +69,10 @@
 			*dst++ = *src++;
 			continue;
 		}
-		hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
-		    (LEMPEL_SIZE - 1)];
+		hash = (src[0] << 16) + (src[1] << 8) + src[2];
+		hash += hash >> 9;
+		hash += hash >> 5;
+		hp = &lempel[hash & (LEMPEL_SIZE - 1)];
 		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
 		*hp = (uint16_t)(uintptr_t)src;
 		cpy = src - offset;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/metaslab.c
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Sun Nov 01 14:14:46 2009 -0800
@@ -24,7 +24,6 @@
  */
 
 #include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
@@ -36,6 +35,11 @@
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ */
+static int metaslab_debug = 0;
+
+/*
  * Minimum size which forces the dynamic allocator to change
  * it's allocation strategy. Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
@@ -153,6 +157,45 @@
 	return (0);
 }
 
+void
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
+{
+	atomic_add_64(&mc->mc_alloc, alloc_delta);
+	atomic_add_64(&mc->mc_deferred, defer_delta);
+	atomic_add_64(&mc->mc_space, space_delta);
+	atomic_add_64(&mc->mc_dspace, dspace_delta);
+
+	ASSERT((int64_t)mc->mc_alloc >= 0 &&
+	    (int64_t)mc->mc_deferred >= 0 &&
+	    (int64_t)mc->mc_space >= 0 &&
+	    (int64_t)mc->mc_dspace >= 0);
+}
+
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+	return (mc->mc_alloc);
+}
+
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+	return (mc->mc_deferred);
+}
+
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+	return (mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
+}
+
 /*
  * ==========================================================================
  * Metaslab groups
@@ -493,6 +536,13 @@
 
 	metaslab_group_add(mg, msp);
 
+	if (metaslab_debug && smo->smo_object != 0) {
+		mutex_enter(&msp->ms_lock);
+		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
@@ -515,8 +565,8 @@
 {
 	metaslab_group_t *mg = msp->ms_group;
 
-	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
-	    -msp->ms_smo.smo_alloc, 0, B_TRUE);
+	vdev_space_update(mg->mg_vd,
+	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
 
 	metaslab_group_remove(mg, msp);
 
@@ -607,7 +657,7 @@
 		if (!sm->sm_loaded) {
 			int error = space_map_load(sm, sm_ops, SM_FREE,
 			    &msp->ms_smo,
-			    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
+			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
 			if (error) {
 				metaslab_group_sort(msp->ms_group, msp, 0);
 				return (error);
@@ -641,7 +691,9 @@
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
+#if 0
 	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+#endif
 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
@@ -654,7 +706,7 @@
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
-	objset_t *mos = spa->spa_meta_objset;
+	objset_t *mos = spa_meta_objset(spa);
 	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
 	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
@@ -780,14 +832,13 @@
 			space_map_create(&msp->ms_defermap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 
-		vdev_space_update(vd, sm->sm_size, 0, 0, B_TRUE);
+		vdev_space_update(vd, 0, 0, sm->sm_size);
 	}
 
 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
 	defer_delta = freed_map->sm_space - defer_map->sm_space;
 
-	vdev_space_update(vd, 0, alloc_delta + defer_delta,
-	    defer_delta, B_TRUE);
+	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
 	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
 	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -827,7 +878,7 @@
 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
 				evictable = 0;
 
-		if (evictable)
+		if (evictable && !metaslab_debug)
 			space_map_unload(sm);
 	}
 
@@ -973,7 +1024,7 @@
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
-	 * Note that there's no locking on mc_rotor or mc_allocated because
+	 * Note that there's no locking on mc_rotor or mc_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
@@ -1071,32 +1122,28 @@
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 */
-			if (mc->mc_allocated == 0) {
+			if (mc->mc_aliquot == 0) {
 				vdev_stat_t *vs = &vd->vdev_stat;
-				uint64_t alloc, space;
-				int64_t vu, su;
-
-				alloc = spa_get_alloc(spa);
-				space = spa_get_space(spa);
+				int64_t vu, cu;
 
 				/*
 				 * Determine percent used in units of 0..1024.
 				 * (This is just to avoid floating point.)
 				 */
 				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-				su = (alloc << 10) / (space + 1);
+				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
 
 				/*
 				 * Bias by at most +/- 25% of the aliquot.
 				 */
-				mg->mg_bias = ((su - vu) *
+				mg->mg_bias = ((cu - vu) *
 				    (int64_t)mg->mg_aliquot) / (1024 * 4);
 			}
 
-			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mc->mc_rotor = mg->mg_next;
-				mc->mc_allocated = 0;
+				mc->mc_aliquot = 0;
 			}
 
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -1108,7 +1155,7 @@
 		}
 next:
 		mc->mc_rotor = mg->mg_next;
-		mc->mc_allocated = 0;
+		mc->mc_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
 	if (!all_zero) {
@@ -1188,7 +1235,7 @@
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd;
 	metaslab_t *msp;
-	int error;
+	int error = 0;
 
 	ASSERT(DVA_IS_VALID(dva));
 
@@ -1203,7 +1250,12 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+
+	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+		error = ENOENT;
+
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
@@ -1231,6 +1283,7 @@
 	int error = 0;
 
 	ASSERT(bp->blk_birth == 0);
+	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
@@ -1260,7 +1313,7 @@
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
-	bp->blk_birth = txg;
+	BP_SET_BIRTH(bp, txg, txg);
 
 	return (0);
 }
@@ -1272,7 +1325,7 @@
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sha256.c
--- a/usr/src/uts/common/fs/zfs/sha256.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sha256.c	Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 
 /*
  * SHA-256 checksum, as specified in FIPS 180-3, available at:
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/spa.c
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/spa.c	Sun Nov 01 14:14:46 2009 -0800
@@ -35,11 +35,11 @@
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
+#include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -154,8 +154,8 @@
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	if (spa->spa_root_vdev != NULL) {
-		size = spa_get_space(spa);
-		used = spa_get_alloc(spa);
+		used = metaslab_class_get_alloc(spa_normal_class(spa));
+		size = metaslab_class_get_space(spa_normal_class(spa));
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
@@ -165,6 +165,9 @@
 		cap = (size == 0) ? 0 : (used * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
+		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+		    ddt_get_pool_dedup_ratio(spa), src);
+
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    spa->spa_root_vdev->vdev_state, src);
 
@@ -199,9 +202,9 @@
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
+	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
-	objset_t *mos = spa->spa_meta_objset;
 	int err;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -436,6 +439,16 @@
 			    strcmp(slash, "/..") == 0)
 				error = EINVAL;
 			break;
+
+		case ZPOOL_PROP_DEDUPDITTO:
+			if (spa_version(spa) < SPA_VERSION_DEDUP)
+				error = ENOTSUP;
+			else
+				error = nvpair_value_uint64(elem, &intval);
+			if (error == 0 &&
+			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
+				error = EINVAL;
+			break;
 		}
 
 		if (error)
@@ -771,6 +784,8 @@
 		spa->spa_dsl_pool = NULL;
 	}
 
+	ddt_unload(spa);
+
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
@@ -1145,12 +1160,24 @@
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
-	int i;
-
-	for (i = 0; i < sav->sav_count; i++)
+	for (int i = 0; i < sav->sav_count; i++)
 		spa_check_removed(sav->sav_vdevs[i]);
 }
 
+void
+spa_claim_notify(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+
+	if (zio->io_error)
+		return;
+
+	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
+	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+	mutex_exit(&spa->spa_props_lock);
+}
+
 typedef struct spa_load_error {
 	uint64_t	sle_metadata_count;
 	uint64_t	sle_data_count;
@@ -1176,8 +1203,8 @@
 
 /*ARGSUSED*/
 static int
-spa_load_verify_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	if (bp != NULL) {
 		zio_t *rio = arg;
@@ -1380,6 +1407,8 @@
 	    TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE;
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+	spa->spa_claim_max_txg = spa->spa_first_txg;
+
 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1448,7 +1477,7 @@
 
 	if (zap_lookup(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+	    sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj) != 0) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		error = EIO;
@@ -1562,20 +1591,6 @@
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	}
 
-	VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
-	    &nvroot) == 0);
-	spa_load_log_state(spa, nvroot);
-	nvlist_free(nvconfig);
-
-	if (spa_check_logs(spa)) {
-		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-		    VDEV_AUX_BAD_LOG);
-		error = ENXIO;
-		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
-		goto out;
-	}
-
-
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -1610,6 +1625,10 @@
 		    spa->spa_pool_props_object,
 		    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
 		    sizeof (uint64_t), 1, &spa->spa_autoexpand);
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO),
+		    sizeof (uint64_t), 1, &spa->spa_dedup_ditto);
 	}
 
 	/*
@@ -1653,6 +1672,17 @@
 		goto out;
 	}
 
+	/*
+	 * Load the DDTs (dedup tables).
+	 */
+	error = ddt_load(spa);
+	if (error != 0) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
 	if (state != SPA_LOAD_TRYIMPORT) {
 		error = spa_load_verify(spa);
 		if (error) {
@@ -1662,6 +1692,22 @@
 		}
 	}
 
+	/*
+	 * Load the intent log state and check log integrity.
+	 */
+	VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	spa_load_log_state(spa, nvroot);
+	nvlist_free(nvconfig);
+
+	if (spa_check_logs(spa)) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_BAD_LOG);
+		error = ENXIO;
+		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+		goto out;
+	}
+
 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
@@ -1672,22 +1718,32 @@
 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
+		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+		 * invoked from zil_claim_log_block()'s i/o done callback.
 		 * Price of rollback is that we abandon the log.
 		 */
+		spa->spa_claiming = B_TRUE;
+
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		(void) dmu_objset_find(spa_name(spa),
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
+		spa->spa_claiming = B_FALSE;
+
 		spa->spa_log_state = SPA_LOG_GOOD;
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 
 		/*
-		 * Wait for all claims to sync.
+		 * Wait for all claims to sync.  We sync up to the highest
+		 * claimed log block birth time so that claimed log blocks
+		 * don't appear to be from the future.  spa_claim_max_txg
+		 * will have been set for us by either zil_check_log_chain()
+		 * (invoked from spa_check_logs()) or zil_claim() above.
 		 */
-		txg_wait_synced(spa->spa_dsl_pool, 0);
+		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * If the config cache is stale, or we have uninitialized
@@ -2350,8 +2406,6 @@
 	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
-	spa->spa_uberblock.ub_txg = txg - 1;
-
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
@@ -2363,6 +2417,9 @@
 	    &version) != 0)
 		version = SPA_VERSION;
 	ASSERT(version <= SPA_VERSION);
+
+	spa->spa_first_txg = txg;
+	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 
@@ -2468,14 +2525,14 @@
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
-	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+	spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset,
 	    1 << 14, tx);
-	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
-	    ZIO_COMPRESS_OFF, tx);
+	dmu_object_set_compress(spa->spa_meta_objset,
+	    spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+	    sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bplist");
 	}
 
@@ -2492,11 +2549,17 @@
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(spa, props, CRED(), tx);
 	}
 
+	/*
+	 * Create DDTs (dedup tables).
+	 */
+	ddt_create(spa);
+
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
@@ -3732,6 +3795,7 @@
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Remove our vdev from the allocatable vdevs
@@ -3751,6 +3815,7 @@
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Evacuate the device.  We don't hold the config lock as writer
@@ -3799,8 +3864,15 @@
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+	ASSERT(vd == vd->vdev_top);
 
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+	if (list_link_active(&vd->vdev_state_dirty_node))
+		vdev_state_clean(vd);
+	if (list_link_active(&vd->vdev_config_dirty_node))
+		vdev_config_clean(vd);
+
 	vdev_free(vd);
 
 	/*
@@ -3873,6 +3945,7 @@
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		ASSERT(!locked);
+		ASSERT(vd == vd->vdev_top);
 
 		/*
 		 * XXX - Once we have bp-rewrite this should
@@ -3887,7 +3960,13 @@
 		 */
 		spa_vdev_remove_start(spa, vd);
 
-		spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+		/*
+		 * Wait for the youngest allocations and frees to sync,
+		 * and then wait for the deferral of those frees to finish.
+		 */
+		spa_vdev_config_exit(spa, NULL,
+		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
 		if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
 			return (error);
 		txg = spa_vdev_config_enter(spa);
@@ -4165,24 +4244,23 @@
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
-		uint64_t oldsz, space_update;
+		uint64_t old_space, new_space;
 
 		mutex_enter(&spa_namespace_lock);
-		oldsz = spa_get_space(spa);
+		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
-		space_update = spa_get_space(spa) - oldsz;
+		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
-		if (space_update) {
+		if (new_space != old_space) {
 			spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
 			    spa, NULL, CRED(),
 			    "pool '%s' size: %llu(+%llu)",
-			    spa_name(spa), spa_get_space(spa),
-			    space_update);
+			    spa_name(spa), new_space, new_space - old_space);
 		}
 	}
 
@@ -4280,38 +4358,34 @@
  * SPA syncing routines
  * ==========================================================================
  */
-
 static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
 {
-	bplist_t *bpl = &spa->spa_sync_bplist;
-	dmu_tx_t *tx;
 	blkptr_t blk;
 	uint64_t itor = 0;
-	zio_t *zio;
-	int error;
 	uint8_t c = 1;
 
-	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
 	while (bplist_iterate(bpl, &itor, &blk) == 0) {
 		ASSERT(blk.blk_birth < txg);
-		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
-		    ZIO_FLAG_MUSTSUCCEED));
+		zio_free(spa, txg, &blk);
 	}
 
-	error = zio_wait(zio);
-	ASSERT3U(error, ==, 0);
-
-	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	bplist_vacate(bpl, tx);
 
 	/*
 	 * Pre-dirty the first block so we sync to convergence faster.
 	 * (Usually only the first block is needed.)
 	 */
-	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
-	dmu_tx_commit(tx);
+	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+}
+
+static void
+spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zio_t *zio = arg;
+
+	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+	    zio->io_flags));
 }
 
 static void
@@ -4469,8 +4543,6 @@
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
-				objset_t *mos = spa->spa_meta_objset;
-
 				VERIFY((spa->spa_pool_props_object =
 				    zap_create(mos, DMU_OT_POOL_PROPS,
 				    DMU_OT_NONE, 0, tx)) > 0);
@@ -4521,6 +4593,9 @@
 				spa->spa_autoexpand = intval;
 				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 				break;
+			case ZPOOL_PROP_DEDUPDITTO:
+				spa->spa_dedup_ditto = intval;
+				break;
 			default:
 				break;
 			}
@@ -4547,11 +4622,11 @@
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
-	bplist_t *bpl = &spa->spa_sync_bplist;
+	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
+	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
-	int dirty_vdevs;
 	int error;
 
 	/*
@@ -4586,7 +4661,7 @@
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
+	VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
@@ -4632,13 +4707,13 @@
 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
 	    !txg_list_empty(&dp->dp_sync_tasks, txg))
-		spa_sync_deferred_frees(spa, txg);
+		spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
 
 	/*
 	 * Iterate to convergence.
 	 */
 	do {
-		spa->spa_sync_pass++;
+		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
@@ -4648,18 +4723,24 @@
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
-		dirty_vdevs = 0;
-		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
-			vdev_sync(vd, txg);
-			dirty_vdevs++;
+		if (pass <= SYNC_PASS_DEFERRED_FREE) {
+			zio_t *zio = zio_root(spa, NULL, NULL, 0);
+			bplist_sync(free_bpl, spa_sync_free, zio, tx);
+			VERIFY(zio_wait(zio) == 0);
+		} else {
+			bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
 		}
 
-		bplist_sync(bpl, tx);
-	} while (dirty_vdevs);
-
-	bplist_close(bpl);
-
-	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+		ddt_sync(spa, txg);
+
+		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+			vdev_sync(vd, txg);
+
+	} while (dmu_objset_is_dirty(mos, txg));
+
+	ASSERT(free_bpl->bpl_queue == NULL);
+
+	bplist_close(defer_bpl);
 
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
@@ -4730,10 +4811,7 @@
 
 	spa->spa_ubsync = spa->spa_uberblock;
 
-	/*
-	 * Clean up the ZIL records for the synced txg.
-	 */
-	dsl_pool_zil_clean(dp);
+	dsl_pool_sync_done(dp, txg);
 
 	/*
 	 * Update usable space statistics.
@@ -4748,7 +4826,10 @@
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
-	ASSERT(bpl->bpl_queue == NULL);
+	ASSERT(defer_bpl->bpl_queue == NULL);
+	ASSERT(free_bpl->bpl_queue == NULL);
+
+	spa->spa_sync_pass = 0;
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/spa_history.c
--- a/usr/src/uts/common/fs/zfs/spa_history.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/spa_history.c	Sun Nov 01 14:14:46 2009 -0800
@@ -103,7 +103,8 @@
 	 * Figure out maximum size of history log.  We set it at
 	 * 1% of pool size, with a max of 32MB and min of 128KB.
 	 */
-	shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+	shpp->sh_phys_max_off =
+	    metaslab_class_get_dspace(spa_normal_class(spa)) / 100;
 	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
 	shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/spa_misc.c
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Sun Nov 01 14:14:46 2009 -0800
@@ -186,7 +186,7 @@
  *
  * SCL_VDEV
  *	Held as reader to prevent changes to the vdev tree during trivial
- *	inquiries such as bp_get_dasize().  SCL_VDEV is distinct from the
+ *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
  *	other locks, and lower than all of them, to ensure that it's safe
  *	to acquire regardless of caller context.
  *
@@ -433,7 +433,6 @@
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -441,6 +440,10 @@
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
+	for (int t = 0; t < TXG_SIZE; t++)
+		bplist_init(&spa->spa_free_bplist[t]);
+	bplist_init(&spa->spa_deferred_bplist);
+
 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 	spa->spa_freeze_txg = UINT64_MAX;
@@ -514,6 +517,10 @@
 
 	spa_config_lock_destroy(spa);
 
+	for (int t = 0; t < TXG_SIZE; t++)
+		bplist_fini(&spa->spa_free_bplist[t]);
+	bplist_fini(&spa->spa_deferred_bplist);
+
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
@@ -522,7 +529,6 @@
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
-	mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_props_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
@@ -819,12 +825,6 @@
 	mutex_exit(&spa_l2cache_lock);
 }
 
-void
-spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
-{
-	vdev_space_update(vd, space, alloc, 0, B_FALSE);
-}
-
 /*
  * ==========================================================================
  * SPA vdev locking
@@ -890,8 +890,8 @@
 	/*
 	 * Verify the metaslab classes.
 	 */
-	ASSERT(metaslab_class_validate(spa->spa_normal_class) == 0);
-	ASSERT(metaslab_class_validate(spa->spa_log_class) == 0);
+	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
 
 	spa_config_exit(spa, SCL_ALL, spa);
 
@@ -955,6 +955,10 @@
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
+	if (vd != NULL || error == 0)
+		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
+		    0, 0, B_FALSE);
+
 	if (vd != NULL) {
 		vdev_state_dirty(vd->vdev_top);
 		spa->spa_config_generation++;
@@ -1105,50 +1109,13 @@
 }
 
 void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+sprintf_blkptr(char *buf, const blkptr_t *bp)
 {
-	int d;
-
-	if (bp == NULL) {
-		(void) snprintf(buf, len, "<NULL>");
-		return;
-	}
-
-	if (BP_IS_HOLE(bp)) {
-		(void) snprintf(buf, len, "<hole>");
-		return;
-	}
-
-	(void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
-	    (u_longlong_t)BP_GET_LEVEL(bp),
-	    BP_GET_TYPE(bp) < DMU_OT_NUMTYPES ?
-	    dmu_ot[BP_GET_TYPE(bp)].ot_name : "UNKNOWN",
-	    (u_longlong_t)BP_GET_LSIZE(bp),
-	    (u_longlong_t)BP_GET_PSIZE(bp));
+	char *type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+	char *checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+	char *compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
 
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		const dva_t *dva = &bp->blk_dva[d];
-		(void) snprintf(buf + strlen(buf), len - strlen(buf),
-		    "DVA[%d]=<%llu:%llx:%llx> ", d,
-		    (u_longlong_t)DVA_GET_VDEV(dva),
-		    (u_longlong_t)DVA_GET_OFFSET(dva),
-		    (u_longlong_t)DVA_GET_ASIZE(dva));
-	}
-
-	(void) snprintf(buf + strlen(buf), len - strlen(buf),
-	    "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
-	    BP_GET_CHECKSUM(bp) < ZIO_CHECKSUM_FUNCTIONS ?
-	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name : "UNKNOWN",
-	    BP_GET_COMPRESS(bp) < ZIO_COMPRESS_FUNCTIONS ?
-	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name : "UNKNOWN",
-	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
-	    BP_IS_GANG(bp) ? "gang" : "contiguous",
-	    (u_longlong_t)bp->blk_birth,
-	    (u_longlong_t)bp->blk_fill,
-	    (u_longlong_t)bp->blk_cksum.zc_word[0],
-	    (u_longlong_t)bp->blk_cksum.zc_word[1],
-	    (u_longlong_t)bp->blk_cksum.zc_word[2],
-	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+	SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
 }
 
 void
@@ -1254,6 +1221,12 @@
 	return (spa->spa_first_txg);
 }
 
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+	return (spa->spa_syncing_txg);
+}
+
 pool_state_t
 spa_state(spa_t *spa)
 {
@@ -1266,56 +1239,18 @@
 	return (spa->spa_freeze_txg);
 }
 
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
-uint64_t
-spa_get_alloc(spa_t *spa)
-{
-	return (spa->spa_root_vdev->vdev_stat.vs_alloc);
-}
-
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
-uint64_t
-spa_get_space(spa_t *spa)
-{
-	return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
-uint64_t
-spa_get_dspace(spa_t *spa)
-{
-	if (spa->spa_deflate)
-		return (spa->spa_root_vdev->vdev_stat.vs_dspace);
-	else
-		return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/*
- * Return the amount of space deferred from freeing (in in-core maps only)
- */
-uint64_t
-spa_get_defers(spa_t *spa)
-{
-	return (spa->spa_root_vdev->vdev_stat.vs_defer);
-}
-
 /* ARGSUSED */
 uint64_t
 spa_get_asize(spa_t *spa, uint64_t lsize)
 {
 	/*
-	 * For now, the worst case is 512-byte RAID-Z blocks, in which
-	 * case the space requirement is exactly 2x; so just assume that.
-	 * Add to this the fact that we can have up to 3 DVAs per bp, and
-	 * we have to multiply by a total of 6x.
+	 * The worst case is single-sector max-parity RAID-Z blocks, in which
+	 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+	 * times the size; so just assume that.  Add to this the fact that
+	 * we can have up to 3 DVAs per bp, and one more factor of 2 because
+	 * the block may be dittoed with up to 3 DVAs by ddt_sync().
 	 */
-	return (lsize * 6);
+	return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
 }
 
 /*
@@ -1340,6 +1275,24 @@
 	return (spa->spa_ubsync.ub_version);
 }
 
+boolean_t
+spa_deflate(spa_t *spa)
+{
+	return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+	return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+	return (spa->spa_log_class);
+}
+
 int
 spa_max_replication(spa_t *spa)
 {
@@ -1354,23 +1307,45 @@
 }
 
 uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
-	int sz = 0, i;
+	uint64_t asize = DVA_GET_ASIZE(dva);
+	uint64_t dsize = asize;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	if (asize != 0 && spa->spa_deflate) {
+		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+	}
+
+	return (dsize);
+}
 
-	if (!spa->spa_deflate)
-		return (BP_GET_ASIZE(bp));
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+	uint64_t dsize = 0;
+
+	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+	return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+	uint64_t dsize = 0;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
-		vdev_t *vd =
-		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
-		if (vd)
-			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
-			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
-	}
+
+	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
 	spa_config_exit(spa, SCL_VDEV, FTAG);
-	return (sz);
+
+	return (dsize);
 }
 
 /*
@@ -1472,9 +1447,18 @@
 	return (spa->spa_log_class->mc_rotor != NULL);
 }
 
-/*
- * Return whether this pool is the root pool.
- */
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+	return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+	spa->spa_log_state = state;
+}
+
 boolean_t
 spa_is_root(spa_t *spa)
 {
@@ -1492,3 +1476,27 @@
 {
 	return (spa->spa_mode);
 }
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+	return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+	return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+	return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+	return (spa->spa_dedup_checksum);
+}
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/space_map.c
--- a/usr/src/uts/common/fs/zfs/space_map.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/space_map.c	Sun Nov 01 14:14:46 2009 -0800
@@ -258,8 +258,10 @@
 {
 	ASSERT(MUTEX_HELD(sm->sm_lock));
 
-	while (sm->sm_loading)
+	while (sm->sm_loading) {
+		ASSERT(!sm->sm_loaded);
 		cv_wait(&sm->sm_load_cv, sm->sm_lock);
+	}
 }
 
 /*
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/arc.h
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Sun Nov 01 14:14:46 2009 -0800
@@ -99,27 +99,17 @@
 int arc_referenced(arc_buf_t *buf);
 #endif
 
-typedef struct writeprops {
-	dmu_object_type_t wp_type;
-	uint8_t wp_level;
-	uint8_t wp_copies;
-	uint8_t wp_dncompress, wp_oscompress;
-	uint8_t wp_dnchecksum, wp_oschecksum;
-} writeprops_t;
-
-void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp);
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t *arc_flags, const zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb);
+void arc_free(spa_t *spa, const blkptr_t *bp);
 
 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
 int arc_buf_evict(arc_buf_t *buf);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/bplist.h
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h	Sun Nov 01 14:14:46 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,6 +29,7 @@
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
+#include <sys/zio.h>
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
@@ -67,6 +68,10 @@
 	dmu_buf_t	*bpl_cached_dbuf;
 } bplist_t;
 
+typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+extern void bplist_init(bplist_t *bpl);
+extern void bplist_fini(bplist_t *bpl);
 extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
 extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
 extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
@@ -74,13 +79,15 @@
 extern boolean_t bplist_empty(bplist_t *bpl);
 extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
 extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx);
 extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func,
+    void *arg, dmu_tx_t *tx);
 extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
 extern int bplist_space(bplist_t *bpl,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 extern int bplist_space_birthrange(bplist_t *bpl,
-    uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep);
+    uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep);
 
 #ifdef	__cplusplus
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/dbuf.h
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Sun Nov 01 14:14:46 2009 -0800
@@ -133,6 +133,7 @@
 			arc_buf_t *dr_data;
 			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
+			uint8_t dr_copies;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;
@@ -254,6 +255,7 @@
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
 dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
 
@@ -323,7 +325,7 @@
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
+	sprintf_blkptr(__blkbuf, bp);				\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} 							\
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/ddt.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h	Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,239 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DDT_H
+#define	_SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+	DDT_TYPE_ZAP = 0,
+	DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+	DDT_CLASS_DITTO = 0,
+	DDT_CLASS_DUPLICATE,
+	DDT_CLASS_UNIQUE,
+	DDT_CLASSES
+};
+
+#define	DDT_TYPE_CURRENT		0
+
+#define	DDT_COMPRESS_BYTEORDER_MASK	0x80
+#define	DDT_COMPRESS_FUNCTION_MASK	0x7f
+
+/*
+ * DDT statistics.
+ */
+typedef struct ddt_stat {
+	uint64_t	dds_blocks;	/* blocks			*/
+	uint64_t	dds_lsize;	/* logical size			*/
+	uint64_t	dds_psize;	/* physical size		*/
+	uint64_t	dds_dsize;	/* deflated allocated size	*/
+	uint64_t	dds_ref_blocks;	/* referenced blocks		*/
+	uint64_t	dds_ref_lsize;	/* referenced lsize * refcnt	*/
+	uint64_t	dds_ref_psize;	/* referenced psize * refcnt	*/
+	uint64_t	dds_ref_dsize;	/* referenced dsize * refcnt	*/
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+	ddt_stat_t	ddh_stat[64];	/* power-of-two histogram buckets */
+} ddt_histogram_t;
+
+/*
+ * On-disk ddt entry:  key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+	zio_cksum_t	ddk_cksum;	/* 256-bit block checksum */
+	uint64_t	ddk_prop;	/* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *	|   0	|   0	|   0	| comp	|     PSIZE	|     LSIZE	|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define	DDK_GET_LSIZE(ddk)	\
+	BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define	DDK_SET_LSIZE(ddk, x)	\
+	BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define	DDK_GET_PSIZE(ddk)	\
+	BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define	DDK_SET_PSIZE(ddk, x)	\
+	BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define	DDK_GET_COMPRESS(ddk)		BF64_GET((ddk)->ddk_prop, 32, 8)
+#define	DDK_SET_COMPRESS(ddk, x)	BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define	DDT_KEY_WORDS	(sizeof (ddt_key_t) / sizeof (uint64_t))
+
+typedef struct ddt_phys {
+	dva_t		ddp_dva[SPA_DVAS_PER_BP];
+	uint64_t	ddp_refcnt;
+	uint64_t	ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+	DDT_PHYS_DITTO = 0,
+	DDT_PHYS_SINGLE = 1,
+	DDT_PHYS_DOUBLE = 2,
+	DDT_PHYS_TRIPLE = 3,
+	DDT_PHYS_TYPES
+} ddt_phys_type_t;
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+	ddt_key_t	dde_key;
+	ddt_phys_t	dde_phys[DDT_PHYS_TYPES];
+	zio_t		*dde_lead_zio[DDT_PHYS_TYPES];
+	void		*dde_repair_data;
+	enum ddt_type	dde_type;
+	enum ddt_class	dde_class;
+	uint8_t		dde_loading;
+	uint8_t		dde_loaded;
+	kcondvar_t	dde_cv;
+	avl_node_t	dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+	kmutex_t	ddt_lock;
+	avl_tree_t	ddt_tree;
+	avl_tree_t	ddt_repair_tree;
+	enum zio_checksum ddt_checksum;
+	spa_t		*ddt_spa;
+	objset_t	*ddt_os;
+	uint64_t	ddt_stat_object;
+	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];
+	ddt_histogram_t	ddt_histogram[DDT_TYPES][DDT_CLASSES];
+	avl_node_t	ddt_node;
+};
+
+typedef struct ddt_ops {
+	char ddt_op_name[32];
+	int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+	    boolean_t prehash);
+	int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+	int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+	int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+	    dmu_tx_t *tx);
+	int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+	    dmu_tx_t *tx);
+	int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+	    uint64_t *walk);
+	uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define	DDT_NAMELEN	80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, ddt_entry_t *dde, uint64_t *walk);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+    uint64_t txg);
+extern void ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk,
+    const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+    uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+    ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern ddt_t *ddt_select_by_checksum(spa_t *spa, enum zio_checksum c);
+
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DDT_H */
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/dmu.h
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Sun Nov 01 14:14:46 2009 -0800
@@ -61,6 +61,7 @@
 struct spa;
 struct nvlist;
 struct arc_buf;
+struct zio_prop;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
@@ -118,6 +119,8 @@
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
+	DMU_OT_DDT_ZAP,			/* ZAP */
+	DMU_OT_DDT_STATS,		/* ZAP */
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -152,6 +155,7 @@
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
+#define	DMU_DEADLIST_OBJECT	(-3ULL)
 
 /*
  * Public routines to create, destroy, open, and close objsets.
@@ -202,6 +206,8 @@
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
+#define	DMU_POOL_DDT			"DDT-%s-%s-%s"
+#define	DMU_POOL_DDT_STATS		"DDT-statistics"
 
 /* 4x8 zbookmark_t */
 #define	DMU_POOL_SCRUB_BOOKMARK		"scrub_bookmark"
@@ -299,11 +305,13 @@
     dmu_tx_t *tx);
 
 /*
- * Decide how many copies of a given block we should make.  Can be from
- * 1 to SPA_DVAS_PER_BP.
+ * Decide how to write a block: checksum, compression, number of copies, etc.
  */
-int dmu_get_replication_level(objset_t *os, struct zbookmark *zb,
-    dmu_object_type_t ot);
+#define	WP_NOFILL	0x1
+#define	WP_DMU_SYNC	0x2
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+    struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -498,19 +506,19 @@
     uint64_t len);
 
 typedef struct dmu_object_info {
-	/* All sizes are in bytes. */
+	/* All sizes are in bytes unless otherwise indicated. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
-	uint64_t doi_bonus_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
+	uint64_t doi_bonus_size;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_pad[5];
-	/* Values below are number of 512-byte blocks. */
-	uint64_t doi_physical_blks;		/* data + metadata */
-	uint64_t doi_max_block_offset;
+	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
+	uint64_t doi_max_offset;
+	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void arc_byteswap_func_t(void *buf, size_t size);
@@ -623,9 +631,20 @@
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
-    struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+	struct zilog	*zgd_zilog;
+	struct blkptr	*zgd_bp;
+	dmu_buf_t	*zgd_db;
+	struct rl	*zgd_rl;
+	void		*zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in file starting at *off
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/dmu_objset.h
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Sun Nov 01 14:14:46 2009 -0800
@@ -67,12 +67,16 @@
 	dnode_t *os_userused_dnode;
 	dnode_t *os_groupused_dnode;
 	zilog_t *os_zil;
-	uint8_t os_checksum;	/* can change, under dsl_dir's locks */
-	uint8_t os_compress;	/* can change, under dsl_dir's locks */
-	uint8_t os_copies;	/* can change, under dsl_dir's locks */
-	uint8_t os_primary_cache;	/* can change, under dsl_dir's locks */
-	uint8_t os_secondary_cache;	/* can change, under dsl_dir's locks */
-	uint8_t os_logbias;	/* can change, under dsl_dir's locks */
+
+	/* can change, under dsl_dir's locks: */
+	uint8_t os_checksum;
+	uint8_t os_compress;
+	uint8_t os_copies;
+	uint8_t os_dedup_checksum;
+	uint8_t os_dedup_verify;
+	uint8_t os_logbias;
+	uint8_t os_primary_cache;
+	uint8_t os_secondary_cache;
 
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
@@ -97,6 +101,7 @@
 	void *os_user_ptr;
 };
 
+#define	DMU_META_OBJSET		0
 #define	DMU_META_DNODE_OBJECT	0
 #define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
 
@@ -135,6 +140,7 @@
 
 /* called from dsl */
 void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
 objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Sun Nov 01 14:14:46 2009 -0800
@@ -36,8 +36,9 @@
 
 struct dnode_phys;
 struct dsl_dataset;
+struct zilog;
 
-typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
 
 #define	TRAVERSE_PRE			(1<<0)
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Sun Nov 01 14:14:46 2009 -0800
@@ -213,9 +213,10 @@
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+    dmu_tx_t *tx, boolean_t async);
 boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/dsl_pool.h
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Sun Nov 01 14:14:46 2009 -0800
@@ -102,6 +102,7 @@
 	uint64_t dp_scrub_start_time;
 	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
 	boolean_t dp_scrub_restart;
+	boolean_t dp_scrub_ditto;
 
 	/* Has its own locking */
 	tx_state_t dp_tx;
@@ -124,7 +125,7 @@
 void dsl_pool_close(dsl_pool_t *dp);
 dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
 void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
 int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
 uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
@@ -132,8 +133,7 @@
 void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_memory_pressure(dsl_pool_t *dp);
 void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
 void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/metaslab.h
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sun Nov 01 14:14:46 2009 -0800
@@ -36,9 +36,6 @@
 extern "C" {
 #endif
 
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
 extern space_map_ops_t *zfs_metaslab_ops;
 
 extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
@@ -64,6 +61,14 @@
 extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
 extern int metaslab_class_validate(metaslab_class_t *mc);
 
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+    int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+
 extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
     vdev_t *vd);
 extern void metaslab_group_destroy(metaslab_group_t *mg);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -39,8 +39,12 @@
 struct metaslab_class {
 	spa_t			*mc_spa;
 	metaslab_group_t	*mc_rotor;
-	uint64_t		mc_allocated;
 	space_map_ops_t		*mc_ops;
+	uint64_t		mc_aliquot;
+	uint64_t		mc_alloc;	/* total allocated space */
+	uint64_t		mc_deferred;	/* total deferred frees */
+	uint64_t		mc_space;	/* total space (alloc + free) */
+	uint64_t		mc_dspace;	/* total deflated space */
 };
 
 struct metaslab_group {
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/spa.h
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Sun Nov 01 14:14:46 2009 -0800
@@ -43,8 +43,13 @@
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
 struct dsl_pool;
 
 /*
@@ -134,15 +139,15 @@
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 6	|E| lvl | type	| cksum | comp	|     PSIZE	|     LSIZE	|
+ * 6	|BDX|lvl| type	| cksum | comp	|     PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 9	|			padding					|
+ * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * a	|			birth txg				|
+ * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -166,25 +171,29 @@
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
- * E		endianness
+ * B		byteorder (endianness)
+ * D		dedup
+ * X		unused
+ * lvl		level of indirection
  * type		DMU object type
- * lvl		level of indirection
- * birth txg	transaction group in which the block was born
+ * phys birth	txg of block allocation; zero if same as logical birth txg
+ * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
-typedef struct blkptr {
-	dva_t		blk_dva[3];	/* 128-bit Data Virtual Address	*/
-	uint64_t	blk_prop;	/* size, compression, type, etc	*/
-	uint64_t	blk_pad[3];	/* Extra space for the future	*/
-	uint64_t	blk_birth;	/* transaction group at birth	*/
-	uint64_t	blk_fill;	/* fill count			*/
-	zio_cksum_t	blk_cksum;	/* 256-bit checksum		*/
-} blkptr_t;
-
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 
+typedef struct blkptr {
+	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+	uint64_t	blk_prop;	/* size, compression, type, etc	    */
+	uint64_t	blk_pad[2];	/* Extra space for the future	    */
+	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
+	uint64_t	blk_birth;	/* transaction group at birth	    */
+	uint64_t	blk_fill;	/* fill count			    */
+	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
+} blkptr_t;
+
 /*
  * Macros to get and set fields in a bp or DVA.
  */
@@ -209,7 +218,6 @@
 
 #define	BP_GET_LSIZE(bp)	\
 	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-
 #define	BP_SET_LSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
 
@@ -218,20 +226,35 @@
 #define	BP_SET_PSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
 
-#define	BP_GET_COMPRESS(bp)	BF64_GET((bp)->blk_prop, 32, 8)
-#define	BP_SET_COMPRESS(bp, x)	BF64_SET((bp)->blk_prop, 32, 8, x)
+#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 8)
+#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define	BP_GET_CHECKSUM(bp)		BF64_GET((bp)->blk_prop, 40, 8)
+#define	BP_SET_CHECKSUM(bp, x)		BF64_SET((bp)->blk_prop, 40, 8, x)
 
-#define	BP_GET_CHECKSUM(bp)	BF64_GET((bp)->blk_prop, 40, 8)
-#define	BP_SET_CHECKSUM(bp, x)	BF64_SET((bp)->blk_prop, 40, 8, x)
+#define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
+#define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
+#define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define	BP_GET_PROP_BIT_61(bp)		BF64_GET((bp)->blk_prop, 61, 1)
+#define	BP_SET_PROP_BIT_61(bp, x)	BF64_SET((bp)->blk_prop, 61, 1, x)
 
-#define	BP_GET_TYPE(bp)		BF64_GET((bp)->blk_prop, 48, 8)
-#define	BP_SET_TYPE(bp, x)	BF64_SET((bp)->blk_prop, 48, 8, x)
+#define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
+#define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define	BP_GET_BYTEORDER(bp)		(0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
-#define	BP_GET_LEVEL(bp)	BF64_GET((bp)->blk_prop, 56, 5)
-#define	BP_SET_LEVEL(bp, x)	BF64_SET((bp)->blk_prop, 56, 5, x)
+#define	BP_PHYSICAL_BIRTH(bp)		\
+	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
-#define	BP_GET_BYTEORDER(bp)	(0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define	BP_SET_BYTEORDER(bp, x)	BF64_SET((bp)->blk_prop, 63, 1, x)
+#define	BP_SET_BIRTH(bp, logical, physical)	\
+{						\
+	(bp)->blk_birth = (logical);		\
+	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
 
 #define	BP_GET_ASIZE(bp)	\
 	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -255,6 +278,12 @@
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
+#define	BP_EQUAL(bp1, bp2)	\
+	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
 #define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
 	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
 	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
@@ -274,7 +303,6 @@
 #define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
-#define	BP_IS_OLDER(bp, txg)	(!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
 
 #define	BP_ZERO(bp)				\
 {						\
@@ -287,14 +315,12 @@
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
-	(bp)->blk_pad[2] = 0;			\
+	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
-#define	BLK_FILL_ALREADY_FREED	(-1ULL)
-
 /*
  * Note: the byteorder is either 0 or -1, both of which are palindromes.
  * This simplifies the endianness handling a bit.
@@ -309,14 +335,71 @@
 
 #define	BP_SPRINTF_LEN	320
 
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define	SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)	\
+{									\
+	static const char *copyname[] =					\
+	    { "zero", "single", "double", "triple" };			\
+	int size = BP_SPRINTF_LEN;					\
+	int len = 0;							\
+	int copies = 0;							\
+									\
+	if (bp == NULL) {						\
+		len = func(buf + len, size - len, "<NULL>");		\
+	} else if (BP_IS_HOLE(bp)) {					\
+		len = func(buf + len, size - len, "<hole>");		\
+	} else {							\
+		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
+			const dva_t *dva = &bp->blk_dva[d];		\
+			if (DVA_IS_VALID(dva))				\
+				copies++;				\
+			len += func(buf + len, size - len,		\
+			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
+			    (u_longlong_t)DVA_GET_VDEV(dva),		\
+			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
+			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
+			    ws);					\
+		}							\
+		if (BP_IS_GANG(bp) &&					\
+		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
+		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
+			copies--;					\
+		len += func(buf + len, size - len,			\
+		    "[L%llu %s] %s %s %s %s %s %s%c"			\
+		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
+		    "cksum=%llx:%llx:%llx:%llx",			\
+		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+		    type,						\
+		    checksum,						\
+		    compress,						\
+		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
+		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
+		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
+		    copyname[copies],					\
+		    ws,							\
+		    (u_longlong_t)BP_GET_LSIZE(bp),			\
+		    (u_longlong_t)BP_GET_PSIZE(bp),			\
+		    (u_longlong_t)bp->blk_birth,			\
+		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
+		    (u_longlong_t)bp->blk_fill,				\
+		    ws,							\
+		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
+		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
+		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
+		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
+	}								\
+	ASSERT(len < size);						\
+}
+
 #include <sys/dmu.h>
 
 #define	BP_GET_BUFC_TYPE(bp)						\
 	(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
 	ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
@@ -370,7 +453,6 @@
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
-extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
 
 /* scrubbing */
 extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
@@ -379,6 +461,10 @@
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
+#define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
+#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
+#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
+
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
@@ -396,7 +482,6 @@
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
-extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
@@ -442,6 +527,20 @@
 extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
+/* Log state */
+typedef enum spa_log_state {
+	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
+	SPA_LOG_MISSING,	/* missing log(s) */
+	SPA_LOG_CLEAR,		/* clear the log(s) */
+	SPA_LOG_GOOD,		/* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
@@ -453,19 +552,23 @@
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
-extern uint64_t spa_get_defers(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern int spa_rename(const char *oldname, const char *newname);
@@ -473,14 +576,16 @@
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
@@ -516,10 +621,9 @@
 
 /* error handling */
 struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void spa_log_error(spa_t *spa, zio_t *zio);
 extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
-    struct zio *zio, uint64_t stateoroffset, uint64_t length);
+    zio_t *zio, uint64_t stateoroffset, uint64_t length);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
@@ -552,7 +656,7 @@
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { 			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));		\
+	sprintf_blkptr(__blkbuf, (bp));				\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/spa_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -78,13 +78,6 @@
 	char		*scd_path;
 } spa_config_dirent_t;
 
-typedef enum spa_log_state {
-	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
-	SPA_LOG_MISSING,	/* missing log(s) */
-	SPA_LOG_CLEAR,		/* clear the log(s) */
-	SPA_LOG_GOOD,		/* log(s) are good */
-} spa_log_state_t;
-
 enum zio_taskq_type {
 	ZIO_TASKQ_ISSUE = 0,
 	ZIO_TASKQ_INTERRUPT,
@@ -114,6 +107,7 @@
 	uint64_t	spa_final_txg;		/* txg of export/destroy */
 	uint64_t	spa_freeze_txg;		/* freeze pool at this txg */
 	uint64_t	spa_load_max_txg;	/* best initial ub_txg */
+	uint64_t	spa_claim_max_txg;	/* highest claimed birth txg */
 	objset_t	*spa_meta_objset;	/* copy of dp->dp_meta_objset */
 	txg_list_t	spa_vdev_txg_list;	/* per-txg dirty vdev list */
 	vdev_t		*spa_root_vdev;		/* top-level vdev container */
@@ -125,8 +119,9 @@
 	uint64_t	spa_config_object;	/* MOS object for pool config */
 	uint64_t	spa_config_generation;	/* config generation number */
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
-	uint64_t	spa_sync_bplist_obj;	/* object for deferred frees */
-	bplist_t	spa_sync_bplist;	/* deferred-free bplist */
+	uint64_t	spa_deferred_bplist_obj; /* object for deferred frees */
+	bplist_t	spa_deferred_bplist;	/* deferred-free bplist */
+	bplist_t	spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
@@ -177,11 +172,16 @@
 	kmutex_t	spa_suspend_lock;	/* protects suspend_zio_root */
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
+	uint8_t		spa_claiming;		/* pool is doing zil_claim() */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
 	int		spa_mode;		/* FREAD | FWRITE */
 	spa_log_state_t spa_log_state;		/* log state */
 	uint64_t	spa_autoexpand;		/* lun expansion on/off */
+	ddt_t		*spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+	uint64_t	spa_ddt_stat_object;	/* DDT statistics */
+	uint64_t	spa_dedup_ditto;	/* dedup ditto threshold */
+	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
 	/*
@@ -196,12 +196,6 @@
 
 extern const char *spa_config_path;
 
-#define	BOOTFS_COMPRESS_VALID(compress) \
-	((compress) == ZIO_COMPRESS_LZJB || \
-	((compress) == ZIO_COMPRESS_ON && \
-	ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
-	(compress) == ZIO_COMPRESS_OFF)
-
 #ifdef	__cplusplus
 }
 #endif
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/txg.h
--- a/usr/src/uts/common/fs/zfs/sys/txg.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h	Sun Nov 01 14:14:46 2009 -0800
@@ -73,8 +73,6 @@
 extern void txg_rele_to_quiesce(txg_handle_t *txghp);
 extern void txg_rele_to_sync(txg_handle_t *txghp);
 extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
 
 /*
  * Delay the caller by the specified number of ticks or until
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/txg_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -44,7 +44,6 @@
 typedef struct tx_state {
 	tx_cpu_t	*tx_cpu;	/* protects right to enter txg	*/
 	kmutex_t	tx_sync_lock;	/* protects tx_state_t */
-	krwlock_t	tx_suspend;
 	uint64_t	tx_open_txg;	/* currently open txg id */
 	uint64_t	tx_quiesced_txg; /* quiesced txg waiting for sync */
 	uint64_t	tx_syncing_txg;	/* currently syncing txg id */
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/uberblock.h
--- a/usr/src/uts/common/fs/zfs/sys/uberblock.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock.h	Sun Nov 01 14:14:46 2009 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,19 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_UBERBLOCK_H
 #define	_SYS_UBERBLOCK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/spa.h>
 #include <sys/vdev.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 
 #ifdef	__cplusplus
 extern "C" {
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/vdev.h
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Sun Nov 01 14:14:46 2009 -0800
@@ -84,8 +84,8 @@
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
-    int64_t alloc_delta, int64_t defer_delta, boolean_t update_root);
+extern void vdev_space_update(vdev_t *vd,
+    int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
 
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/vdev_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -198,6 +198,8 @@
 	kmutex_t	vdev_probe_lock; /* protects vdev_probe_zio	*/
 };
 
+#define	VDEV_RAIDZ_MAXPARITY	3
+
 #define	VDEV_PAD_SIZE		(8 << 10)
 /* 2 padding areas (vl_pad1 and vl_pad2) to skip */
 #define	VDEV_SKIP_SIZE		VDEV_PAD_SIZE * 2
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zap.h
--- a/usr/src/uts/common/fs/zfs/sys/zap.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h	Sun Nov 01 14:14:46 2009 -0800
@@ -101,6 +101,18 @@
 	MT_FIRST
 } matchtype_t;
 
+typedef enum zap_flags {
+	/* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+	ZAP_FLAG_HASH64 = 1 << 0,
+	/* Key is binary, not string (zap_add_uint64() can be used) */
+	ZAP_FLAG_UINT64_KEY = 1 << 1,
+	/*
+	 * First word of key (which must be an array of uint64) is
+	 * already randomly distributed.
+	 */
+	ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
 /*
  * Create a new zapobj with no attributes and return its object number.
  * MT_EXACT will cause the zap object to only support MT_EXACT lookups,
@@ -118,6 +130,9 @@
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 
 /*
  * Create a new zapobj with no attributes from the given (unallocated)
@@ -180,6 +195,8 @@
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
 
 int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
     int add, uint64_t *towrite, uint64_t *tooverwrite);
@@ -190,9 +207,12 @@
  * If an attribute with the given name already exists, the call will
  * fail and return EEXIST.
  */
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx);
 
 /*
  * Set the attribute with the given name to the given value.  If an
@@ -204,6 +224,9 @@
  */
 int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
 
 /*
  * Get the length (in integers) and the integer size of the specified
@@ -214,6 +237,8 @@
  */
 int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t *integer_size, uint64_t *num_integers);
 
 /*
  * Remove the specified attribute.
@@ -224,6 +249,8 @@
 int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
 int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx);
 
 /*
  * Returns (in *count) the number of attributes in the specified zap
@@ -266,6 +293,7 @@
 	struct zap *zc_zap;
 	struct zap_leaf *zc_leaf;
 	uint64_t zc_zapobj;
+	uint64_t zc_serialized;
 	uint64_t zc_hash;
 	uint32_t zc_cd;
 } zap_cursor_t;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zap_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -40,13 +40,13 @@
 
 #define	FZAP_BLOCK_SHIFT(zap)	((zap)->zap_f.zap_block_shift)
 
-#define	ZAP_MAXCD		(uint32_t)(-1)
-#define	ZAP_HASHBITS		28
 #define	MZAP_ENT_LEN		64
 #define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
 #define	MZAP_MAX_BLKSHIFT	SPA_MAXBLOCKSHIFT
 #define	MZAP_MAX_BLKSZ		(1 << MZAP_MAX_BLKSHIFT)
 
+#define	ZAP_NEED_CD		(-1U)
+
 typedef struct mzap_ent_phys {
 	uint64_t mze_value;
 	uint32_t mze_cd;
@@ -70,7 +70,6 @@
 	mzap_ent_phys_t mze_phys;
 } mzap_ent_t;
 
-
 /*
  * The (fat) zap is stored in one object. It is an array of
  * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
@@ -127,6 +126,7 @@
 	uint64_t zap_num_entries;	/* number of entries */
 	uint64_t zap_salt;		/* salt to stir into hash function */
 	uint64_t zap_normflags;		/* flags for u8_textprep_str() */
+	uint64_t zap_flags;		/* zap_flags_t */
 	/*
 	 * This structure is followed by padding, and then the embedded
 	 * pointer table.  The embedded pointer table takes up second
@@ -168,10 +168,13 @@
 
 typedef struct zap_name {
 	zap_t *zn_zap;
-	const char *zn_name_orij;
+	int zn_key_intlen;
+	const void *zn_key_orig;
+	int zn_key_orig_len;
+	const void *zn_key_norm;
+	int zn_key_norm_len;
 	uint64_t zn_hash;
 	matchtype_t zn_matchtype;
-	const char *zn_name_norm;
 	char zn_normbuf[ZAP_MAXNAMELEN];
 } zap_name_t;
 
@@ -183,8 +186,11 @@
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap);
 void zap_evict(dmu_buf_t *db, void *vmzap);
-zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
 
 #define	ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
 
@@ -209,7 +215,7 @@
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
 int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
 #ifdef	__cplusplus
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zap_leaf.h
--- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h	Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZAP_LEAF_H
 #define	_SYS_ZAP_LEAF_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -132,7 +130,7 @@
 		uint8_t le_int_size;		/* size of ints */
 		uint16_t le_next;		/* next entry in hash chain */
 		uint16_t le_name_chunk;		/* first chunk of the name */
-		uint16_t le_name_length;	/* bytes in name, incl null */
+		uint16_t le_name_length;	/* ints in name (incl null) */
 		uint16_t le_value_chunk;	/* first chunk of the value */
 		uint16_t le_value_length;	/* value length in ints */
 		uint32_t le_cd;			/* collision differentiator */
@@ -193,10 +191,10 @@
  * num_integers in the attribute.
  */
 extern int zap_entry_read(const zap_entry_handle_t *zeh,
-	uint8_t integer_size, uint64_t num_integers, void *buf);
+    uint8_t integer_size, uint64_t num_integers, void *buf);
 
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
-	uint16_t buflen, char *buf);
+extern int zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh,
+    uint16_t buflen, char *buf);
 
 /*
  * Replace the value of an existing entry.
@@ -204,7 +202,7 @@
  * zap_entry_update may fail if it runs out of space (ENOSPC).
  */
 extern int zap_entry_update(zap_entry_handle_t *zeh,
-	uint8_t integer_size, uint64_t num_integers, const void *buf);
+    uint8_t integer_size, uint64_t num_integers, const void *buf);
 
 /*
  * Remove an entry.
@@ -216,10 +214,9 @@
  * belong in this leaf (according to its hash value).  Fills in the
  * entry handle on success.  Returns 0 on success or ENOSPC on failure.
  */
-extern int zap_entry_create(zap_leaf_t *l,
-	const char *name, uint64_t h, uint32_t cd,
-	uint8_t integer_size, uint64_t num_integers, const void *buf,
-	zap_entry_handle_t *zeh);
+extern int zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
+    uint8_t integer_size, uint64_t num_integers, const void *buf,
+    zap_entry_handle_t *zeh);
 
 /*
  * Return true if there are additional entries with the same normalized
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zil.h
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Sun Nov 01 14:14:46 2009 -0800
@@ -55,15 +55,17 @@
 	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
 	uint64_t zh_replay_seq;	/* highest replayed sequence number */
 	blkptr_t zh_log;	/* log chain */
-	uint64_t zh_claim_seq;	/* highest claimed sequence number */
+	uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
 	uint64_t zh_flags;	/* header flags */
-	uint64_t zh_pad[4];
+	uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+	uint64_t zh_pad[3];
 } zil_header_t;
 
 /*
  * zh_flags bit settings
  */
-#define	ZIL_REPLAY_NEEDED 0x1	/* replay needed - internal only */
+#define	ZIL_REPLAY_NEEDED	0x1	/* replay needed - internal only */
+#define	ZIL_CLAIM_LR_SEQ_VALID	0x2	/* zh_claim_lr_seq field is valid */
 
 /*
  * Log block trailer - structure at the end of the header and each log block
@@ -150,6 +152,20 @@
 #define	TX_CI	((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
 
 /*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order.  For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define	TX_OOO(txtype)			\
+	((txtype) == TX_WRITE ||	\
+	(txtype) == TX_TRUNCATE ||	\
+	(txtype) == TX_SETATTR ||	\
+	(txtype) == TX_ACL_V0 ||	\
+	(txtype) == TX_ACL ||		\
+	(txtype) == TX_WRITE2)
+
+
+/*
  * Format of log records.
  * The fields are carefully defined to allow them to be aligned
  * and sized the same on sparc & intel architectures.
@@ -169,6 +185,14 @@
 } lr_t;
 
 /*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_foid;	/* object id */
+} lr_ooo_t;
+
+/*
  * Handle option extended vattr attributes.
  *
  * Whenever new attributes are added the version number
@@ -258,7 +282,7 @@
 	uint64_t	lr_foid;	/* file object to write */
 	uint64_t	lr_offset;	/* offset to write to */
 	uint64_t	lr_length;	/* user data length to write */
-	uint64_t	lr_blkoff;	/* offset represented by lr_blkptr */
+	uint64_t	lr_blkoff;	/* no longer used */
 	blkptr_t	lr_blkptr;	/* spa block pointer for replay */
 	/* write data will follow for small writes */
 } lr_write_t;
@@ -333,6 +357,7 @@
 			/* and put blkptr in log, rather than actual data) */
 	WR_COPIED,	/* immediate - data is copied into lr_write_t */
 	WR_NEED_COPY,	/* immediate - data needs to be copied if pushed */
+	WR_NUM_STATES	/* number of states */
 } itx_wr_state_t;
 
 typedef struct itx {
@@ -345,26 +370,14 @@
 	/* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
 
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
-	zilog_t		*zgd_zilog;	/* zilog */
-	blkptr_t	*zgd_bp;	/* block pointer */
-	struct rl	*zgd_rl;	/* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
     uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
-extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
 
 extern void	zil_init(void);
@@ -378,10 +391,12 @@
 
 extern void	zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t	*zil_itx_create(uint64_t txtype, size_t lrsize);
+extern void	zil_itx_destroy(itx_t *itx);
 extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
 extern void	zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
@@ -391,14 +406,14 @@
 extern int	zil_check_log_chain(char *osname, void *txarg);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_clean(zilog_t *zilog);
-extern int	zil_is_committed(zilog_t *zilog);
 
 extern int	zil_suspend(zilog_t *zilog);
 extern void	zil_resume(zilog_t *zilog);
 
-extern void	zil_add_block(zilog_t *zilog, blkptr_t *bp);
+extern void	zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern int	zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
 extern void	zil_set_logbias(zilog_t *zilog, uint64_t slogval);
-extern void	zil_get_replay_data(zilog_t *zilog, lr_write_t *lr);
 
 extern int zil_disable;
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zil_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -43,8 +43,8 @@
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
 	zio_t		*lwb_zio;	/* zio for this buffer */
+	dmu_tx_t	*lwb_tx;	/* tx for log block allocation */
 	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
-	txg_handle_t	lwb_txgh;	/* txg handle for txg_exit() */
 	list_node_t	lwb_node;	/* zilog->zl_lwb_list linkage */
 } lwb_t;
 
@@ -68,9 +68,10 @@
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	zio_t		*zl_root_zio;	/* log writer root zio */
-	uint64_t	zl_itx_seq;	/* next itx sequence number */
+	uint64_t	zl_itx_seq;	/* next in-core itx sequence number */
+	uint64_t	zl_lr_seq;	/* on-disk log record sequence number */
 	uint64_t	zl_commit_seq;	/* committed upto this number */
-	uint64_t	zl_lr_seq;	/* log record sequence number */
+	uint64_t	zl_commit_lr_seq; /* last committed on-disk lr seq */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
 	uint64_t	zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
 	uint64_t	zl_replaying_seq; /* current replay seq number */
@@ -82,8 +83,12 @@
 	uint8_t		zl_replay;	/* replaying records while set */
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
-	uint8_t		zl_log_error;	/* boolean: log write error */
 	uint8_t		zl_logbias;	/* latency or throughput */
+	int		zl_parse_error;	/* last zil_parse() error */
+	uint64_t	zl_parse_blk_seq; /* highest blk seq on last parse */
+	uint64_t	zl_parse_lr_seq; /* highest lr seq on last parse */
+	uint64_t	zl_parse_blk_count; /* number of blocks parsed */
+	uint64_t	zl_parse_lr_count; /* number of log records parsed */
 	list_t		zl_itx_list;	/* in-memory itx list */
 	uint64_t	zl_itx_list_sz;	/* total size of records on list */
 	uint64_t	zl_cur_used;	/* current commit log size used */
@@ -92,15 +97,16 @@
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
 	avl_tree_t	zl_vdev_tree;	/* vdevs to flush in zil_commit() */
 	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
-	avl_tree_t	zl_dva_tree;	/* track DVAs during log parse */
+	avl_tree_t	zl_bp_tree;	/* track bps during log parse */
 	clock_t		zl_replay_time;	/* lbolt of when replay started */
 	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
+	zil_header_t	zl_old_header;	/* debugging aid */
 };
 
-typedef struct zil_dva_node {
+typedef struct zil_bp_node {
 	dva_t		zn_dva;
 	avl_node_t	zn_node;
-} zil_dva_node_t;
+} zil_bp_node_t;
 
 #define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
     sizeof (lr_write_t))
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zio.h
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Sun Nov 01 14:14:46 2009 -0800
@@ -79,6 +79,12 @@
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
+#define	ZIO_CHECKSUM_MASK	0xffULL
+#define	ZIO_CHECKSUM_VERIFY	(1 << 8)
+
+#define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
+#define	ZIO_DEDUPDITTO_MIN	100
+
 enum zio_compress {
 	ZIO_COMPRESS_INHERIT = 0,
 	ZIO_COMPRESS_ON,
@@ -94,12 +100,19 @@
 	ZIO_COMPRESS_GZIP_7,
 	ZIO_COMPRESS_GZIP_8,
 	ZIO_COMPRESS_GZIP_9,
+	ZIO_COMPRESS_ZLE,
 	ZIO_COMPRESS_FUNCTIONS
 };
 
 #define	ZIO_COMPRESS_ON_VALUE	ZIO_COMPRESS_LZJB
 #define	ZIO_COMPRESS_DEFAULT	ZIO_COMPRESS_OFF
 
+#define	BOOTFS_COMPRESS_VALID(compress)			\
+	((compress) == ZIO_COMPRESS_LZJB ||		\
+	((compress) == ZIO_COMPRESS_ON &&		\
+	ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) ||	\
+	(compress) == ZIO_COMPRESS_OFF)
+
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
@@ -116,75 +129,78 @@
 #define	ZIO_PRIORITY_SCRUB		(zio_priority_table[9])
 #define	ZIO_PRIORITY_TABLE_SIZE		10
 
-#define	ZIO_FLAG_MUSTSUCCEED		0x000000
-#define	ZIO_FLAG_CANFAIL		0x000001
-#define	ZIO_FLAG_SPECULATIVE		0x000002
-#define	ZIO_FLAG_CONFIG_WRITER		0x000004
-#define	ZIO_FLAG_DONT_RETRY		0x000008
-
-#define	ZIO_FLAG_DONT_CACHE		0x000010
-#define	ZIO_FLAG_DONT_QUEUE		0x000020
-#define	ZIO_FLAG_DONT_AGGREGATE		0x000040
-#define	ZIO_FLAG_DONT_PROPAGATE		0x000080
-
-#define	ZIO_FLAG_IO_BYPASS		0x000100
-#define	ZIO_FLAG_IO_REPAIR		0x000200
-#define	ZIO_FLAG_IO_RETRY		0x000400
-#define	ZIO_FLAG_IO_REWRITE		0x000800
-
-#define	ZIO_FLAG_SELF_HEAL		0x001000
-#define	ZIO_FLAG_RESILVER		0x002000
-#define	ZIO_FLAG_SCRUB			0x004000
-#define	ZIO_FLAG_SCRUB_THREAD		0x008000
-
-#define	ZIO_FLAG_PROBE			0x010000
-#define	ZIO_FLAG_GANG_CHILD		0x020000
-#define	ZIO_FLAG_RAW			0x040000
-#define	ZIO_FLAG_GODFATHER		0x080000
-
-#define	ZIO_FLAG_TRYHARD		0x100000
-#define	ZIO_FLAG_NODATA			0x200000
-#define	ZIO_FLAG_OPTIONAL		0x400000
-
-#define	ZIO_FLAG_GANG_INHERIT		\
-	(ZIO_FLAG_CANFAIL |		\
-	ZIO_FLAG_SPECULATIVE |		\
-	ZIO_FLAG_CONFIG_WRITER |	\
-	ZIO_FLAG_DONT_RETRY |		\
-	ZIO_FLAG_DONT_CACHE |		\
-	ZIO_FLAG_DONT_AGGREGATE |	\
-	ZIO_FLAG_SELF_HEAL |		\
-	ZIO_FLAG_RESILVER |		\
-	ZIO_FLAG_SCRUB |		\
-	ZIO_FLAG_SCRUB_THREAD)
-
-#define	ZIO_FLAG_VDEV_INHERIT		\
-	(ZIO_FLAG_GANG_INHERIT |	\
-	ZIO_FLAG_IO_REPAIR |		\
-	ZIO_FLAG_IO_RETRY |		\
-	ZIO_FLAG_PROBE |		\
-	ZIO_FLAG_TRYHARD |		\
-	ZIO_FLAG_NODATA |		\
-	ZIO_FLAG_OPTIONAL)
-
-#define	ZIO_FLAG_AGG_INHERIT		\
-	(ZIO_FLAG_DONT_AGGREGATE |	\
-	ZIO_FLAG_IO_REPAIR |		\
-	ZIO_FLAG_SELF_HEAL |		\
-	ZIO_FLAG_RESILVER |		\
-	ZIO_FLAG_SCRUB |		\
-	ZIO_FLAG_SCRUB_THREAD)
-
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
 
+enum zio_flag {
+	/*
+	 * Flags inherited by gang, ddt, and vdev children,
+	 * and that must be equal for two zios to aggregate
+	 */
+	ZIO_FLAG_DONT_AGGREGATE	= 1 << 0,
+	ZIO_FLAG_IO_REPAIR	= 1 << 1,
+	ZIO_FLAG_SELF_HEAL	= 1 << 2,
+	ZIO_FLAG_RESILVER	= 1 << 3,
+	ZIO_FLAG_SCRUB		= 1 << 4,
+	ZIO_FLAG_SCRUB_THREAD	= 1 << 5,
+
+#define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
+
+	/*
+	 * Flags inherited by ddt, gang, and vdev children.
+	 */
+	ZIO_FLAG_CANFAIL	= 1 << 6,	/* must be first for INHERIT */
+	ZIO_FLAG_SPECULATIVE	= 1 << 7,
+	ZIO_FLAG_CONFIG_WRITER	= 1 << 8,
+	ZIO_FLAG_DONT_RETRY	= 1 << 9,
+	ZIO_FLAG_DONT_CACHE	= 1 << 10,
+	ZIO_FLAG_NODATA		= 1 << 11,
+	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 12,
+
+#define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
+#define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
+
+	/*
+	 * Flags inherited by vdev children.
+	 */
+	ZIO_FLAG_IO_RETRY	= 1 << 13,	/* must be first for INHERIT */
+	ZIO_FLAG_PROBE		= 1 << 14,
+	ZIO_FLAG_TRYHARD	= 1 << 15,
+	ZIO_FLAG_OPTIONAL	= 1 << 16,
+
+#define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
+
+	/*
+	 * Flags not inherited by any children.
+	 */
+	ZIO_FLAG_DONT_QUEUE	= 1 << 17,	/* must be first for INHERIT */
+	ZIO_FLAG_DONT_PROPAGATE	= 1 << 18,
+	ZIO_FLAG_IO_BYPASS	= 1 << 19,
+	ZIO_FLAG_IO_REWRITE	= 1 << 20,
+	ZIO_FLAG_RAW		= 1 << 21,
+	ZIO_FLAG_GANG_CHILD	= 1 << 22,
+	ZIO_FLAG_DDT_CHILD	= 1 << 23,
+	ZIO_FLAG_GODFATHER	= 1 << 24
+};
+
+#define	ZIO_FLAG_MUSTSUCCEED		0
+
+#define	ZIO_DDT_CHILD_FLAGS(zio)				\
+	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
+	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
 #define	ZIO_GANG_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
 	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
+#define	ZIO_VDEV_CHILD_FLAGS(zio)				\
+	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
+	ZIO_FLAG_CANFAIL)
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
+	ZIO_CHILD_DDT,
 	ZIO_CHILD_LOGICAL,
 	ZIO_CHILD_TYPES
 };
@@ -202,7 +218,6 @@
 #define	ECKSUM	EBADE
 #define	EFRAGS	EBADR
 
-typedef struct zio zio_t;
 typedef void zio_done_func_t(zio_t *zio);
 
 extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
@@ -211,18 +226,15 @@
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number.  In summary:
+ * is objset 0, and the meta-dnode is object 0.  This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
  *
- *	mos is objset 0
- *	meta-dnode is object 0
- *	root block is <objset, 0, -1, 0>
- *	intent log is <objset, 0, -1, ZIL sequence number>
+ * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  *
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse.  The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel.
  * Therefore it must not change size or alignment between 32/64 bit
@@ -235,12 +247,31 @@
 	uint64_t	zb_blkid;
 } zbookmark_t;
 
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
+{                                                       \
+	(zb)->zb_objset = objset;                       \
+	(zb)->zb_object = object;                       \
+	(zb)->zb_level = level;                         \
+	(zb)->zb_blkid = blkid;                         \
+}
+
+#define	ZB_DESTROYED_OBJSET	(-1ULL)
+
+#define	ZB_ROOT_OBJECT		(0ULL)
+#define	ZB_ROOT_LEVEL		(-1LL)
+#define	ZB_ROOT_BLKID		(0ULL)
+
+#define	ZB_ZIL_OBJECT		(0ULL)
+#define	ZB_ZIL_LEVEL		(-2LL)
+
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	dmu_object_type_t	zp_type;
 	uint8_t			zp_level;
-	uint8_t			zp_ndvas;
+	uint8_t			zp_copies;
+	uint8_t			zp_dedup;
+	uint8_t			zp_dedup_verify;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
@@ -255,9 +286,9 @@
 	struct zio_cksum_report *zcr_next;
 	nvlist_t		*zcr_ereport;
 	nvlist_t		*zcr_detector;
-
 	void			*zcr_cbdata;
 	size_t			zcr_cbinfo;	/* passed to zcr_free() */
+	uint64_t		zcr_align;
 	uint64_t		zcr_length;
 	zio_cksum_finish_f	*zcr_finish;
 	zio_cksum_free_f	*zcr_free;
@@ -326,6 +357,7 @@
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
+	blkptr_t	*io_bp_override;
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
@@ -337,11 +369,14 @@
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_done;
 	void		*io_private;
+	int64_t		io_prev_space_delta;	/* DMU private */
 	blkptr_t	io_bp_orig;
 
 	/* Data represented by this I/O */
 	void		*io_data;
+	void		*io_orig_data;
 	uint64_t	io_size;
+	uint64_t	io_orig_size;
 
 	/* Stuff for the vdev stack */
 	vdev_t		*io_vd;
@@ -355,15 +390,17 @@
 	avl_tree_t	*io_vdev_tree;
 
 	/* Internal pipeline state */
-	int		io_flags;
-	zio_stage_t	io_stage;
-	uint32_t	io_pipeline;
-	int		io_orig_flags;
-	zio_stage_t	io_orig_stage;
-	uint32_t	io_orig_pipeline;
+	enum zio_flag	io_flags;
+	enum zio_stage	io_stage;
+	enum zio_stage	io_pipeline;
+	enum zio_flag	io_orig_flags;
+	enum zio_stage	io_orig_stage;
+	enum zio_stage	io_orig_pipeline;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+	uint64_t	io_child_count;
+	uint64_t	io_parent_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
@@ -378,48 +415,51 @@
 };
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
-    zio_done_func_t *done, void *private, int flags);
+    zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_root(spa_t *spa,
-    zio_done_func_t *done, void *private, int flags);
+    zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb);
+    int priority, enum zio_flag flags, const zbookmark_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, zio_prop_t *zp,
+    void *data, uint64_t size, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb);
+    int priority, enum zio_flag flags, const zbookmark_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, zbookmark_t *zb);
+    int priority, enum zio_flag flags, zbookmark_t *zb);
 
-extern void zio_skip_write(zio_t *zio);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
 
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags);
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags);
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+    const blkptr_t *bp,
+    zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, int flags);
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, int flags,
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
     boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, int flags,
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
     boolean_t labels);
 
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t txg, boolean_t use_slog);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+    const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 
 extern int zio_wait(zio_t *zio);
@@ -441,11 +481,11 @@
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, void *data, uint64_t size, int type, int priority,
-    int flags, zio_done_func_t *done, void *private);
+    enum zio_flag flags, zio_done_func_t *done, void *private);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     void *data, uint64_t size, int type, int priority,
-    int flags, zio_done_func_t *done, void *private);
+    enum zio_flag flags, zio_done_func_t *done, void *private);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
@@ -454,8 +494,12 @@
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+    enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+    enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(enum zio_compress child,
+    enum zio_compress parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio);
 extern int zio_resume(spa_t *spa);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zio_checksum.h
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h	Sun Nov 01 14:14:46 2009 -0800
@@ -27,6 +27,7 @@
 #define	_SYS_ZIO_CHECKSUM_H
 
 #include <sys/zio.h>
+#include <zfs_fletcher.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -44,6 +45,7 @@
 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
 	int		ci_correctable;	/* number of correctable bits	*/
 	int		ci_zbt;		/* uses zio block tail?	*/
+	int		ci_dedup;	/* strong enough for dedup? */
 	char		*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
 
@@ -61,14 +63,6 @@
 /*
  * Checksum routines.
  */
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
 extern zio_checksum_t zio_checksum_SHA256;
 
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zio_compress.h
--- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h	Sun Nov 01 14:14:46 2009 -0800
@@ -20,15 +20,13 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_ZIO_COMPRESS_H
 #define	_SYS_ZIO_COMPRESS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zio.h>
 
 #ifdef	__cplusplus
@@ -66,14 +64,18 @@
     int level);
 extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+    int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+    int level);
 
 /*
  * Compress and decompress data if necessary.
  */
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
-    void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
-    void *dest, uint64_t destsize);
+extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+    size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+    size_t s_len, size_t d_len);
 
 #ifdef	__cplusplus
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/sys/zio_impl.h
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -34,104 +34,136 @@
 #endif
 
 /*
- * I/O Groups: pipeline stage definitions.
+ * zio pipeline stage definitions
  */
-typedef enum zio_stage {
-	ZIO_STAGE_OPEN = 0,			/* RWFCI */
+enum zio_stage {
+	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCI */
 
-	ZIO_STAGE_ISSUE_ASYNC,			/* -W--- */
+	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R---- */
+	ZIO_STAGE_FREE_BP_INIT		= 1 << 2,	/* --F-- */
+	ZIO_STAGE_ISSUE_ASYNC		= 1 << 3,	/* RWF-- */
+	ZIO_STAGE_WRITE_BP_INIT		= 1 << 4,	/* -W--- */
 
-	ZIO_STAGE_READ_BP_INIT,			/* R---- */
-	ZIO_STAGE_WRITE_BP_INIT,		/* -W--- */
+	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 5,	/* -W--- */
 
-	ZIO_STAGE_CHECKSUM_GENERATE,		/* -W--- */
+	ZIO_STAGE_DDT_READ_START	= 1 << 6,	/* R---- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 7,	/* R---- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 8,	/* -W--- */
+	ZIO_STAGE_DDT_FREE		= 1 << 9,	/* --F-- */
 
-	ZIO_STAGE_GANG_ASSEMBLE,		/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE,			/* RWFC- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 10,	/* RWFC- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 11,	/* RWFC- */
 
-	ZIO_STAGE_DVA_ALLOCATE,			/* -W--- */
-	ZIO_STAGE_DVA_FREE,			/* --F-- */
-	ZIO_STAGE_DVA_CLAIM,			/* ---C- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 12,	/* -W--- */
+	ZIO_STAGE_DVA_FREE		= 1 << 13,	/* --F-- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 14,	/* ---C- */
 
-	ZIO_STAGE_READY,			/* RWFCI */
+	ZIO_STAGE_READY			= 1 << 15,	/* RWFCI */
 
-	ZIO_STAGE_VDEV_IO_START,		/* RW--I */
-	ZIO_STAGE_VDEV_IO_DONE,			/* RW--I */
-	ZIO_STAGE_VDEV_IO_ASSESS,		/* RW--I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 16,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 17,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 18,	/* RW--I */
 
-	ZIO_STAGE_CHECKSUM_VERIFY,		/* R---- */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 19,	/* R---- */
 
-	ZIO_STAGE_DONE,				/* RWFCI */
-	ZIO_STAGES
-} zio_stage_t;
+	ZIO_STAGE_DONE			= 1 << 20	/* RWFCI */
+};
 
-#define	ZIO_INTERLOCK_STAGES					\
-	((1U << ZIO_STAGE_READY) |				\
-	(1U << ZIO_STAGE_DONE))
+#define	ZIO_INTERLOCK_STAGES			\
+	(ZIO_STAGE_READY |			\
+	ZIO_STAGE_DONE)
 
-#define	ZIO_INTERLOCK_PIPELINE					\
+#define	ZIO_INTERLOCK_PIPELINE			\
 	ZIO_INTERLOCK_STAGES
 
-#define	ZIO_VDEV_IO_STAGES					\
-	((1U << ZIO_STAGE_VDEV_IO_START) |			\
-	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
-	(1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define	ZIO_VDEV_IO_STAGES			\
+	(ZIO_STAGE_VDEV_IO_START |		\
+	ZIO_STAGE_VDEV_IO_DONE |		\
+	ZIO_STAGE_VDEV_IO_ASSESS)
 
-#define	ZIO_VDEV_CHILD_PIPELINE					\
-	(ZIO_VDEV_IO_STAGES |					\
-	(1U << ZIO_STAGE_DONE))
+#define	ZIO_VDEV_CHILD_PIPELINE			\
+	(ZIO_VDEV_IO_STAGES |			\
+	ZIO_STAGE_DONE)
 
-#define	ZIO_READ_COMMON_STAGES					\
-	(ZIO_INTERLOCK_STAGES |					\
-	ZIO_VDEV_IO_STAGES |					\
-	(1U << ZIO_STAGE_CHECKSUM_VERIFY))
+#define	ZIO_READ_COMMON_STAGES			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_VDEV_IO_STAGES |			\
+	ZIO_STAGE_CHECKSUM_VERIFY)
 
-#define	ZIO_READ_PHYS_PIPELINE					\
+#define	ZIO_READ_PHYS_PIPELINE			\
 	ZIO_READ_COMMON_STAGES
 
-#define	ZIO_READ_PIPELINE					\
-	(ZIO_READ_COMMON_STAGES |				\
-	(1U << ZIO_STAGE_READ_BP_INIT))
+#define	ZIO_READ_PIPELINE			\
+	(ZIO_READ_COMMON_STAGES |		\
+	ZIO_STAGE_READ_BP_INIT)
+
+#define	ZIO_DDT_CHILD_READ_PIPELINE		\
+	ZIO_READ_COMMON_STAGES
 
-#define	ZIO_WRITE_COMMON_STAGES					\
-	(ZIO_INTERLOCK_STAGES |					\
-	ZIO_VDEV_IO_STAGES |					\
-	(1U << ZIO_STAGE_ISSUE_ASYNC) |				\
-	(1U << ZIO_STAGE_CHECKSUM_GENERATE))
+#define	ZIO_DDT_READ_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_READ_BP_INIT |		\
+	ZIO_STAGE_DDT_READ_START |		\
+	ZIO_STAGE_DDT_READ_DONE)
 
-#define	ZIO_WRITE_PHYS_PIPELINE					\
+#define	ZIO_WRITE_COMMON_STAGES			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_VDEV_IO_STAGES |			\
+	ZIO_STAGE_ISSUE_ASYNC |			\
+	ZIO_STAGE_CHECKSUM_GENERATE)
+
+#define	ZIO_WRITE_PHYS_PIPELINE			\
 	ZIO_WRITE_COMMON_STAGES
 
-#define	ZIO_REWRITE_PIPELINE					\
-	(ZIO_WRITE_COMMON_STAGES |				\
-	(1U << ZIO_STAGE_WRITE_BP_INIT))
+#define	ZIO_REWRITE_PIPELINE			\
+	(ZIO_WRITE_COMMON_STAGES |		\
+	ZIO_STAGE_WRITE_BP_INIT)
+
+#define	ZIO_WRITE_PIPELINE			\
+	(ZIO_WRITE_COMMON_STAGES |		\
+	ZIO_STAGE_WRITE_BP_INIT |		\
+	ZIO_STAGE_DVA_ALLOCATE)
 
-#define	ZIO_WRITE_PIPELINE					\
-	(ZIO_WRITE_COMMON_STAGES |				\
-	(1U << ZIO_STAGE_WRITE_BP_INIT) |			\
-	(1U << ZIO_STAGE_DVA_ALLOCATE))
+#define	ZIO_DDT_CHILD_WRITE_PIPELINE		\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_VDEV_IO_STAGES |			\
+	ZIO_STAGE_DVA_ALLOCATE)
 
-#define	ZIO_GANG_STAGES						\
-	((1U << ZIO_STAGE_GANG_ASSEMBLE) |			\
-	(1U << ZIO_STAGE_GANG_ISSUE))
+#define	ZIO_DDT_WRITE_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_ISSUE_ASYNC |			\
+	ZIO_STAGE_WRITE_BP_INIT |		\
+	ZIO_STAGE_CHECKSUM_GENERATE |		\
+	ZIO_STAGE_DDT_WRITE)
+
+#define	ZIO_GANG_STAGES				\
+	(ZIO_STAGE_GANG_ASSEMBLE |		\
+	ZIO_STAGE_GANG_ISSUE)
 
-#define	ZIO_FREE_PIPELINE					\
-	(ZIO_INTERLOCK_STAGES |					\
-	(1U << ZIO_STAGE_DVA_FREE))
+#define	ZIO_FREE_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_FREE_BP_INIT |		\
+	ZIO_STAGE_DVA_FREE)
 
-#define	ZIO_CLAIM_PIPELINE					\
-	(ZIO_INTERLOCK_STAGES |					\
-	(1U << ZIO_STAGE_DVA_CLAIM))
+#define	ZIO_DDT_FREE_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_FREE_BP_INIT |		\
+	ZIO_STAGE_ISSUE_ASYNC |			\
+	ZIO_STAGE_DDT_FREE)
 
-#define	ZIO_IOCTL_PIPELINE					\
-	(ZIO_INTERLOCK_STAGES |					\
-	(1U << ZIO_STAGE_VDEV_IO_START) |			\
-	(1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define	ZIO_CLAIM_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_DVA_CLAIM)
 
-#define	ZIO_CONFIG_LOCK_BLOCKING_STAGES				\
-	((1U << ZIO_STAGE_VDEV_IO_START) |			\
-	(1U << ZIO_STAGE_DVA_ALLOCATE) |			\
-	(1U << ZIO_STAGE_DVA_CLAIM))
+#define	ZIO_IOCTL_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_VDEV_IO_START |		\
+	ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define	ZIO_BLOCKING_STAGES			\
+	(ZIO_STAGE_DVA_ALLOCATE |		\
+	ZIO_STAGE_DVA_CLAIM |			\
+	ZIO_STAGE_VDEV_IO_START)
 
 extern void zio_inject_init(void);
 extern void zio_inject_fini(void);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/txg.c
--- a/usr/src/uts/common/fs/zfs/txg.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/txg.c	Sun Nov 01 14:14:46 2009 -0800
@@ -64,7 +64,6 @@
 		}
 	}
 
-	rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
 	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
@@ -87,7 +86,6 @@
 
 	ASSERT(tx->tx_threads == 0);
 
-	rw_destroy(&tx->tx_suspend);
 	mutex_destroy(&tx->tx_sync_lock);
 
 	cv_destroy(&tx->tx_sync_more_cv);
@@ -397,8 +395,6 @@
 		if (tx->tx_exiting)
 			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
 
-		rw_enter(&tx->tx_suspend, RW_WRITER);
-
 		/*
 		 * Consume the quiesced txg which has been handed off to
 		 * us.  This may cause the quiescing thread to now be
@@ -408,7 +404,6 @@
 		tx->tx_quiesced_txg = 0;
 		tx->tx_syncing_txg = txg;
 		cv_broadcast(&tx->tx_quiesce_more_cv);
-		rw_exit(&tx->tx_suspend);
 
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
@@ -419,10 +414,8 @@
 		delta = lbolt - start;
 
 		mutex_enter(&tx->tx_sync_lock);
-		rw_enter(&tx->tx_suspend, RW_WRITER);
 		tx->tx_synced_txg = txg;
 		tx->tx_syncing_txg = 0;
-		rw_exit(&tx->tx_suspend);
 		cv_broadcast(&tx->tx_sync_done_cv);
 
 		/*
@@ -514,7 +507,7 @@
 	mutex_enter(&tx->tx_sync_lock);
 	ASSERT(tx->tx_threads == 2);
 	if (txg == 0)
-		txg = tx->tx_open_txg;
+		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
 	if (tx->tx_sync_txg_waiting < txg)
 		tx->tx_sync_txg_waiting = txg;
 	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -565,21 +558,6 @@
 	    tx->tx_quiesced_txg != 0);
 }
 
-void
-txg_suspend(dsl_pool_t *dp)
-{
-	tx_state_t *tx = &dp->dp_tx;
-	/* XXX some code paths suspend when they are already suspended! */
-	rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
-	tx_state_t *tx = &dp->dp_tx;
-	rw_exit(&tx->tx_suspend);
-}
-
 /*
  * Per-txg object lists.
  */
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/vdev.c
--- a/usr/src/uts/common/fs/zfs/vdev.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Sun Nov 01 14:14:46 2009 -0800
@@ -409,10 +409,7 @@
 	if (ops == &vdev_raidz_ops) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
-			/*
-			 * Currently, we can only support 3 parity devices.
-			 */
-			if (nparity == 0 || nparity > 3)
+			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 				return (EINVAL);
 			/*
 			 * Previous versions could only support 1 or 2 parity
@@ -567,6 +564,7 @@
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
@@ -816,9 +814,9 @@
 	ASSERT(oldc <= newc);
 
 	if (vd->vdev_islog)
-		mc = spa->spa_log_class;
+		mc = spa_log_class(spa);
 	else
-		mc = spa->spa_normal_class;
+		mc = spa_normal_class(spa);
 
 	if (vd->vdev_mg == NULL)
 		vd->vdev_mg = metaslab_group_create(mc, vd);
@@ -1573,7 +1571,7 @@
 		vdev_dtl_reassess(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done);
 
-	if (vd == spa->spa_root_vdev || vd->vdev_ishole)
+	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
@@ -2171,11 +2169,10 @@
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
-		 * If the top-level is a slog and it's had allocations
-		 * then proceed. We check that the vdev's metaslab
-		 * grop is not NULL since it's possible that we may
-		 * have just added this vdev and have not yet initialized
-		 * it's metaslabs.
+		 * If the top-level is a slog and it has had allocations
+		 * then proceed.  We check that the vdev's metaslab group
+		 * is not NULL since it's possible that we may have just
+		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
@@ -2496,14 +2493,17 @@
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
-	    (flags & ZIO_FLAG_SCRUB_THREAD))) {
+	    (flags & ZIO_FLAG_SCRUB_THREAD) ||
+	    spa->spa_claiming)) {
 		/*
-		 * This is either a normal write (not a repair), or it's a
-		 * repair induced by the scrub thread.  In the normal case,
-		 * we commit the DTL change in the same txg as the block
-		 * was born.  In the scrub-induced repair case, we know that
-		 * scrubs run in first-pass syncing context, so we commit
-		 * the DTL change in spa->spa_syncing_txg.
+		 * This is either a normal write (not a repair), or it's
+		 * a repair induced by the scrub thread, or it's a repair
+		 * made by zil_claim() during spa_load() in the first txg.
+		 * In the normal case, we commit the DTL change in the same
+		 * txg as the block was born.  In the scrub-induced repair
+		 * case, we know that scrubs run in first-pass syncing context,
+		 * so we commit the DTL change in spa_syncing_txg(spa).
+		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
@@ -2516,9 +2516,12 @@
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
-				commit_txg = spa->spa_syncing_txg;
+				commit_txg = spa_syncing_txg(spa);
+			} else if (spa->spa_claiming) {
+				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+				commit_txg = spa_first_txg(spa);
 			}
-			ASSERT(commit_txg >= spa->spa_syncing_txg);
+			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
@@ -2560,15 +2563,18 @@
 }
 
 /*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
  */
 void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
-    int64_t defer_delta, boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta)
 {
 	int64_t dspace_delta = space_delta;
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
+	metaslab_group_t *mg = vd->vdev_mg;
+	metaslab_class_t *mc = mg ? mg->mg_class : NULL;
 
 	ASSERT(vd == vd->vdev_top);
 
@@ -2584,29 +2590,25 @@
 	    vd->vdev_deflate_ratio;
 
 	mutex_enter(&vd->vdev_stat_lock);
+	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_space += space_delta;
-	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_dspace += dspace_delta;
-	vd->vdev_stat.vs_defer += defer_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
-	if (update_root) {
+	if (mc == spa_normal_class(spa)) {
+		mutex_enter(&rvd->vdev_stat_lock);
+		rvd->vdev_stat.vs_alloc += alloc_delta;
+		rvd->vdev_stat.vs_space += space_delta;
+		rvd->vdev_stat.vs_dspace += dspace_delta;
+		mutex_exit(&rvd->vdev_stat_lock);
+	}
+
+	if (mc != NULL) {
 		ASSERT(rvd == vd->vdev_parent);
 		ASSERT(vd->vdev_ms_count != 0);
 
-		/*
-		 * Don't count non-normal (e.g. intent log) space as part of
-		 * the pool's capacity.
-		 */
-		if (vd->vdev_islog)
-			return;
-
-		mutex_enter(&rvd->vdev_stat_lock);
-		rvd->vdev_stat.vs_space += space_delta;
-		rvd->vdev_stat.vs_alloc += alloc_delta;
-		rvd->vdev_stat.vs_dspace += dspace_delta;
-		rvd->vdev_stat.vs_defer += defer_delta;
-		mutex_exit(&rvd->vdev_stat_lock);
+		metaslab_class_space_update(mc,
+		    alloc_delta, defer_delta, space_delta, dspace_delta);
 	}
 }
 
@@ -2722,7 +2724,7 @@
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 
-	if (!list_link_active(&vd->vdev_state_dirty_node))
+	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
 		list_insert_head(&spa->spa_state_dirty_list, vd);
 }
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/vdev_mirror.c
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Sun Nov 01 14:14:46 2009 -0800
@@ -214,7 +214,7 @@
 	uint64_t txg = zio->io_txg;
 	int i, c;
 
-	ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
 	/*
 	 * Try to find a child whose DTL doesn't contain the block to read.
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/vdev_queue.c
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Sun Nov 01 14:14:46 2009 -0800
@@ -24,7 +24,6 @@
  */
 
 #include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/vdev_raidz.c
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Sun Nov 01 14:14:46 2009 -0800
@@ -129,7 +129,6 @@
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
-#define	VDEV_RAIDZ_MAXPARITY	3
 
 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
@@ -1524,7 +1523,6 @@
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_t *cvd;
-	blkptr_t *bp = zio->io_bp;
 	raidz_map_t *rm;
 	raidz_col_t *rc;
 	int c, i;
@@ -1585,7 +1583,7 @@
 			rc->rc_skipped = 1;
 			continue;
 		}
-		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
+		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 			if (c >= rm->rm_firstdatacol)
 				rm->rm_missingdata++;
 			else
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zap.c
--- a/usr/src/uts/common/fs/zfs/zap.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zap.c	Sun Nov 01 14:14:46 2009 -0800
@@ -70,7 +70,7 @@
 }
 
 void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 {
 	dmu_buf_t *db;
 	zap_leaf_t *l;
@@ -102,6 +102,7 @@
 	zp->zap_num_entries = 0;
 	zp->zap_salt = zap->zap_salt;
 	zp->zap_normflags = zap->zap_normflags;
+	zp->zap_flags = flags;
 
 	/* block 1 will be the first leaf */
 	for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
@@ -315,8 +316,13 @@
 static int
 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 {
-	/* In case things go horribly wrong. */
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+	/*
+	 * The pointer table should never use more hash bits than we
+	 * have (otherwise we'd be using useless zero bits to index it).
+	 * If we are within 2 bits of running out, stop growing, since
+	 * this is already an aberrant condition.
+	 */
+	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 		return (ENOSPC);
 
 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
@@ -702,9 +708,9 @@
 
 
 static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+fzap_checksize(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
 {
-	if (name && strlen(name) > ZAP_MAXNAMELEN)
+	if (zn->zn_key_orig_len > ZAP_MAXNAMELEN)
 		return (E2BIG);
 
 	/* Only integer sizes supported by C */
@@ -736,7 +742,7 @@
 	int err;
 	zap_entry_handle_t zeh;
 
-	err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+	err = fzap_checksize(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
@@ -746,7 +752,7 @@
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
 		err = zap_entry_read(&zeh, integer_size, num_integers, buf);
-		(void) zap_entry_read_name(&zeh, rn_len, realname);
+		(void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
 		if (ncp) {
 			*ncp = zap_entry_normalization_conflict(&zeh,
 			    zn, NULL, zn->zn_zap);
@@ -769,8 +775,7 @@
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	ASSERT(!zap->zap_ismicro);
-	ASSERT(fzap_checksize(zn->zn_name_orij,
-	    integer_size, num_integers) == 0);
+	ASSERT(fzap_checksize(zn, integer_size, num_integers) == 0);
 
 	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
@@ -784,7 +789,7 @@
 	if (err != ENOENT)
 		goto out;
 
-	err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
+	err = zap_entry_create(l, zn, cd,
 	    integer_size, num_integers, val, &zeh);
 
 	if (err == 0) {
@@ -807,12 +812,12 @@
     uint64_t integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
-	int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+	int err = fzap_checksize(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
 	return (fzap_add_cd(zn, integer_size, num_integers,
-	    val, ZAP_MAXCD, tx));
+	    val, ZAP_NEED_CD, tx));
 }
 
 int
@@ -825,7 +830,7 @@
 	zap_t *zap = zn->zn_zap;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-	err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+	err = fzap_checksize(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
@@ -838,8 +843,8 @@
 	ASSERT(err == 0 || err == ENOENT);
 
 	if (create) {
-		err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
-		    ZAP_MAXCD, integer_size, num_integers, val, &zeh);
+		err = zap_entry_create(l, zn, ZAP_NEED_CD,
+		    integer_size, num_integers, val, &zeh);
 		if (err == 0)
 			zap_increment_num_entries(zap, 1, tx);
 	} else {
@@ -1013,6 +1018,8 @@
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
+	/* memset(za, 0xba, sizeof (zap_attribute_t)); */
+
 	/* retrieve the next entry at or after zc_hash/zc_cd */
 	/* if no entry, return ENOENT */
 
@@ -1063,7 +1070,7 @@
 			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
 			ASSERT(err == 0 || err == EOVERFLOW);
 		}
-		err = zap_entry_read_name(&zeh,
+		err = zap_entry_read_name(zap, &zeh,
 		    sizeof (za->za_name), za->za_name);
 		ASSERT(err == 0);
 
@@ -1109,7 +1116,7 @@
 	zap_leaf_t *l;
 	zap_entry_handle_t zeh;
 
-	if (zn->zn_name_orij && strlen(zn->zn_name_orij) > ZAP_MAXNAMELEN)
+	if (zn->zn_key_orig_len > ZAP_MAXNAMELEN)
 		return (E2BIG);
 
 	err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zap_leaf.c
--- a/usr/src/uts/common/fs/zfs/zap_leaf.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c	Sun Nov 01 14:14:46 2009 -0800
@@ -29,6 +29,7 @@
  * the names are stored null-terminated.
  */
 
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
@@ -272,11 +273,12 @@
 static void
 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
-    char *buf)
+    void *buf)
 {
 	int len = MIN(array_len, buf_len);
 	int byten = 0;
 	uint64_t value = 0;
+	char *p = buf;
 
 	ASSERT3U(array_int_len, <=, buf_int_len);
 
@@ -284,7 +286,7 @@
 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		uint8_t *ip = la->la_array;
-		uint64_t *buf64 = (uint64_t *)buf;
+		uint64_t *buf64 = buf;
 
 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
@@ -299,8 +301,8 @@
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_array *la =
 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
-			bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
-			buf += ZAP_LEAF_ARRAY_BYTES;
+			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+			p += ZAP_LEAF_ARRAY_BYTES;
 			chunk = la->la_next;
 		}
 		return;
@@ -315,12 +317,12 @@
 			value = (value << 8) | la->la_array[i];
 			byten++;
 			if (byten == array_int_len) {
-				stv(buf_int_len, buf, value);
+				stv(buf_int_len, p, value);
 				byten = 0;
 				len--;
 				if (len == 0)
 					return;
-				buf += buf_int_len;
+				p += buf_int_len;
 			}
 		}
 		chunk = la->la_next;
@@ -328,7 +330,6 @@
 }
 
 /*
- * Only to be used on 8-bit arrays.
  * array_len is actual len in bytes (not encoded le_value_length).
  * namenorm is null-terminated.
  */
@@ -337,23 +338,44 @@
 {
 	int bseen = 0;
 
+	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+		uint64_t *thiskey;
+		boolean_t match;
+
+		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+		thiskey = kmem_alloc(array_len * sizeof (*thiskey), KM_SLEEP);
+
+		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_len,
+		    sizeof (*thiskey), array_len, thiskey);
+		match = bcmp(thiskey, zn->zn_key_orig,
+		    array_len * sizeof (*thiskey)) == 0;
+		kmem_free(thiskey, array_len * sizeof (*thiskey));
+		return (match);
+	}
+
 	if (zn->zn_matchtype == MT_FIRST) {
 		char *thisname = kmem_alloc(array_len, KM_SLEEP);
 		boolean_t match;
 
-		zap_leaf_array_read(l, chunk, 1, array_len, 1,
-		    array_len, thisname);
+		zap_leaf_array_read(l, chunk, sizeof (char), array_len,
+		    sizeof (char), array_len, thisname);
 		match = zap_match(zn, thisname);
 		kmem_free(thisname, array_len);
 		return (match);
 	}
 
-	/* Fast path for exact matching */
+	/*
+	 * Fast path for exact matching.
+	 * First check that the lengths match, so that we don't read
+	 * past the end of the zn_key_orig array.
+	 */
+	if (array_len != zn->zn_key_orig_len)
+		return (B_FALSE);
 	while (bseen < array_len) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-		if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread))
+		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
 			break;
 		chunk = la->la_next;
 		bseen += toread;
@@ -426,7 +448,7 @@
 {
 	uint16_t chunk;
 	uint64_t besth = -1ULL;
-	uint32_t bestcd = ZAP_MAXCD;
+	uint32_t bestcd = -1U;
 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
 	uint16_t lh;
 	struct zap_leaf_entry *le;
@@ -459,7 +481,7 @@
 		}
 	}
 
-	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+	return (bestcd == -1U ? ENOENT : 0);
 }
 
 int
@@ -483,14 +505,20 @@
 }
 
 int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+    char *buf)
 {
 	struct zap_leaf_entry *le =
 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
-	zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
-	    le->le_name_length, 1, buflen, buf);
+	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+		    le->le_name_length, 8, buflen / 8, buf);
+	} else {
+		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+		    le->le_name_length, 1, buflen, buf);
+	}
 	if (le->le_name_length > buflen)
 		return (EOVERFLOW);
 	return (0);
@@ -549,26 +577,26 @@
 }
 
 int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
     uint8_t integer_size, uint64_t num_integers, const void *buf,
     zap_entry_handle_t *zeh)
 {
 	uint16_t chunk;
 	uint16_t *chunkp;
 	struct zap_leaf_entry *le;
-	uint64_t namelen, valuelen;
+	uint64_t valuelen;
 	int numchunks;
+	uint64_t h = zn->zn_hash;
 
 	valuelen = integer_size * num_integers;
-	namelen = strlen(name) + 1;
-	ASSERT(namelen >= 2);
 
-	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
+	numchunks = 1 +
+	    ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_len * zn->zn_key_intlen) +
 	    ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
 		return (E2BIG);
 
-	if (cd == ZAP_MAXCD) {
+	if (cd == ZAP_NEED_CD) {
 		/* find the lowest unused cd */
 		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
 			cd = 0;
@@ -585,7 +613,7 @@
 			}
 		} else {
 			/* old unsorted format; do it the O(n^2) way */
-			for (cd = 0; cd < ZAP_MAXCD; cd++) {
+			for (cd = 0; ; cd++) {
 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
 				    chunk != CHAIN_END; chunk = le->le_next) {
 					le = ZAP_LEAF_ENTRY(l, chunk);
@@ -600,10 +628,10 @@
 			}
 		}
 		/*
-		 * we would run out of space in a block before we could
-		 * have ZAP_MAXCD entries
+		 * We would run out of space in a block before we could
+		 * store enough entries to run out of CD values.
 		 */
-		ASSERT3U(cd, <, ZAP_MAXCD);
+		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
 	}
 
 	if (l->l_phys->l_hdr.lh_nfree < numchunks)
@@ -613,8 +641,9 @@
 	chunk = zap_leaf_chunk_alloc(l);
 	le = ZAP_LEAF_ENTRY(l, chunk);
 	le->le_type = ZAP_CHUNK_ENTRY;
-	le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
-	le->le_name_length = namelen;
+	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+	    zn->zn_key_intlen, zn->zn_key_orig_len);
+	le->le_name_length = zn->zn_key_orig_len;
 	le->le_value_chunk =
 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
 	le->le_value_length = num_integers;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zap_micro.c
--- a/usr/src/uts/common/fs/zfs/zap_micro.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c	Sun Nov 01 14:14:46 2009 -0800
@@ -23,6 +23,7 @@
  * Use is subject to license terms.
  */
 
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
@@ -36,33 +37,74 @@
 #include <sys/sunddi.h>
 #endif
 
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
+static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+	if (zap->zap_ismicro)
+		return (0);
+	return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+}
 
+int
+zap_hashbits(zap_t *zap)
+{
+	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+		return (48);
+	else
+		return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+		return ((1<<16)-1);
+	else
+		return (-1U);
+}
 
 static uint64_t
-zap_hash(zap_t *zap, const char *normname)
+zap_hash(zap_name_t *zn)
 {
-	const uint8_t *cp;
-	uint8_t c;
-	uint64_t crc = zap->zap_salt;
+	zap_t *zap = zn->zn_zap;
+	uint64_t h = 0;
 
-	/* NB: name must already be normalized, if necessary */
+	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+		h = *(uint64_t *)zn->zn_key_orig;
+	} else {
+		const uint8_t *cp = (const uint8_t *)zn->zn_key_norm;
+		int i, len;
+
+		h = zap->zap_salt;
+		ASSERT(h != 0);
+		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+		len = zn->zn_key_norm_len;
 
-	ASSERT(crc != 0);
-	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-	for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
-		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+		/*
+		 * Because we previously stored the terminating null on
+		 * disk, but didn't hash it, we need to continue to not
+		 * hash it.  (The zn_key_*_len includes the terminating
+		 * null for non-binary keys.)
+		 */
+		if (!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY))
+			len--;
+
+		for (i = 0; i < len; cp++, i++)
+			h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF];
+
 	}
-
 	/*
-	 * Only use 28 bits, since we need 4 bits in the cookie for the
-	 * collision differentiator.  We MUST use the high bits, since
-	 * those are the ones that we first pay attention to when
+	 * Don't use all 64 bits, since we need some in the cookie for
+	 * the collision differentiator.  We MUST use the high bits,
+	 * since those are the ones that we first pay attention to when
 	 * chosing the bucket.
 	 */
-	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
 
-	return (crc);
+	return (h);
 }
 
 static int
@@ -71,6 +113,8 @@
 	size_t inlen, outlen;
 	int err;
 
+	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
 	inlen = strlen(name) + 1;
 	outlen = ZAP_MAXNAMELEN;
 
@@ -85,16 +129,18 @@
 boolean_t
 zap_match(zap_name_t *zn, const char *matchname)
 {
+	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
 	if (zn->zn_matchtype == MT_FIRST) {
 		char norm[ZAP_MAXNAMELEN];
 
 		if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
 			return (B_FALSE);
 
-		return (strcmp(zn->zn_name_norm, norm) == 0);
+		return (strcmp(zn->zn_key_norm, norm) == 0);
 	} else {
 		/* MT_BEST or MT_EXACT */
-		return (strcmp(zn->zn_name_orij, matchname) == 0);
+		return (strcmp(zn->zn_key_orig, matchname) == 0);
 	}
 }
 
@@ -104,30 +150,49 @@
 	kmem_free(zn, sizeof (zap_name_t));
 }
 
-/* XXX combine this with zap_lockdir()? */
 zap_name_t *
-zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
 {
 	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
 
 	zn->zn_zap = zap;
-	zn->zn_name_orij = name;
+	zn->zn_key_intlen = sizeof (*key);
+	zn->zn_key_orig = key;
+	zn->zn_key_orig_len = strlen(zn->zn_key_orig) + 1;
 	zn->zn_matchtype = mt;
 	if (zap->zap_normflags) {
-		if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
+		if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
 			zap_name_free(zn);
 			return (NULL);
 		}
-		zn->zn_name_norm = zn->zn_normbuf;
+		zn->zn_key_norm = zn->zn_normbuf;
+		zn->zn_key_norm_len = strlen(zn->zn_key_norm) + 1;
 	} else {
 		if (mt != MT_EXACT) {
 			zap_name_free(zn);
 			return (NULL);
 		}
-		zn->zn_name_norm = zn->zn_name_orij;
+		zn->zn_key_norm = zn->zn_key_orig;
+		zn->zn_key_norm_len = zn->zn_key_orig_len;
 	}
 
-	zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
+	zn->zn_hash = zap_hash(zn);
+	return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+	ASSERT(zap->zap_normflags == 0);
+	zn->zn_zap = zap;
+	zn->zn_key_intlen = sizeof (*key);
+	zn->zn_key_orig = zn->zn_key_norm = key;
+	zn->zn_key_orig_len = zn->zn_key_norm_len = numints;
+	zn->zn_matchtype = MT_EXACT;
+
+	zn->zn_hash = zap_hash(zn);
 	return (zn);
 }
 
@@ -186,7 +251,7 @@
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT(mzep->mze_cd < ZAP_MAXCD);
+	ASSERT(mzep->mze_cd < zap_maxcd(zap));
 
 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
 	mze->mze_chunkid = chunkid;
@@ -206,9 +271,6 @@
 	ASSERT(zn->zn_zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
-	if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
-		return (NULL);
-
 	mze_tofind.mze_hash = zn->zn_hash;
 	mze_tofind.mze_phys.mze_cd = 0;
 
@@ -421,7 +483,7 @@
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
-			return (mzap_upgrade(zapp, tx));
+			return (mzap_upgrade(zapp, tx, 0));
 		}
 		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
 		ASSERT3U(err, ==, 0);
@@ -441,10 +503,11 @@
 }
 
 static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 {
 	mzap_phys_t *mzp;
-	int i, sz, nchunks, err;
+	int i, sz, nchunks;
+	int err = 0;
 	zap_t *zap = *zapp;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -454,11 +517,13 @@
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	nchunks = zap->zap_m.zap_num_chunks;
 
-	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
-	    1ULL << fzap_default_block_shift, 0, tx);
-	if (err) {
-		kmem_free(mzp, sz);
-		return (err);
+	if (!flags) {
+		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+		    1ULL << fzap_default_block_shift, 0, tx);
+		if (err) {
+			kmem_free(mzp, sz);
+			return (err);
+		}
 	}
 
 	dprintf("upgrading obj=%llu with %u chunks\n",
@@ -466,10 +531,9 @@
 	/* XXX destroy the avl later, so we can use the stored hash value */
 	mze_destroy(zap);
 
-	fzap_upgrade(zap, tx);
+	fzap_upgrade(zap, tx, flags);
 
 	for (i = 0; i < nchunks; i++) {
-		int err;
 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
 		zap_name_t *zn;
 		if (mze->mze_name[0] == 0)
@@ -489,7 +553,8 @@
 }
 
 static void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
+    dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	mzap_phys_t *zp;
@@ -510,6 +575,15 @@
 	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
 	zp->mz_normflags = normflags;
 	dmu_buf_rele(db, FTAG);
+
+	if (flags != 0) {
+		zap_t *zap;
+		/* Only fat zap supports flags; upgrade immediately. */
+		VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
+		    B_FALSE, B_FALSE, &zap));
+		VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
+		zap_unlockdir(zap);
+	}
 }
 
 int
@@ -530,7 +604,7 @@
 	err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
 	if (err != 0)
 		return (err);
-	mzap_create_impl(os, obj, normflags, tx);
+	mzap_create_impl(os, obj, normflags, 0, tx);
 	return (0);
 }
 
@@ -547,7 +621,26 @@
 {
 	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
-	mzap_create_impl(os, obj, normflags, tx);
+	mzap_create_impl(os, obj, normflags, 0, tx);
+	return (obj);
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+	ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
+	    leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+	    indirect_blockshift >= SPA_MINBLOCKSHIFT &&
+	    indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+
+	VERIFY(dmu_object_set_blocksize(os, obj,
+	    1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
+
+	mzap_create_impl(os, obj, normflags, flags, tx);
 	return (obj);
 }
 
@@ -699,6 +792,30 @@
 }
 
 int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_t *zap;
+	int err;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+
+	err = fzap_lookup(zn, integer_size, num_integers, buf,
+	    NULL, 0, NULL);
+	zap_name_free(zn);
+	zap_unlockdir(zap);
+	return (err);
+}
+
+int
 zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers)
 {
@@ -733,6 +850,28 @@
 	return (err);
 }
 
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_t *zap;
+	int err;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+	err = fzap_length(zn, integer_size, num_integers);
+	zap_name_free(zn);
+	zap_unlockdir(zap);
+	return (err);
+}
+
 static void
 mzap_addent(zap_name_t *zn, uint64_t value)
 {
@@ -741,20 +880,18 @@
 	int start = zap->zap_m.zap_alloc_next;
 	uint32_t cd;
 
-	dprintf("obj=%llu %s=%llu\n", zap->zap_object,
-	    zn->zn_name_orij, value);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 #ifdef ZFS_DEBUG
 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
-		ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
+		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 	}
 #endif
 
 	cd = mze_find_unused_cd(zap, zn->zn_hash);
 	/* given the limited size of the microzap, this can't happen */
-	ASSERT(cd != ZAP_MAXCD);
+	ASSERT(cd < zap_maxcd(zap));
 
 again:
 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
@@ -762,7 +899,7 @@
 		if (mze->mze_name[0] == 0) {
 			mze->mze_value = value;
 			mze->mze_cd = cd;
-			(void) strcpy(mze->mze_name, zn->zn_name_orij);
+			(void) strcpy(mze->mze_name, zn->zn_key_orig);
 			zap->zap_m.zap_num_entries++;
 			zap->zap_m.zap_alloc_next = i+1;
 			if (zap->zap_m.zap_alloc_next ==
@@ -780,7 +917,7 @@
 }
 
 int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
@@ -793,7 +930,7 @@
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err)
 		return (err);
-	zn = zap_name_alloc(zap, name, MT_EXACT);
+	zn = zap_name_alloc(zap, key, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (ENOTSUP);
@@ -802,10 +939,8 @@
 		err = fzap_add(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
-	    strlen(name) >= MZAP_NAME_LEN) {
-		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
-		    zapobj, integer_size, num_integers, name);
-		err = mzap_upgrade(&zn->zn_zap, tx);
+	    strlen(key) >= MZAP_NAME_LEN) {
+		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0)
 			err = fzap_add(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
@@ -825,6 +960,31 @@
 }
 
 int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+	err = fzap_add(zn, integer_size, num_integers, val, tx);
+	zap = zn->zn_zap;	/* fzap_add() may change zap */
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_add() failed */
+		zap_unlockdir(zap);
+	return (err);
+}
+
+int
 zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
@@ -849,7 +1009,7 @@
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
 		    zapobj, integer_size, num_integers, name);
-		err = mzap_upgrade(&zn->zn_zap, tx);
+		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0)
 			err = fzap_update(zn, integer_size, num_integers,
 			    val, tx);
@@ -872,6 +1032,31 @@
 }
 
 int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	zap_name_t *zn;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+	err = fzap_update(zn, integer_size, num_integers, val, tx);
+	zap = zn->zn_zap;	/* fzap_update() may change zap */
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
+		zap_unlockdir(zap);
+	return (err);
+}
+
+int
 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 {
 	return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
@@ -912,17 +1097,32 @@
 	return (err);
 }
 
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+	err = fzap_remove(zn, tx);
+	zap_name_free(zn);
+	zap_unlockdir(zap);
+	return (err);
+}
+
 /*
  * Routines for iterating over the attributes.
  */
 
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this.  So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
- */
 void
 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized)
@@ -931,15 +1131,9 @@
 	zc->zc_zap = NULL;
 	zc->zc_leaf = NULL;
 	zc->zc_zapobj = zapobj;
-	if (serialized == -1ULL) {
-		zc->zc_hash = -1ULL;
-		zc->zc_cd = 0;
-	} else {
-		zc->zc_hash = serialized << (64-ZAP_HASHBITS);
-		zc->zc_cd = serialized >> ZAP_HASHBITS;
-		if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
-			zc->zc_cd = 0;
-	}
+	zc->zc_serialized = serialized;
+	zc->zc_hash = 0;
+	zc->zc_cd = 0;
 }
 
 void
@@ -969,10 +1163,21 @@
 {
 	if (zc->zc_hash == -1ULL)
 		return (-1ULL);
-	ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
-	ASSERT(zc->zc_cd < ZAP_MAXCD);
-	return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
-	    ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+	if (zc->zc_zap == NULL)
+		return (zc->zc_serialized);
+	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+	/*
+	 * We want to keep the high 32 bits of the cursor zero if we can, so
+	 * that 32-bit programs can access this.  So usually use a small
+	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+	 * of the cursor.
+	 *
+	 * [ collision differentiator | zap_hashbits()-bit hash value ]
+	 */
+	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
 }
 
 int
@@ -987,10 +1192,23 @@
 		return (ENOENT);
 
 	if (zc->zc_zap == NULL) {
+		int hb;
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, &zc->zc_zap);
 		if (err)
 			return (err);
+
+		/*
+		 * To support zap_cursor_init_serialized, advance, retrieve,
+		 * we must add to the existing zc_cd, which may already
+		 * be 1 due to the zap_cursor_advance.
+		 */
+		ASSERT(zc->zc_hash == 0);
+		hb = zap_hashbits(zc->zc_zap);
+		zc->zc_hash = zc->zc_serialized << (64 - hb);
+		zc->zc_cd += zc->zc_serialized >> hb;
+		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+			zc->zc_cd = 0;
 	} else {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	}
@@ -1035,12 +1253,6 @@
 	if (zc->zc_hash == -1ULL)
 		return;
 	zc->zc_cd++;
-	if (zc->zc_cd >= ZAP_MAXCD) {
-		zc->zc_cd = 0;
-		zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
-		if (zc->zc_hash == 0) /* EOF */
-			zc->zc_hash = -1ULL;
-	}
 }
 
 int
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zfs_fm.c
--- a/usr/src/uts/common/fs/zfs/zfs_fm.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c	Sun Nov 01 14:14:46 2009 -0800
@@ -700,6 +700,7 @@
 		bcopy(info, report->zcr_ckinfo, sizeof (*info));
 	}
 
+	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
 	report->zcr_length = length;
 
 #ifdef _KERNEL
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zfs_ioctl.c
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Sun Nov 01 14:14:46 2009 -0800
@@ -41,7 +41,6 @@
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
-#include <sys/vdev_impl.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
@@ -179,7 +178,7 @@
 
 	if (dmu_objset_hold(name, FTAG, &os) == 0) {
 		boolean_t ret;
-		ret = (dmu_objset_id(os) == dmu_objset_spa(os)->spa_bootfs);
+		ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
 		dmu_objset_rele(os, FTAG);
 		return (ret);
 	}
@@ -1219,7 +1218,7 @@
 	 *
 	 * l2cache and spare devices are ok to be added to a rootpool.
 	 */
-	if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) {
+	if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
 		spa_close(spa, FTAG);
 		return (EDOM);
 	}
@@ -1665,6 +1664,11 @@
 				    SPA_VERSION_GZIP_COMPRESSION))
 					return (ENOTSUP);
 
+				if (intval == ZIO_COMPRESS_ZLE &&
+				    zfs_earlier_version(name,
+				    SPA_VERSION_ZLE_COMPRESSION))
+					return (ENOTSUP);
+
 				/*
 				 * If this is a bootable dataset then
 				 * verify that the compression algorithm
@@ -1683,6 +1687,11 @@
 				return (ENOTSUP);
 			break;
 
+		case ZFS_PROP_DEDUP:
+			if (zfs_earlier_version(name, SPA_VERSION_DEDUP))
+				return (ENOTSUP);
+			break;
+
 		case ZFS_PROP_SHARESMB:
 			if (zpl_earlier_version(name, ZPL_VERSION_FUID))
 				return (ENOTSUP);
@@ -2978,9 +2987,9 @@
 		mutex_exit(&spa_namespace_lock);
 		return (EIO);
 	}
-	if (spa->spa_log_state == SPA_LOG_MISSING) {
+	if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
 		/* we need to let spa_open/spa_load clear the chains */
-		spa->spa_log_state = SPA_LOG_CLEAR;
+		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	}
 	spa->spa_last_open_failed = 0;
 	mutex_exit(&spa_namespace_lock);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zfs_log.c
--- a/usr/src/uts/common/fs/zfs/zfs_log.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c	Sun Nov 01 14:14:46 2009 -0800
@@ -47,14 +47,6 @@
 #include <sys/ddi.h>
 #include <sys/dsl_dataset.h>
 
-#define	ZFS_HANDLE_REPLAY(zilog, tx) \
-	if (zilog->zl_replay) { \
-		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
-		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
-		    zilog->zl_replaying_seq; \
-		return; \
-	}
-
 /*
  * These zfs_log_* functions must be called within a dmu tx, in one
  * of 2 contexts depending on zilog->z_replay:
@@ -251,11 +243,9 @@
 	size_t namesize = strlen(name) + 1;
 	size_t fuidsz = 0;
 
-	if (zilog == NULL)
+	if (zil_replaying(zilog, tx))
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	/*
 	 * If we have FUIDs present then add in space for
 	 * domains and ACE fuid's if any.
@@ -356,11 +346,9 @@
 	lr_remove_t *lr;
 	size_t namesize = strlen(name) + 1;
 
-	if (zilog == NULL)
+	if (zil_replaying(zilog, tx))
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
 	lr = (lr_remove_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -382,11 +370,9 @@
 	lr_link_t *lr;
 	size_t namesize = strlen(name) + 1;
 
-	if (zilog == NULL)
+	if (zil_replaying(zilog, tx))
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
 	lr = (lr_link_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -411,11 +397,9 @@
 	size_t namesize = strlen(name) + 1;
 	size_t linksize = strlen(link) + 1;
 
-	if (zilog == NULL)
+	if (zil_replaying(zilog, tx))
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -447,11 +431,9 @@
 	size_t snamesize = strlen(sname) + 1;
 	size_t dnamesize = strlen(dname) + 1;
 
-	if (zilog == NULL)
+	if (zil_replaying(zilog, tx))
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
 	lr = (lr_rename_t *)&itx->itx_lr;
 	lr->lr_sdoid = sdzp->z_id;
@@ -479,11 +461,9 @@
 	uintptr_t fsync_cnt;
 	ssize_t immediate_write_sz;
 
-	if (zilog == NULL || zp->z_unlinked)
+	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 	    ? 0 : zfs_immediate_write_sz;
 
@@ -518,8 +498,7 @@
 		lr = (lr_write_t *)&itx->itx_lr;
 		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
-			kmem_free(itx, offsetof(itx_t, itx_lr) +
-			    itx->itx_lr.lrc_reclen);
+			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			write_state = WR_NEED_COPY;
@@ -560,11 +539,9 @@
 	uint64_t seq;
 	lr_truncate_t *lr;
 
-	if (zilog == NULL || zp->z_unlinked)
+	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	itx = zil_itx_create(txtype, sizeof (*lr));
 	lr = (lr_truncate_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
@@ -590,12 +567,9 @@
 	size_t		recsize = sizeof (lr_setattr_t);
 	void		*start;
 
-
-	if (zilog == NULL || zp->z_unlinked)
+	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	/*
 	 * If XVATTR set, then log record size needs to allow
 	 * for lr_attr_t + xvattr mask, mapsize and create time
@@ -659,11 +633,9 @@
 	size_t txsize;
 	size_t aclbytes = vsecp->vsa_aclentsz;
 
-	if (zilog == NULL || zp->z_unlinked)
+	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
 	txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
 	    TX_ACL_V0 : TX_ACL;
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zfs_replay.c
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c	Sun Nov 01 14:14:46 2009 -0800
@@ -625,7 +625,7 @@
 	znode_t	*zp;
 	int error;
 	ssize_t resid;
-	uint64_t orig_eof, eod;
+	uint64_t orig_eof, eod, offset, length;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
@@ -640,15 +640,24 @@
 			error = 0;
 		return (error);
 	}
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+	eod = offset + length;		/* end of data for this write */
+
 	orig_eof = zp->z_phys->zp_size;
-	eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
 
-	/* If it's a dmu_sync() block get the data and write the whole block */
-	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
-		zil_get_replay_data(zfsvfs->z_log, lr);
+	/* If it's a dmu_sync() block, write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+		if (length < blocksize) {
+			offset -= offset % blocksize;
+			length = blocksize;
+		}
+	}
 
-	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
-	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
+	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
 	/*
 	 * This may be a write from a dmu_sync() for a whole block,
@@ -682,16 +691,8 @@
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
-	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-		/*
-		 * As we can log writes out of order, it's possible the
-		 * file has been removed. In this case just drop the write
-		 * and return success.
-		 */
-		if (error == ENOENT)
-			error = 0;
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
-	}
 
 	end = lr->lr_offset + lr->lr_length;
 	if (end > zp->z_phys->zp_size) {
@@ -714,16 +715,8 @@
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
-	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-		/*
-		 * As we can log truncates out of order, it's possible the
-		 * file has been removed. In this case just drop the truncate
-		 * and return success.
-		 */
-		if (error == ENOENT)
-			error = 0;
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
-	}
 
 	bzero(&fl, sizeof (fl));
 	fl.l_type = F_WRLCK;
@@ -757,16 +750,8 @@
 			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
 	}
 
-	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-		/*
-		 * As we can log setattrs out of order, it's possible the
-		 * file has been removed. In this case just drop the setattr
-		 * and return success.
-		 */
-		if (error == ENOENT)
-			error = 0;
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
-	}
 
 	zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
 	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
@@ -812,16 +797,8 @@
 		zfs_oldace_byteswap(ace, lr->lr_aclcnt);
 	}
 
-	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-		/*
-		 * As we can log acls out of order, it's possible the
-		 * file has been removed. In this case just drop the acl
-		 * and return success.
-		 */
-		if (error == ENOENT)
-			error = 0;
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
-	}
 
 	bzero(&vsa, sizeof (vsa));
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
@@ -869,16 +846,8 @@
 		}
 	}
 
-	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-		/*
-		 * As we can log acls out of order, it's possible the
-		 * file has been removed. In this case just drop the acl
-		 * and return success.
-		 */
-		if (error == ENOENT)
-			error = 0;
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
-	}
 
 	bzero(&vsa, sizeof (vsa));
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zfs_vnops.c
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Sun Nov 01 14:14:46 2009 -0800
@@ -837,21 +837,25 @@
 }
 
 void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
+zfs_get_done(zgd_t *zgd, int error)
 {
-	zgd_t *zgd = (zgd_t *)vzgd;
-	rl_t *rl = zgd->zgd_rl;
-	vnode_t *vp = ZTOV(rl->r_zp);
-	objset_t *os = rl->r_zp->z_zfsvfs->z_os;
-
-	dmu_buf_rele(db, vzgd);
-	zfs_range_unlock(rl);
+	znode_t *zp = zgd->zgd_private;
+	objset_t *os = zp->z_zfsvfs->z_os;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	zfs_range_unlock(zgd->zgd_rl);
+
 	/*
 	 * Release the vnode asynchronously as we currently have the
 	 * txg stopped from syncing.
 	 */
-	VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
-	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
@@ -868,20 +872,21 @@
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
-	uint64_t off = lr->lr_offset;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
-	rl_t *rl;
 	zgd_t *zgd;
-	int dlen = lr->lr_length;		/* length of user data */
 	int error = 0;
 
-	ASSERT(zio);
-	ASSERT(dlen != 0);
+	ASSERT(zio != NULL);
+	ASSERT(size != 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
-	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+	if (zfs_zget(zfsvfs, object, &zp) != 0)
 		return (ENOENT);
 	if (zp->z_unlinked) {
 		/*
@@ -893,6 +898,10 @@
 		return (ENOENT);
 	}
 
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_zilog = zfsvfs->z_log;
+	zgd->zgd_private = zp;
+
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
@@ -901,17 +910,16 @@
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
-		rl = zfs_range_lock(zp, off, dlen, RL_READER);
+		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
-		if (off >= zp->z_phys->zp_size) {
+		if (offset >= zp->z_phys->zp_size) {
 			error = ENOENT;
-			goto out;
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
 		}
-		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
-		    DMU_READ_NO_PREFETCH));
+		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
-		uint64_t boff; /* block starting offset */
-
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and it's checksum is being calculated
@@ -919,80 +927,58 @@
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
-			if (ISP2(zp->z_blksz)) {
-				boff = P2ALIGN_TYPED(off, zp->z_blksz,
-				    uint64_t);
-			} else {
-				boff = 0;
-			}
-			dlen = zp->z_blksz;
-			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
-			if (zp->z_blksz == dlen)
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : 0;
+			offset -= blkoff;
+			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+			    RL_READER);
+			if (zp->z_blksz == size)
 				break;
-			zfs_range_unlock(rl);
+			offset += blkoff;
+			zfs_range_unlock(zgd->zgd_rl);
 		}
 		/* test for truncation needs to be done while range locked */
-		if (off >= zp->z_phys->zp_size) {
+		if (offset >= zp->z_phys->zp_size)
 			error = ENOENT;
-			goto out;
-		}
-		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
-		zgd->zgd_rl = rl;
-		zgd->zgd_zilog = zfsvfs->z_log;
-		zgd->zgd_bp = &lr->lr_blkptr;
 #ifdef DEBUG
 		if (zil_fault_io) {
 			error = EIO;
 			zil_fault_io = 0;
-		} else {
-			error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
 		}
-#else
-		error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
 #endif
-		if (error != 0) {
-			kmem_free(zgd, sizeof (zgd_t));
-			goto out;
-		}
-
-		ASSERT(boff == db->db_offset);
-		lr->lr_blkoff = off - boff;
-		error = dmu_sync(zio, db, &lr->lr_blkptr,
-		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
-		ASSERT((error && error != EINPROGRESS) ||
-		    lr->lr_length <= zp->z_blksz);
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db);
+
 		if (error == 0) {
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= zp->z_blksz);
+
 			/*
-			 * dmu_sync() can compress a block of zeros to a null
-			 * blkptr but the block size still needs to be passed
-			 * through to replay.
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf.  We will finish everything up
+			 * in the zfs_get_done() callback.
 			 */
-			BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
-			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				error = 0;
+			}
 		}
-
-		/*
-		 * If we get EINPROGRESS, then we need to wait for a
-		 * write IO initiated by dmu_sync() to complete before
-		 * we can release this dbuf.  We will finish everything
-		 * up in the zfs_get_done() callback.
-		 */
-		if (error == EINPROGRESS) {
-			return (0);
-		} else if (error == EALREADY) {
-			lr->lr_common.lrc_txtype = TX_WRITE2;
-			error = 0;
-		}
-		dmu_buf_rele(db, zgd);
-		kmem_free(zgd, sizeof (zgd_t));
 	}
-out:
-	zfs_range_unlock(rl);
-	/*
-	 * Release the vnode asynchronously as we currently have the
-	 * txg stopped from syncing.
-	 */
-	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+	zfs_get_done(zgd, error);
+
 	return (error);
 }
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zil.c
--- a/usr/src/uts/common/fs/zfs/zil.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zil.c	Sun Nov 01 14:14:46 2009 -0800
@@ -25,7 +25,6 @@
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
-#include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
@@ -80,10 +79,10 @@
 static boolean_t zil_empty(zilog_t *zilog);
 
 static int
-zil_dva_compare(const void *x1, const void *x2)
+zil_bp_compare(const void *x1, const void *x2)
 {
-	const dva_t *dva1 = x1;
-	const dva_t *dva2 = x2;
+	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
 		return (-1);
@@ -99,34 +98,37 @@
 }
 
 static void
-zil_dva_tree_init(avl_tree_t *t)
+zil_bp_tree_init(zilog_t *zilog)
 {
-	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
-	    offsetof(zil_dva_node_t, zn_node));
+	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 static void
-zil_dva_tree_fini(avl_tree_t *t)
+zil_bp_tree_fini(zilog_t *zilog)
 {
-	zil_dva_node_t *zn;
+	avl_tree_t *t = &zilog->zl_bp_tree;
+	zil_bp_node_t *zn;
 	void *cookie = NULL;
 
 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
-		kmem_free(zn, sizeof (zil_dva_node_t));
+		kmem_free(zn, sizeof (zil_bp_node_t));
 
 	avl_destroy(t);
 }
 
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
-	zil_dva_node_t *zn;
+	avl_tree_t *t = &zilog->zl_bp_tree;
+	const dva_t *dva = BP_IDENTITY(bp);
+	zil_bp_node_t *zn;
 	avl_index_t where;
 
 	if (avl_find(t, dva, &where) != NULL)
 		return (EEXIST);
 
-	zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 	zn->zn_dva = *dva;
 	avl_insert(t, zn, where);
 
@@ -151,37 +153,38 @@
 }
 
 /*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
+ * Read a log block and make sure it's valid.
  */
 static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst)
 {
-	blkptr_t blk = *bp;
+	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+	uint32_t aflags = ARC_WAIT;
+	arc_buf_t *abuf = NULL;
 	zbookmark_t zb;
-	uint32_t aflags = ARC_WAIT;
 	int error;
 
-	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+	if (zilog->zl_header->zh_claim_txg == 0)
+		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
-	*abufpp = NULL;
+	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
-	/*
-	 * We shouldn't be doing any scrubbing while we're doing log
-	 * replay, it's OK to not lock.
-	 */
-	error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
-	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
-	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
-		char *data = (*abufpp)->b_data;
-		uint64_t blksz = BP_GET_LSIZE(bp);
-		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+		char *data = abuf->b_data;
+		uint64_t size = BP_GET_LSIZE(bp);
+		zil_trailer_t *ztp = (zil_trailer_t *)(data + size) - 1;
 		zio_cksum_t cksum = bp->blk_cksum;
 
+		bcopy(data, dst, size);
+		*nbp = ztp->zit_next_blk;
+
 		/*
 		 * Validate the checksummed log block.
 		 *
@@ -194,41 +197,76 @@
 
 		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
 		    sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
-		    (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
+		    (ztp->zit_nused > (size - sizeof (zil_trailer_t))))
 			error = ECKSUM;
-		}
 
-		if (error) {
-			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
-			*abufpp = NULL;
-		}
+		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 	}
 
-	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+	return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+	const blkptr_t *bp = &lr->lr_blkptr;
+	uint32_t aflags = ARC_WAIT;
+	arc_buf_t *abuf = NULL;
+	zbookmark_t zb;
+	int error;
+
+	if (BP_IS_HOLE(bp)) {
+		if (wbuf != NULL)
+			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+		return (0);
+	}
+
+	if (zilog->zl_header->zh_claim_txg == 0)
+		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+	if (error == 0) {
+		if (wbuf != NULL)
+			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+		(void) arc_buf_remove_ref(abuf, &abuf);
+	}
 
 	return (error);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
  */
-uint64_t
+int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
 	const zil_header_t *zh = zilog->zl_header;
-	uint64_t claim_seq = zh->zh_claim_seq;
-	uint64_t seq = 0;
-	uint64_t max_seq = 0;
-	blkptr_t blk = zh->zh_log;
-	arc_buf_t *abuf;
+	boolean_t claimed = !!zh->zh_claim_txg;
+	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+	uint64_t max_blk_seq = 0;
+	uint64_t max_lr_seq = 0;
+	uint64_t blk_count = 0;
+	uint64_t lr_count = 0;
+	blkptr_t blk, next_blk;
 	char *lrbuf, *lrp;
-	zil_trailer_t *ztp;
-	int reclen, error;
+	int error = 0;
 
-	if (BP_IS_HOLE(&blk))
-		return (max_seq);
+	/*
+	 * Old logs didn't record the maximum zh_claim_lr_seq.
+	 */
+	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
@@ -239,95 +277,121 @@
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
-	zil_dva_tree_init(&zilog->zl_dva_tree);
-	for (;;) {
-		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+	zil_bp_tree_init(zilog);
+
+	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+		zil_trailer_t *ztp =
+		    (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
+		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+		int reclen;
 
-		if (claim_seq != 0 && seq > claim_seq)
+		if (blk_seq > claim_blk_seq)
+			break;
+		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+			break;
+		ASSERT(max_blk_seq < blk_seq);
+		max_blk_seq = blk_seq;
+		blk_count++;
+
+		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
-		ASSERT(max_seq < seq);
-		max_seq = seq;
-
-		error = zil_read_log_block(zilog, &blk, &abuf);
-
-		if (parse_blk_func != NULL)
-			parse_blk_func(zilog, &blk, arg, txg);
-
+		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf);
 		if (error)
 			break;
 
-		lrbuf = abuf->b_data;
-		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
-		blk = ztp->zit_next_blk;
-
-		if (parse_lr_func == NULL) {
-			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-			continue;
-		}
-
 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
-			parse_lr_func(zilog, lr, arg, txg);
+			if (lr->lrc_seq > claim_lr_seq)
+				goto done;
+			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+				goto done;
+			ASSERT(max_lr_seq < lr->lrc_seq);
+			max_lr_seq = lr->lrc_seq;
+			lr_count++;
 		}
-		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 	}
-	zil_dva_tree_fini(&zilog->zl_dva_tree);
+done:
+	zilog->zl_parse_error = error;
+	zilog->zl_parse_blk_seq = max_blk_seq;
+	zilog->zl_parse_lr_seq = max_lr_seq;
+	zilog->zl_parse_blk_count = blk_count;
+	zilog->zl_parse_lr_count = lr_count;
+
+	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+
+	zil_bp_tree_fini(zilog);
+	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+
+	return (error);
+}
+
+static int
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+	/*
+	 * Claim log block if not already committed and not already claimed.
+	 * If tx == NULL, just verify that the block is claimable.
+	 */
+	if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+		return (0);
 
-	return (max_seq);
+	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+	lr_write_t *lr = (lr_write_t *)lrc;
+	int error;
+
+	if (lrc->lrc_txtype != TX_WRITE)
+		return (0);
+
+	/*
+	 * If the block is not readable, don't claim it.  This can happen
+	 * in normal operation when a log block is written to disk before
+	 * some of the dmu_sync() blocks it points to.  In this case, the
+	 * transaction cannot have been committed to anyone (we would have
+	 * waited for all writes to be stable first), so it is semantically
+	 * correct to declare this the end of the log.
+	 */
+	if (lr->lr_blkptr.blk_birth >= first_txg &&
+	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
+		return (error);
+
+	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
 /* ARGSUSED */
-static void
-zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+static int
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
 {
-	spa_t *spa = zilog->zl_spa;
-	int err;
+	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
-	/*
-	 * Claim log block if not already committed and not already claimed.
-	 */
-	if (bp->blk_birth >= first_txg &&
-	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
-		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
-		    ZIO_FLAG_MUSTSUCCEED));
-		ASSERT(err == 0);
-	}
+	return (0);
 }
 
-static void
-zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
-{
-	if (lrc->lrc_txtype == TX_WRITE) {
-		lr_write_t *lr = (lr_write_t *)lrc;
-		zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
-	}
-}
-
-/* ARGSUSED */
-static void
-zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
-{
-	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
-}
-
-static void
+static int
 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 {
+	lr_write_t *lr = (lr_write_t *)lrc;
+	blkptr_t *bp = &lr->lr_blkptr;
+
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
-	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
-		lr_write_t *lr = (lr_write_t *)lrc;
-		blkptr_t *bp = &lr->lr_blkptr;
-		if (bp->blk_birth >= claim_txg &&
-		    !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
-			(void) arc_free(NULL, zilog->zl_spa,
-			    dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
-		}
-	}
+	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+	return (0);
 }
 
 /*
@@ -359,17 +423,17 @@
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
-		(void) dmu_tx_assign(tx, TXG_WAIT);
+		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
-			zio_free_blk(zilog->zl_spa, &blk, txg);
+			zio_free_zil(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
-		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
-		    NULL, txg, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -387,6 +451,7 @@
 		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
 		lwb->lwb_max_txg = txg;
 		lwb->lwb_zio = NULL;
+		lwb->lwb_tx = NULL;
 
 		mutex_enter(&zilog->zl_lock);
 		list_insert_tail(&zilog->zl_lwb_list, lwb);
@@ -428,11 +493,13 @@
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
+	zilog->zl_old_header = *zh;		/* debugging aid */
+
 	if (BP_IS_HOLE(&zh->zh_log))
 		return;
 
 	tx = dmu_tx_create(zilog->zl_os);
-	(void) dmu_tx_assign(tx, TXG_WAIT);
+	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
@@ -440,61 +507,27 @@
 
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
+	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
-		zilog->zl_keep_first = B_FALSE;
+		ASSERT(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
-	} else {
-		zilog->zl_keep_first = keep_first;
-		if (zh->zh_flags & ZIL_REPLAY_NEEDED) {
-			ASSERT(!keep_first);
-			(void) zil_parse(zilog, zil_free_log_block,
-			    zil_free_log_record, tx, zh->zh_claim_txg);
-		} else {
-			/*
-			 * Would like to assert zil_empty() but that
-			 * would force us to read the log chain which
-			 * requires us to do I/O to the log. This is
-			 * overkill since we really just want to destroy
-			 * the chain anyway.
-			 */
-			if (!keep_first) {
-				blkptr_t bp = zh->zh_log;
-				zio_free_blk(zilog->zl_spa, &bp, txg);
-			}
-		}
+	} else if (!keep_first) {
+		(void) zil_parse(zilog, zil_free_log_block,
+		    zil_free_log_record, tx, zh->zh_claim_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
-/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
-	const zil_header_t *zh = zilog->zl_header;
-	arc_buf_t *abuf = NULL;
-
-	if (BP_IS_HOLE(&zh->zh_log))
-		return (B_TRUE);
-
-	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
-		return (B_TRUE);
-
-	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-	return (B_FALSE);
-}
-
 int
 zil_claim(char *osname, void *txarg)
 {
@@ -514,9 +547,9 @@
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 
-	if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
 		if (!BP_IS_HOLE(&zh->zh_log))
-			zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
 		BP_ZERO(&zh->zh_log);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_rele(os, FTAG);
@@ -524,23 +557,6 @@
 	}
 
 	/*
-	 * Record here whether the zil has any records to replay.
-	 * If the header block pointer is null or the block points
-	 * to the stubby then we know there are no valid log records.
-	 * We use the header to store this state as the the zilog gets
-	 * freed later in dmu_objset_close().
-	 * The flags (and the rest of the header fields) are cleared in
-	 * zil_sync() as a result of a zil_destroy(), after replaying the log.
-	 *
-	 * Note, the intent log can be empty but still need the
-	 * stubby to be claimed.
-	 */
-	if (!zil_empty(zilog)) {
-		zh->zh_flags |= ZIL_REPLAY_NEEDED;
-		dsl_dataset_dirty(dmu_objset_ds(os), tx);
-	}
-
-	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
@@ -549,9 +565,14 @@
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+		(void) zil_parse(zilog, zil_claim_log_block,
+		    zil_claim_log_record, tx, first_txg);
 		zh->zh_claim_txg = first_txg;
-		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
-		    zil_claim_log_record, tx, first_txg);
+		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+			zh->zh_flags |= ZIL_REPLAY_NEEDED;
+		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
@@ -565,19 +586,15 @@
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
-/* ARGSUSED */
 int
-zil_check_log_chain(char *osname, void *txarg)
+zil_check_log_chain(char *osname, void *tx)
 {
 	zilog_t *zilog;
-	zil_header_t *zh;
-	blkptr_t blk;
-	arc_buf_t *abuf;
 	objset_t *os;
-	char *lrbuf;
-	zil_trailer_t *ztp;
 	int error;
 
+	ASSERT(tx == NULL);
+
 	error = dmu_objset_hold(osname, FTAG, &os);
 	if (error) {
 		cmn_err(CE_WARN, "can't open objset for %s", osname);
@@ -585,26 +602,20 @@
 	}
 
 	zilog = dmu_objset_zil(os);
-	zh = zil_header_in_syncing_context(zilog);
-	blk = zh->zh_log;
-	if (BP_IS_HOLE(&blk)) {
-		dmu_objset_rele(os, FTAG);
-		return (0); /* no chain */
-	}
 
-	for (;;) {
-		error = zil_read_log_block(zilog, &blk, &abuf);
-		if (error)
-			break;
-		lrbuf = abuf->b_data;
-		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
-		blk = ztp->zit_next_blk;
-		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-	}
+	/*
+	 * Because tx == NULL, zil_claim_log_block() will not actually claim
+	 * any blocks, but just determine whether it is possible to do so.
+	 * In addition to checking the log chain, zil_claim_log_block()
+	 * will invoke zio_claim() with a done func of spa_claim_notify(),
+	 * which will update spa_max_claim_txg.  See spa_load() for details.
+	 */
+	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
+
 	dmu_objset_rele(os, FTAG);
-	if (error == ECKSUM)
-		return (0); /* normal end of chain */
-	return (error);
+
+	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 static int
@@ -622,7 +633,7 @@
 }
 
 void
-zil_add_block(zilog_t *zilog, blkptr_t *bp)
+zil_add_block(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *t = &zilog->zl_vdev_tree;
 	avl_index_t where;
@@ -698,6 +709,7 @@
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
+	dmu_tx_t *tx = lwb->lwb_tx;
 
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
@@ -719,17 +731,15 @@
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_buf = NULL;
-	if (zio->io_error)
-		zilog->zl_log_error = B_TRUE;
+	lwb->lwb_tx = NULL;
+	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * Now that we've written this log block, we have a stable pointer
 	 * to the next block in the chain, so it's OK to let the txg in
-	 * which we allocated the next block sync. We still have the
-	 * zl_lock to ensure zil_sync doesn't kmem free the lwb.
+	 * which we allocated the next block sync.
 	 */
-	txg_rele_to_sync(&lwb->lwb_txgh);
-	mutex_exit(&zilog->zl_lock);
+	dmu_tx_commit(tx);
 }
 
 /*
@@ -740,10 +750,9 @@
 {
 	zbookmark_t zb;
 
-	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	if (zilog->zl_root_zio == NULL) {
 		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
@@ -778,6 +787,7 @@
 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
 	spa_t *spa = zilog->zl_spa;
 	blkptr_t *bp = &ztp->zit_next_blk;
+	dmu_tx_t *tx;
 	uint64_t txg;
 	uint64_t zil_blksz;
 	int error;
@@ -789,10 +799,16 @@
 	 * before writing it in order to establish the log chain.
 	 * Note that if the allocation of nlwb synced before we wrote
 	 * the block that points at it (lwb), we'd leak it if we crashed.
-	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+	 * We dirty the dataset to ensure that zil_sync() will be called
+	 * to clean up in the event of allocation failure or I/O failure.
 	 */
-	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
-	txg_rele_to_quiesce(&lwb->lwb_txgh);
+	tx = dmu_tx_create(zilog->zl_os);
+	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+	txg = dmu_tx_get_txg(tx);
+
+	lwb->lwb_tx = tx;
 
 	/*
 	 * Pick a ZIL blocksize. We request a size that is the
@@ -808,22 +824,11 @@
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
-	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg,
+	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
 	    USE_SLOG(zilog));
 	if (error) {
-		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
-
 		/*
-		 * We dirty the dataset to ensure that zil_sync() will
-		 * be called to remove this lwb from our zl_lwb_list.
-		 * Failing to do so, may leave an lwb with a NULL lwb_buf
-		 * hanging around on the zl_lwb_list.
-		 */
-		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-		dmu_tx_commit(tx);
-
-		/*
-		 * Since we've just experienced an allocation failure so we
+		 * Since we've just experienced an allocation failure,
 		 * terminate the current lwb and send it on its way.
 		 */
 		ztp->zit_pad = 0;
@@ -848,7 +853,6 @@
 	 * Allocate a new log write buffer (lwb).
 	 */
 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-
 	nlwb->lwb_zilog = zilog;
 	nlwb->lwb_blk = *bp;
 	nlwb->lwb_nused = 0;
@@ -856,6 +860,7 @@
 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
 	nlwb->lwb_max_txg = txg;
 	nlwb->lwb_zio = NULL;
+	nlwb->lwb_tx = NULL;
 
 	/*
 	 * Put new lwb at the end of the log chain
@@ -870,7 +875,6 @@
 	/*
 	 * kick off the write for the old log block
 	 */
-	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
 	ASSERT(lwb->lwb_zio);
 	zio_nowait(lwb->lwb_zio);
 
@@ -881,20 +885,20 @@
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
 	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lr = (lr_write_t *)lrc;
+	lr_write_t *lrw = (lr_write_t *)lrc;
+	char *lr_buf;
 	uint64_t txg = lrc->lrc_txg;
 	uint64_t reclen = lrc->lrc_reclen;
-	uint64_t dlen;
+	uint64_t dlen = 0;
 
 	if (lwb == NULL)
 		return (NULL);
+
 	ASSERT(lwb->lwb_buf != NULL);
 
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
 		dlen = P2ROUNDUP_TYPED(
-		    lr->lr_length, sizeof (uint64_t), uint64_t);
-	else
-		dlen = 0;
+		    lrw->lr_length, sizeof (uint64_t), uint64_t);
 
 	zilog->zl_cur_used += (reclen + dlen);
 
@@ -915,12 +919,10 @@
 		}
 	}
 
-	/*
-	 * Update the lrc_seq, to be log record sequence number. See zil.h
-	 * Then copy the record to the log buffer.
-	 */
-	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+	bcopy(lrc, lr_buf, reclen);
+	lrc = (lr_t *)lr_buf;
+	lrw = (lr_write_t *)lrc;
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
@@ -932,18 +934,16 @@
 			char *dbuf;
 			int error;
 
-			/* alignment is guaranteed */
-			lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
 			if (dlen) {
 				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
-				dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
-				lr->lr_common.lrc_reclen += dlen;
+				dbuf = lr_buf + reclen;
+				lrw->lr_common.lrc_reclen += dlen;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 			error = zilog->zl_get_data(
-			    itx->itx_private, lr, dbuf, lwb->lwb_zio);
+			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
@@ -956,6 +956,13 @@
 		}
 	}
 
+	/*
+	 * We're actually making an entry, so update lrc_seq to be the
+	 * log record sequence number.  Note that this is generally not
+	 * equal to the itx sequence number because not all transactions
+	 * are synchronous, and sometimes spa_sync() gets there first.
+	 */
+	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
 	lwb->lwb_nused += reclen + dlen;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
@@ -980,12 +987,19 @@
 	return (itx);
 }
 
+void
+zil_itx_destroy(itx_t *itx)
+{
+	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
+}
+
 uint64_t
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t seq;
 
 	ASSERT(itx->itx_lr.lrc_seq == 0);
+	ASSERT(!zilog->zl_replay);
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_itx_list, itx);
@@ -1034,8 +1048,7 @@
 	/* destroy sync'd log transactions */
 	while ((itx = list_head(&clean_list)) != NULL) {
 		list_remove(&clean_list, itx);
-		kmem_free(itx, offsetof(itx_t, itx_lr)
-		    + itx->itx_lr.lrc_reclen);
+		zil_itx_destroy(itx);
 	}
 	list_destroy(&clean_list);
 }
@@ -1064,9 +1077,10 @@
 {
 	uint64_t txg;
 	uint64_t commit_seq = 0;
-	itx_t *itx, *itx_next = (itx_t *)-1;
+	itx_t *itx, *itx_next;
 	lwb_t *lwb;
 	spa_t *spa;
+	int error = 0;
 
 	zilog->zl_writer = B_TRUE;
 	ASSERT(zilog->zl_root_zio == NULL);
@@ -1094,69 +1108,56 @@
 
 	/* Loop through in-memory log transactions filling log blocks. */
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-	for (;;) {
+
+	for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
 		/*
-		 * Find the next itx to push:
-		 * Push all transactions related to specified foid and all
-		 * other transactions except TX_WRITE, TX_TRUNCATE,
-		 * TX_SETATTR and TX_ACL for all other files.
+		 * Save the next pointer.  Even though we drop zl_lock below,
+		 * all threads that can remove itx list entries (other writers
+		 * and zil_itx_clean()) can't do so until they have zl_writer.
 		 */
-		if (itx_next != (itx_t *)-1)
-			itx = itx_next;
-		else
-			itx = list_head(&zilog->zl_itx_list);
-		for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
-			if (foid == 0) /* push all foids? */
-				break;
-			if (itx->itx_sync) /* push all O_[D]SYNC */
-				break;
-			switch (itx->itx_lr.lrc_txtype) {
-			case TX_SETATTR:
-			case TX_WRITE:
-			case TX_TRUNCATE:
-			case TX_ACL:
-				/* lr_foid is same offset for these records */
-				if (((lr_write_t *)&itx->itx_lr)->lr_foid
-				    != foid) {
-					continue; /* skip this record */
-				}
-			}
-			break;
-		}
-		if (itx == NULL)
-			break;
+		itx_next = list_next(&zilog->zl_itx_list, itx);
+
+		/*
+		 * Determine whether to push this itx.
+		 * Push all transactions related to specified foid and
+		 * all other transactions except those that can be logged
+		 * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
+		 * for all other files.
+		 *
+		 * If foid == 0 (meaning "push all foids") or
+		 * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
+		 */
+		if (foid != 0 && !itx->itx_sync &&
+		    TX_OOO(itx->itx_lr.lrc_txtype) &&
+		    ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
+			continue; /* skip this record */
 
 		if ((itx->itx_lr.lrc_seq > seq) &&
 		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
-		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
+		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb))))
 			break;
-		}
 
-		/*
-		 * Save the next pointer.  Even though we soon drop
-		 * zl_lock all threads that may change the list
-		 * (another writer or zil_itx_clean) can't do so until
-		 * they have zl_writer.
-		 */
-		itx_next = list_next(&zilog->zl_itx_list, itx);
 		list_remove(&zilog->zl_itx_list, itx);
 		zilog->zl_itx_list_sz -= itx->itx_sod;
+
 		mutex_exit(&zilog->zl_lock);
+
 		txg = itx->itx_lr.lrc_txg;
 		ASSERT(txg);
 
 		if (txg > spa_last_synced_txg(spa) ||
 		    txg > spa_freeze_txg(spa))
 			lwb = zil_lwb_commit(zilog, itx, lwb);
-		kmem_free(itx, offsetof(itx_t, itx_lr)
-		    + itx->itx_lr.lrc_reclen);
+
+		zil_itx_destroy(itx);
+
 		mutex_enter(&zilog->zl_lock);
 	}
 	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
 	/* determine commit sequence number */
 	itx = list_head(&zilog->zl_itx_list);
 	if (itx)
-		commit_seq = itx->itx_lr.lrc_seq;
+		commit_seq = itx->itx_lr.lrc_seq - 1;
 	else
 		commit_seq = zilog->zl_itx_seq;
 	mutex_exit(&zilog->zl_lock);
@@ -1173,22 +1174,28 @@
 	 */
 	if (zilog->zl_root_zio) {
 		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
-		(void) zio_wait(zilog->zl_root_zio);
+		error = zio_wait(zilog->zl_root_zio);
 		zilog->zl_root_zio = NULL;
 		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
 		zil_flush_vdevs(zilog);
 	}
 
-	if (zilog->zl_log_error || lwb == NULL) {
-		zilog->zl_log_error = 0;
+	if (error || lwb == NULL)
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
-	}
 
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_writer = B_FALSE;
 
 	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
 	zilog->zl_commit_seq = commit_seq;
+
+	/*
+	 * Remember the highest committed log sequence number for ztest.
+	 * We only update this value when all the log writes succeeded,
+	 * because ztest wants to ASSERT that it got the whole log chain.
+	 */
+	if (error == 0 && lwb != NULL)
+		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 }
 
 /*
@@ -1208,7 +1215,7 @@
 
 	while (zilog->zl_writer) {
 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-		if (seq < zilog->zl_commit_seq) {
+		if (seq <= zilog->zl_commit_seq) {
 			mutex_exit(&zilog->zl_lock);
 			return;
 		}
@@ -1220,6 +1227,33 @@
 }
 
 /*
+ * Report whether all transactions are committed.
+ */
+static boolean_t
+zil_is_committed(zilog_t *zilog)
+{
+	lwb_t *lwb;
+	boolean_t committed;
+
+	mutex_enter(&zilog->zl_lock);
+
+	while (zilog->zl_writer)
+		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+
+	if (!list_is_empty(&zilog->zl_itx_list))
+		committed = B_FALSE;		/* unpushed transactions */
+	else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
+		committed = B_TRUE;		/* intent log never used */
+	else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
+		committed = B_FALSE;		/* zil_sync() not done yet */
+	else
+		committed = B_TRUE;		/* everything synced */
+
+	mutex_exit(&zilog->zl_lock);
+	return (committed);
+}
+
+/*
  * Called in syncing context to free committed log blocks and update log header.
  */
 void
@@ -1228,6 +1262,7 @@
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
+	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
@@ -1241,7 +1276,11 @@
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
-	zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
+	if (*replayed_seq != 0) {
+		ASSERT(zh->zh_replay_seq < *replayed_seq);
+		zh->zh_replay_seq = *replayed_seq;
+		*replayed_seq = 0;
+	}
 
 	if (zilog->zl_destroy_txg == txg) {
 		blkptr_t blk = zh->zh_log;
@@ -1270,7 +1309,7 @@
 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
-		zio_free_blk(spa, &lwb->lwb_blk, txg);
+		zio_free_zil(spa, txg, &lwb->lwb_blk);
 		kmem_cache_free(zil_lwb_cache, lwb);
 
 		/*
@@ -1393,7 +1432,7 @@
 	if (!zil_is_committed(zilog)) {
 		uint64_t txg;
 		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
-		(void) dmu_tx_assign(tx, TXG_WAIT);
+		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 		dmu_tx_commit(tx);
@@ -1466,102 +1505,89 @@
 	mutex_exit(&zilog->zl_lock);
 }
 
-/*
- * Read in the data for the dmu_sync()ed block, and change the log
- * record to write this whole block.
- */
-void
-zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
-{
-	blkptr_t *wbp = &lr->lr_blkptr;
-	char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
-	uint64_t blksz;
-
-	if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
-		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
-		/*
-		 * If the blksz is zero then we must be replaying a log
-		 * from an version prior to setting the blksize of null blocks.
-		 * So we just zero the actual write size reqeusted.
-		 */
-		if (blksz == 0) {
-			bzero(wbuf, lr->lr_length);
-			return;
-		}
-		bzero(wbuf, blksz);
-	} else {
-		/*
-		 * A subsequent write may have overwritten this block, in which
-		 * case wbp may have been been freed and reallocated, and our
-		 * read of wbp may fail with a checksum error.  We can safely
-		 * ignore this because the later write will provide the
-		 * correct data.
-		 */
-		zbookmark_t zb;
-
-		zb.zb_objset = dmu_objset_id(zilog->zl_os);
-		zb.zb_object = lr->lr_foid;
-		zb.zb_level = 0;
-		zb.zb_blkid = -1; /* unknown */
-
-		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
-		(void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
-		    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
-	}
-	lr->lr_offset -= lr->lr_offset % blksz;
-	lr->lr_length = blksz;
-}
-
 typedef struct zil_replay_arg {
-	objset_t	*zr_os;
 	zil_replay_func_t **zr_replay;
 	void		*zr_arg;
 	boolean_t	zr_byteswap;
-	char		*zr_lrbuf;
+	char		*zr_lr;
 } zil_replay_arg_t;
 
-static void
+static int
+zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
+{
+	char name[MAXNAMELEN];
+
+	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
+
+	dmu_objset_name(zilog->zl_os, name);
+
+	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+	    (u_longlong_t)lr->lrc_seq,
+	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+	return (error);
+}
+
+static int
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
-	char *name;
-	int pass, error;
+	int error = 0;
 
-	if (!zilog->zl_replay)			/* giving up */
-		return;
+	zilog->zl_replaying_seq = lr->lrc_seq;
+
+	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
+		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
-		return;
-
-	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
-		return;
+		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
-	if (txtype == 0 || txtype >= TX_MAX_TYPE) {
-		error = EINVAL;
-		goto bad;
+	if (txtype == 0 || txtype >= TX_MAX_TYPE)
+		return (zil_replay_error(zilog, lr, EINVAL));
+
+	/*
+	 * If this record type can be logged out of order, the object
+	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
+	 */
+	if (TX_OOO(txtype)) {
+		error = dmu_object_info(zilog->zl_os,
+		    ((lr_ooo_t *)lr)->lr_foid, NULL);
+		if (error == ENOENT || error == EEXIST)
+			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
-	bcopy(lr, zr->zr_lrbuf, reclen);
+	bcopy(lr, zr->zr_lr, reclen);
+
+	/*
+	 * If this is a TX_WRITE with a blkptr, suck in the data.
+	 */
+	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+		error = zil_read_log_data(zilog, (lr_write_t *)lr,
+		    zr->zr_lr + reclen);
+		if (error)
+			return (zil_replay_error(zilog, lr, error));
+	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
-	 * However, the log is a mix of different data types, and only the
+	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
-		byteswap_uint64_array(zr->zr_lrbuf, reclen);
+		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
@@ -1569,42 +1595,30 @@
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
-	for (pass = 1; pass <= 2; pass++) {
-		zilog->zl_replaying_seq = lr->lrc_seq;
-		/* Only byteswap (if needed) on the 1st pass.  */
-		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
-		    zr->zr_byteswap && pass == 1);
-
-		if (!error)
-			return;
-
+	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+	if (error) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
-		 * any removes then retry the transaction.
+		 * any removes then retry the transaction.  Note that we
+		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
-		if (pass == 1)
-			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+		if (error)
+			return (zil_replay_error(zilog, lr, error));
 	}
-
-bad:
-	ASSERT(error);
-	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-	dmu_objset_name(zr->zr_os, name);
-	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
-	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
-	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
-	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
-	zilog->zl_replay = B_FALSE;
-	kmem_free(name, MAXNAMELEN);
+	return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zilog->zl_replay_blks++;
+
+	return (0);
 }
 
 /*
@@ -1622,11 +1636,10 @@
 		return;
 	}
 
-	zr.zr_os = os;
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
-	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
@@ -1638,54 +1651,27 @@
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg);
-	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 }
 
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
-	lwb_t *lwb;
-	int ret;
+	if (zilog == NULL)
+		return (B_TRUE);
 
-	mutex_enter(&zilog->zl_lock);
-	while (zilog->zl_writer)
-		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
-	/* recent unpushed intent log transactions? */
-	if (!list_is_empty(&zilog->zl_itx_list)) {
-		ret = B_FALSE;
-		goto out;
+	if (zilog->zl_replay) {
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+		    zilog->zl_replaying_seq;
+		return (B_TRUE);
 	}
 
-	/* intent log never used? */
-	lwb = list_head(&zilog->zl_lwb_list);
-	if (lwb == NULL) {
-		ret = B_TRUE;
-		goto out;
-	}
-
-	/*
-	 * more than 1 log buffer means zil_sync() hasn't yet freed
-	 * entries after a txg has committed
-	 */
-	if (list_next(&zilog->zl_lwb_list, lwb)) {
-		ret = B_FALSE;
-		goto out;
-	}
-
-	ASSERT(zil_empty(zilog));
-	ret = B_TRUE;
-out:
-	cv_broadcast(&zilog->zl_cv_writer);
-	mutex_exit(&zilog->zl_lock);
-	return (ret);
+	return (B_FALSE);
 }
 
 /* ARGSUSED */
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zio.c
--- a/usr/src/uts/common/fs/zfs/zio.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio.c	Sun Nov 01 14:14:46 2009 -0800
@@ -32,6 +32,9 @@
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
 
 /*
  * ==========================================================================
@@ -59,10 +62,6 @@
 char *zio_type_name[ZIO_TYPES] = {
 	"null", "read", "write", "free", "claim", "ioctl" };
 
-#define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
-#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
-#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
-
 /*
  * ==========================================================================
  * I/O kmem caches
@@ -81,8 +80,13 @@
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
-#define	IO_IS_ALLOCATING(zio) \
-	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
 
 void
 zio_init(void)
@@ -124,12 +128,13 @@
 			char name[36];
 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+			    align, NULL, NULL, NULL, NULL, NULL,
+			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
 
 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
-			    KMC_NODEBUG);
+			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
 		}
 	}
 
@@ -264,7 +269,8 @@
 			zt->zt_transform(zio,
 			    zt->zt_orig_data, zt->zt_orig_size);
 
-		zio_buf_free(zio->io_data, zt->zt_bufsize);
+		if (zt->zt_bufsize != 0)
+			zio_buf_free(zio->io_data, zt->zt_bufsize);
 
 		zio->io_data = zt->zt_orig_data;
 		zio->io_size = zt->zt_orig_size;
@@ -293,7 +299,7 @@
 {
 	if (zio->io_error == 0 &&
 	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
-	    zio->io_data, zio->io_size, data, size) != 0)
+	    zio->io_data, data, zio->io_size, size) != 0)
 		zio->io_error = EIO;
 }
 
@@ -378,6 +384,9 @@
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
+	pio->io_child_count++;
+	cio->io_parent_count++;
+
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
 }
@@ -394,6 +403,9 @@
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
+	pio->io_child_count--;
+	cio->io_parent_count--;
+
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
 
@@ -409,7 +421,7 @@
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
 	if (*countp != 0) {
-		zio->io_stage--;
+		zio->io_stage >>= 1;
 		zio->io_stall = countp;
 		waiting = B_TRUE;
 	}
@@ -451,10 +463,11 @@
  * ==========================================================================
  */
 static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
-    const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
+    zio_type_t type, int priority, enum zio_flag flags,
+    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+    enum zio_stage stage, enum zio_stage pipeline)
 {
 	zio_t *zio;
 
@@ -481,14 +494,17 @@
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
+	else if (flags & ZIO_FLAG_DDT_CHILD)
+		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
-		zio->io_bp = bp;
+		zio->io_bp = (blkptr_t *)bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
-		if (type != ZIO_TYPE_WRITE)
+		if (type != ZIO_TYPE_WRITE ||
+		    zio->io_child_type == ZIO_CHILD_DDT)
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
@@ -498,14 +514,14 @@
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
-	zio->io_data = data;
-	zio->io_size = size;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
+	zio->io_orig_data = zio->io_data = data;
+	zio->io_orig_size = zio->io_size = size;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -539,7 +555,7 @@
 
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
-    void *private, int flags)
+    void *private, enum zio_flag flags)
 {
 	zio_t *zio;
 
@@ -551,7 +567,7 @@
 }
 
 zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	return (zio_null(NULL, spa, NULL, done, private, flags));
 }
@@ -559,33 +575,24 @@
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb)
+    int priority, enum zio_flag flags, const zbookmark_t *zb)
 {
 	zio_t *zio;
 
-	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
+	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 	    data, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
-	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 
 	return (zio);
 }
 
-void
-zio_skip_write(zio_t *zio)
-{
-	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-	ASSERT(zio->io_stage == ZIO_STAGE_READY);
-	ASSERT(!BP_IS_GANG(zio->io_bp));
-
-	zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
-}
-
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, zio_prop_t *zp,
+    void *data, uint64_t size, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb)
+    int priority, enum zio_flag flags, const zbookmark_t *zb)
 {
 	zio_t *zio;
 
@@ -595,13 +602,15 @@
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 	    zp->zp_type < DMU_OT_NUMTYPES &&
 	    zp->zp_level < 32 &&
-	    zp->zp_ndvas > 0 &&
-	    zp->zp_ndvas <= spa_max_replication(spa));
-	ASSERT(ready != NULL);
+	    zp->zp_copies > 0 &&
+	    zp->zp_copies <= spa_max_replication(spa) &&
+	    zp->zp_dedup <= 1 &&
+	    zp->zp_dedup_verify <= 1);
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
-	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
 	zio->io_ready = ready;
 	zio->io_prop = *zp;
@@ -612,7 +621,7 @@
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private, int priority,
-    int flags, zbookmark_t *zb)
+    enum zio_flag flags, zbookmark_t *zb)
 {
 	zio_t *zio;
 
@@ -623,33 +632,44 @@
 	return (zio);
 }
 
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+{
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+	zio->io_prop.zp_copies = copies;
+	zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+	bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+}
+
 zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags)
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    enum zio_flag flags)
 {
 	zio_t *zio;
 
 	ASSERT(!BP_IS_HOLE(bp));
-
-	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
-		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
-
-	if (txg == spa->spa_syncing_txg &&
-	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
-		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
-	}
+	ASSERT(spa_syncing_txg(spa) == txg);
+	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
-	    done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
+	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
 
 	return (zio);
 }
 
 zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags)
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	zio_t *zio;
 
@@ -663,9 +683,11 @@
 	 *
 	 * All claims *must* be resolved in the first txg -- before the SPA
 	 * starts allocating blocks -- so that nothing is allocated twice.
+	 * If txg == 0 we just verify that the block is claimable.
 	 */
 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
-	ASSERT3U(spa_first_txg(spa), <=, txg);
+	ASSERT(txg == spa_first_txg(spa) || txg == 0);
+	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
@@ -676,7 +698,7 @@
 
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, int flags)
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
 {
 	zio_t *zio;
 	int c;
@@ -701,7 +723,7 @@
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, int flags, boolean_t labels)
+    int priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
@@ -722,7 +744,7 @@
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, int flags, boolean_t labels)
+    int priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
@@ -757,10 +779,10 @@
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-	void *data, uint64_t size, int type, int priority, int flags,
+	void *data, uint64_t size, int type, int priority, enum zio_flag flags,
 	zio_done_func_t *done, void *private)
 {
-	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
 
 	ASSERT(vd->vdev_parent ==
@@ -773,26 +795,33 @@
 		 * detection as close to the leaves as possible and
 		 * eliminates redundant checksums in the interior nodes.
 		 */
-		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
-		pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
 	if (vd->vdev_children == 0)
 		offset += VDEV_LABEL_START_SIZE;
 
+	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+
+	/*
+	 * If we've decided to do a repair, the write is not speculative --
+	 * even if the original read was.
+	 */
+	if (flags & ZIO_FLAG_IO_REPAIR)
+		flags &= ~ZIO_FLAG_SPECULATIVE;
+
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
-	    done, private, type, priority,
-	    (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
-	    vd, offset, &pio->io_bookmark,
-	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 
 	return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-	int type, int priority, int flags, zio_done_func_t *done, void *private)
+	int type, int priority, enum zio_flag flags,
+	zio_done_func_t *done, void *private)
 {
 	zio_t *zio;
 
@@ -802,7 +831,7 @@
 	    data, size, done, private, type, priority,
 	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
 	    vd, offset, NULL,
-	    ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
+	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
 	return (zio);
 }
@@ -829,28 +858,30 @@
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
-		uint64_t csize = BP_GET_PSIZE(bp);
-		void *cbuf = zio_buf_alloc(csize);
-
-		zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
+		uint64_t psize = BP_GET_PSIZE(bp);
+		void *cbuf = zio_buf_alloc(psize);
+
+		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 	}
 
 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
+	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_write_bp_init(zio_t *zio)
 {
+	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
-	int compress = zp->zp_compress;
+	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
-	void *cbuf;
 	uint64_t lsize = zio->io_size;
-	uint64_t csize = lsize;
-	uint64_t cbufsize = 0;
+	uint64_t psize = lsize;
 	int pass = 1;
 
 	/*
@@ -864,7 +895,29 @@
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
-	ASSERT(compress != ZIO_COMPRESS_INHERIT);
+	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+	if (zio->io_bp_override) {
+		ASSERT(bp->blk_birth != zio->io_txg);
+		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+		*bp = *zio->io_bp_override;
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+			return (ZIO_PIPELINE_CONTINUE);
+
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
+		    zp->zp_dedup_verify);
+
+		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+			BP_SET_DEDUP(bp, 1);
+			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+		zio->io_bp_override = NULL;
+		BP_ZERO(bp);
+	}
 
 	if (bp->blk_birth == zio->io_txg) {
 		/*
@@ -876,22 +929,29 @@
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
-		pass = spa_sync_pass(zio->io_spa);
+		pass = spa_sync_pass(spa);
+
+		ASSERT(zio->io_txg == spa_syncing_txg(spa));
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass > SYNC_PASS_DONT_COMPRESS)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
-		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
-		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
+		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
 	if (compress != ZIO_COMPRESS_OFF) {
-		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
-		    &cbuf, &csize, &cbufsize)) {
+		void *cbuf = zio_buf_alloc(lsize);
+		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		if (psize == 0 || psize == lsize) {
 			compress = ZIO_COMPRESS_OFF;
-		} else if (csize != 0) {
-			zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
+			zio_buf_free(cbuf, lsize);
+		} else {
+			ASSERT(psize < lsize);
+			zio_push_transform(zio, cbuf, psize, lsize, NULL);
 		}
 	}
 
@@ -903,10 +963,10 @@
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
-	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
 	    pass > SYNC_PASS_REWRITE) {
-		ASSERT(csize != 0);
-		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+		ASSERT(psize != 0);
+		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
@@ -914,17 +974,38 @@
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
-	if (csize == 0) {
+	if (psize == 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
-		BP_SET_PSIZE(bp, csize);
+		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
+		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+		if (zp->zp_dedup) {
+			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+		}
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_free_bp_init(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+		if (BP_GET_DEDUP(bp))
+			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
+		else
+			arc_free(zio->io_spa, bp);
 	}
 
 	if (zio_injection_enabled &&
@@ -1003,7 +1084,7 @@
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
+static zio_pipe_stage_t *zio_pipeline[];
 
 void
 zio_execute(zio_t *zio)
@@ -1011,32 +1092,34 @@
 	zio->io_executor = curthread;
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
-		uint32_t pipeline = zio->io_pipeline;
-		zio_stage_t stage = zio->io_stage;
+		enum zio_stage pipeline = zio->io_pipeline;
+		enum zio_stage stage = zio->io_stage;
 		int rv;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
-
-		while (((1U << ++stage) & pipeline) == 0)
-			continue;
+		ASSERT(ISP2(stage));
+		ASSERT(zio->io_stall == NULL);
+
+		do {
+			stage <<= 1;
+		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
-		ASSERT(zio->io_stall == NULL);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
-		 * issue async to avoid deadlock.
+		 * or may wait for an I/O that needs an interrupt thread
+		 * to complete, issue async to avoid deadlock.
 		 */
-		if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
-		    zio->io_vd == NULL &&
+		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
 			return;
 		}
 
 		zio->io_stage = stage;
-		rv = zio_pipeline[stage](zio);
+		rv = zio_pipeline[highbit(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
 			return;
@@ -1119,19 +1202,8 @@
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
-	if (IO_IS_ALLOCATING(pio)) {
-		/*
-		 * Remember the failed bp so that the io_ready() callback
-		 * can update its accounting upon reexecution.  The block
-		 * was already freed in zio_done(); we indicate this with
-		 * a fill count of -1 so that zio_free() knows to skip it.
-		 */
-		blkptr_t *bp = pio->io_bp;
-		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
-		bp->blk_fill = BLK_FILL_ALREADY_FREED;
-		pio->io_bp_orig = *bp;
-		BP_ZERO(bp);
-	}
+	if (IO_IS_ALLOCATING(pio))
+		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
@@ -1319,6 +1391,12 @@
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
 			    data, BP_GET_PSIZE(bp));
 		}
+		/*
+		 * If we are here to damage data for testing purposes,
+		 * leave the GBH alone so that we can detect the damage.
+		 */
+		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	} else {
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
@@ -1332,8 +1410,8 @@
 zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
-	return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
-	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+	    ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /* ARGSUSED */
@@ -1417,7 +1495,7 @@
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
-	ASSERT(zio_walk_children(zio) == NULL);
+	ASSERT(zio->io_child_count == 0);
 
 	if (zio->io_error)
 		return;
@@ -1523,9 +1601,9 @@
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
-	ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
-	ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
-	ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
@@ -1550,13 +1628,13 @@
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
-	int ndvas = gio->io_prop.zp_ndvas;
-	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+	int copies = gio->io_prop.zp_copies;
+	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
 	zio_prop_t zp;
 	int error;
 
-	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
-	    bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
 	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
 	if (error) {
 		pio->io_error = error;
@@ -1592,7 +1670,9 @@
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
-		zp.zp_ndvas = gio->io_prop.zp_ndvas;
+		zp.zp_copies = gio->io_prop.zp_copies;
+		zp.zp_dedup = 0;
+		zp.zp_dedup_verify = 0;
 
 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1613,15 +1693,380 @@
 
 /*
  * ==========================================================================
+ * Dedup
+ * ==========================================================================
+ */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp;
+	zio_t *pio = zio_unique_parent(zio);
+
+	mutex_enter(&pio->io_lock);
+	ddp = ddt_phys_select(dde, bp);
+	if (zio->io_error == 0)
+		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
+	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+		dde->dde_repair_data = zio->io_data;
+	else
+		zio_buf_free(zio->io_data, zio->io_size);
+	mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_ddt_read_start(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+	if (zio->io_child_error[ZIO_CHILD_DDT]) {
+		ddt_t *ddt = ddt_select(zio->io_spa, bp);
+		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+		ddt_phys_t *ddp = dde->dde_phys;
+		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+		blkptr_t blk;
+
+		ASSERT(zio->io_vsd == NULL);
+		zio->io_vsd = dde;
+
+		if (ddp_self == NULL)
+			return (ZIO_PIPELINE_CONTINUE);
+
+		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+				continue;
+			ddt_bp_create(ddt, &dde->dde_key, ddp, &blk);
+			zio_nowait(zio_read(zio, zio->io_spa, &blk,
+			    zio_buf_alloc(zio->io_size), zio->io_size,
+			    zio_ddt_child_read_done, dde, zio->io_priority,
+			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+			    &zio->io_bookmark));
+		}
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	zio_nowait(zio_read(zio, zio->io_spa, bp,
+	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_ddt_read_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+	if (zio->io_child_error[ZIO_CHILD_DDT]) {
+		ddt_t *ddt = ddt_select(zio->io_spa, bp);
+		ddt_entry_t *dde = zio->io_vsd;
+		if (ddt == NULL) {
+			ASSERT(zio->io_spa->spa_load_state != SPA_LOAD_NONE);
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+		if (dde == NULL) {
+			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+			return (ZIO_PIPELINE_STOP);
+		}
+		if (dde->dde_repair_data != NULL) {
+			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+			zio->io_child_error[ZIO_CHILD_DDT] = 0;
+		}
+		ddt_repair_done(ddt, dde);
+		zio->io_vsd = NULL;
+	}
+
+	ASSERT(zio->io_vsd == NULL);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+	spa_t *spa = zio->io_spa;
+
+	/*
+	 * Note: we compare the original data, not the transformed data,
+	 * because when zio->io_bp is an override bp, we will not have
+	 * pushed the I/O transforms.  That's an important optimization
+	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+	 */
+	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+		zio_t *lio = dde->dde_lead_zio[p];
+
+		if (lio != NULL) {
+			return (lio->io_orig_size != zio->io_orig_size ||
+			    bcmp(zio->io_orig_data, lio->io_orig_data,
+			    zio->io_orig_size) != 0);
+		}
+	}
+
+	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+		ddt_phys_t *ddp = &dde->dde_phys[p];
+
+		if (ddp->ddp_phys_birth != 0) {
+			arc_buf_t *abuf = NULL;
+			uint32_t aflags = ARC_WAIT;
+			blkptr_t blk = *zio->io_bp;
+			int error;
+
+			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+			ddt_exit(ddt);
+
+			error = arc_read_nolock(NULL, spa, &blk,
+			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+			    &aflags, &zio->io_bookmark);
+
+			if (error == 0) {
+				if (arc_buf_size(abuf) != zio->io_orig_size ||
+				    bcmp(abuf->b_data, zio->io_orig_data,
+				    zio->io_orig_size) != 0)
+					error = EEXIST;
+				VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+			}
+
+			ddt_enter(ddt);
+			return (error != 0);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+	int p = zio->io_prop.zp_copies;
+	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+	zio_t *pio;
+
+	if (zio->io_error)
+		return;
+
+	ddt_enter(ddt);
+
+	ASSERT(dde->dde_lead_zio[p] == zio);
+
+	ddt_phys_fill(ddp, zio->io_bp);
+
+	while ((pio = zio_walk_parents(zio)) != NULL)
+		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+	ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+	int p = zio->io_prop.zp_copies;
+	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+
+	ddt_enter(ddt);
+
+	ASSERT(ddp->ddp_refcnt == 0);
+	ASSERT(dde->dde_lead_zio[p] == zio);
+	dde->dde_lead_zio[p] = NULL;
+
+	if (zio->io_error == 0) {
+		while (zio_walk_parents(zio) != NULL)
+			ddt_phys_addref(ddp);
+	} else {
+		ddt_phys_clear(ddp);
+	}
+
+	ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+	int p = DDT_PHYS_DITTO;
+	zio_prop_t *zp = &zio->io_prop;
+	blkptr_t *bp = zio->io_bp;
+	ddt_t *ddt = ddt_select(zio->io_spa, bp);
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+	ddt_key_t *ddk = &dde->dde_key;
+
+	ddt_enter(ddt);
+
+	ASSERT(ddp->ddp_refcnt == 0);
+	ASSERT(dde->dde_lead_zio[p] == zio);
+	dde->dde_lead_zio[p] = NULL;
+
+	if (zio->io_error == 0) {
+		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+		if (ddp->ddp_phys_birth != 0)
+			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+		ddt_phys_fill(ddp, bp);
+	}
+
+	ddt_exit(ddt);
+}
+
+static int
+zio_ddt_write(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t txg = zio->io_txg;
+	zio_prop_t *zp = &zio->io_prop;
+	int p = zp->zp_copies;
+	int ditto_copies;
+	zio_t *cio = NULL;
+	zio_t *dio = NULL;
+	ddt_t *ddt = ddt_select(spa, bp);
+	ddt_entry_t *dde;
+	ddt_phys_t *ddp;
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+	ddt_enter(ddt);
+	dde = ddt_lookup(ddt, bp, B_TRUE);
+	ddp = &dde->dde_phys[p];
+
+	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+		/*
+		 * If we're using a weak checksum, upgrade to a strong checksum
+		 * and try again.  If we're already using a strong checksum,
+		 * we can't resolve it, so just convert to an ordinary write.
+		 * (And automatically e-mail a paper to Nature?)
+		 */
+		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+			zp->zp_checksum = spa_dedup_checksum(spa);
+			zio_pop_transforms(zio);
+			zio->io_stage = ZIO_STAGE_OPEN;
+			BP_ZERO(bp);
+		} else {
+			zp->zp_dedup = 0;
+		}
+		zio->io_pipeline = ZIO_WRITE_PIPELINE;
+		ddt_exit(ddt);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+	if (ditto_copies > ddt_ditto_copies_present(dde) &&
+	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+		zio_prop_t czp = *zp;
+
+		czp.zp_copies = ditto_copies;
+
+		/*
+		 * If we arrived here with an override bp, we won't have run
+		 * the transform stack, so we won't have the data we need to
+		 * generate a child i/o.  So, toss the override bp and restart.
+		 * This is safe, because using the override bp is just an
+		 * optimization; and it's rare, so the cost doesn't matter.
+		 */
+		if (zio->io_bp_override) {
+			zio_pop_transforms(zio);
+			zio->io_stage = ZIO_STAGE_OPEN;
+			zio->io_pipeline = ZIO_WRITE_PIPELINE;
+			zio->io_bp_override = NULL;
+			BP_ZERO(bp);
+			ddt_exit(ddt);
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+		    zio->io_orig_size, &czp, NULL,
+		    zio_ddt_ditto_write_done, dde, zio->io_priority,
+		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+	}
+
+	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+		if (ddp->ddp_phys_birth != 0)
+			ddt_bp_fill(ddp, bp, txg);
+		if (dde->dde_lead_zio[p] != NULL)
+			zio_add_child(zio, dde->dde_lead_zio[p]);
+		else
+			ddt_phys_addref(ddp);
+	} else if (zio->io_bp_override) {
+		ASSERT(bp->blk_birth == txg);
+		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+		ddt_phys_fill(ddp, bp);
+		ddt_phys_addref(ddp);
+	} else {
+		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
+		    zio_ddt_child_write_done, dde, zio->io_priority,
+		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+		dde->dde_lead_zio[p] = cio;
+	}
+
+	ddt_exit(ddt);
+
+	if (cio)
+		zio_nowait(cio);
+	if (dio)
+		zio_nowait(dio);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_ddt_free(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	ddt_t *ddt = ddt_select(spa, bp);
+	ddt_entry_t *dde;
+	ddt_phys_t *ddp;
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+	ddt_enter(ddt);
+	dde = ddt_lookup(ddt, bp, B_TRUE);
+	ddp = ddt_phys_select(dde, bp);
+	ddt_phys_decref(ddp);
+	ddt_exit(ddt);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
-
 static int
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	metaslab_class_t *mc = spa->spa_normal_class;
+	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
@@ -1632,12 +2077,12 @@
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
-	ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
-	ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
+	ASSERT3U(zio->io_prop.zp_copies, >, 0);
+	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
 
 	if (error) {
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -1676,36 +2121,11 @@
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
-	spa_t *spa = zio->io_spa;
-	boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-
 	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-
-	if (zio->io_bp == bp && !now) {
-		/*
-		 * This is a rewrite for sync-to-convergence.
-		 * We can't do a metaslab_free(NOW) because bp wasn't allocated
-		 * during this sync pass, which means that metaslab_sync()
-		 * already committed the allocation.
-		 */
-		ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
-		    BP_IDENTITY(&zio->io_bp_orig)));
-		ASSERT(spa_sync_pass(spa) > 1);
-
-		if (BP_IS_GANG(bp) && gn == NULL) {
-			/*
-			 * This is a gang leader whose gang header(s) we
-			 * couldn't read now, so defer the free until later.
-			 * The block should still be intact because without
-			 * the headers, we'd never even start the rewrite.
-			 */
-			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-			return;
-		}
-	}
+	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp))
-		metaslab_free(spa, bp, bp->blk_birth, now);
+		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
 
 	if (gn != NULL) {
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -1719,17 +2139,19 @@
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t txg, boolean_t use_slog)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
+    uint64_t size, boolean_t use_slog)
 {
 	int error = 1;
 
+	ASSERT(txg > spa_syncing_txg(spa));
+
 	if (use_slog)
-		error = metaslab_alloc(spa, spa->spa_log_class, size,
+		error = metaslab_alloc(spa, spa_log_class(spa), size,
 		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
 
 	if (error)
-		error = metaslab_alloc(spa, spa->spa_normal_class, size,
+		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
 
 	if (error == 0) {
@@ -1739,6 +2161,7 @@
 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(new_bp, 0);
+		BP_SET_DEDUP(new_bp, 0);
 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 	}
 
@@ -1746,15 +2169,15 @@
 }
 
 /*
- * Free an intent log block.  We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
+ * Free an intent log block.
  */
 void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
 {
+	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
 	ASSERT(!BP_IS_GANG(bp));
 
-	metaslab_free(spa, bp, txg, B_FALSE);
+	zio_free(spa, txg, bp);
 }
 
 /*
@@ -1938,7 +2361,7 @@
 		zio->io_error = 0;
 		zio->io_flags |= ZIO_FLAG_IO_RETRY |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
-		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
+		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
 		return (ZIO_PIPELINE_STOP);
 	}
@@ -1971,7 +2394,7 @@
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
-	zio->io_stage--;
+	zio->io_stage >>= 1;
 }
 
 void
@@ -1979,7 +2402,7 @@
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
-	zio->io_stage--;
+	zio->io_stage >>= 1;
 }
 
 void
@@ -1989,7 +2412,7 @@
 	ASSERT(zio->io_error == 0);
 
 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
-	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 }
 
 /*
@@ -2032,10 +2455,11 @@
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t info;
-
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
+	ASSERT(zio->io_vd != NULL);
+
 	if (bp == NULL) {
 		/*
 		 * This is zio_read_phys().
@@ -2065,7 +2489,7 @@
 void
 zio_checksum_verified(zio_t *zio)
 {
-	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
@@ -2105,7 +2529,8 @@
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
 		return (ZIO_PIPELINE_STOP);
 
 	if (zio->io_ready) {
@@ -2139,6 +2564,15 @@
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
 	}
 
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
+		if (BP_IS_GANG(bp)) {
+			zio->io_flags &= ~ZIO_FLAG_NODATA;
+		} else {
+			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+		}
+	}
+
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
@@ -2158,6 +2592,7 @@
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
@@ -2168,23 +2603,51 @@
 	if (bp != NULL) {
 		ASSERT(bp->blk_pad[0] == 0);
 		ASSERT(bp->blk_pad[1] == 0);
-		ASSERT(bp->blk_pad[2] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
 		    (bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
-			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
 		}
 	}
 
 	/*
-	 * If there were child vdev or gang errors, they apply to us now.
+	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+	/*
+	 * If the I/O on the transformed data was successful, generate any
+	 * checksum reports now while we still have the transformed data.
+	 */
+	if (zio->io_error == 0) {
+		while (zio->io_cksum_report != NULL) {
+			zio_cksum_report_t *zcr = zio->io_cksum_report;
+			uint64_t align = zcr->zcr_align;
+			uint64_t asize = P2ROUNDUP(psize, align);
+			char *abuf = zio->io_data;
+
+			if (asize != psize) {
+				abuf = zio_buf_alloc(asize);
+				bcopy(zio->io_data, abuf, psize);
+				bzero(abuf + psize, asize - psize);
+			}
+
+			zio->io_cksum_report = zcr->zcr_next;
+			zcr->zcr_next = NULL;
+			zcr->zcr_finish(zcr, abuf);
+			zfs_ereport_free_checksum(zcr);
+
+			if (asize != psize)
+				zio_buf_free(abuf, asize);
+		}
+	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
@@ -2219,12 +2682,15 @@
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(vd == NULL && bp != NULL);
-
-		if (IO_IS_ALLOCATING(zio))
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+		if (IO_IS_ALLOCATING(zio) &&
+		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
@@ -2253,11 +2719,10 @@
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
-	if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
-	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
-		ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+	if ((zio->io_error || zio->io_reexecute) &&
+	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
 		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
-	}
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
@@ -2335,22 +2800,19 @@
 		return (ZIO_PIPELINE_STOP);
 	}
 
-	ASSERT(zio_walk_children(zio) == NULL);
+	ASSERT(zio->io_child_count == 0);
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
-	/* Report any checksum errors, since the IO is complete */
+	/*
+	 * Report any checksum errors, since the I/O is complete.
+	 */
 	while (zio->io_cksum_report != NULL) {
-		zio_cksum_report_t *rpt = zio->io_cksum_report;
-
-		zio->io_cksum_report = rpt->zcr_next;
-		rpt->zcr_next = NULL;
-
-		/* only pass in our data buffer if we've succeeded. */
-		rpt->zcr_finish(rpt,
-		    (zio->io_error == 0) ? zio->io_data : NULL);
-
-		zfs_ereport_free_checksum(rpt);
+		zio_cksum_report_t *zcr = zio->io_cksum_report;
+		zio->io_cksum_report = zcr->zcr_next;
+		zcr->zcr_next = NULL;
+		zcr->zcr_finish(zcr, NULL);
+		zfs_ereport_free_checksum(zcr);
 	}
 
 	/*
@@ -2389,12 +2851,17 @@
  * I/O pipeline definition
  * ==========================================================================
  */
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
+static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
+	zio_read_bp_init,
+	zio_free_bp_init,
 	zio_issue_async,
-	zio_read_bp_init,
 	zio_write_bp_init,
 	zio_checksum_generate,
+	zio_ddt_read_start,
+	zio_ddt_read_done,
+	zio_ddt_write,
+	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_allocate,
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zio_checksum.c
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Sun Nov 01 14:14:46 2009 -0800
@@ -49,13 +49,13 @@
  *	we want the ability to take advantage of that hardware.
  *
  * Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
  *
  * When writing a block, we always checksum it with the latest-and-greatest
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
  */
 
 /*ARGSUSED*/
@@ -66,19 +66,19 @@
 }
 
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
-	{{NULL,			NULL},			0, 0,	"inherit"},
-	{{NULL,			NULL},			0, 0,	"on"},
-	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1,	"label"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1,	"gang_header"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1,	"zilog"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0,	"fletcher2"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0,	"fletcher4"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0,	"SHA256"},
+	{{NULL,			NULL},			0, 0, 0, "inherit"},
+	{{NULL,			NULL},			0, 0, 0, "on"},
+	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
+	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
+	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
+	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
+	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
+	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
+	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "sha256"},
 };
 
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
 	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
@@ -93,6 +93,29 @@
 	return (child);
 }
 
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+    enum zio_checksum parent)
+{
+	ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+	if (child == ZIO_CHECKSUM_INHERIT)
+		return (parent);
+
+	if (child == ZIO_CHECKSUM_ON)
+		return (spa_dedup_checksum(spa));
+
+	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+	ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+	return (child);
+}
+
 /*
  * Set the external verifier for a gang block based on <vdev, offset, txg>,
  * a tuple which is guaranteed to be unique for the life of the pool.
@@ -101,7 +124,7 @@
 zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
 {
 	dva_t *dva = BP_IDENTITY(bp);
-	uint64_t txg = bp->blk_birth;
+	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 
 	ASSERT(BP_IS_GANG(bp));
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zio_compress.c
--- a/usr/src/uts/common/fs/zfs/zio_compress.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c	Sun Nov 01 14:14:46 2009 -0800
@@ -20,12 +20,10 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/compress.h>
 #include <sys/spa.h>
@@ -51,10 +49,11 @@
 	{gzip_compress,		gzip_decompress,	7,	"gzip-7"},
 	{gzip_compress,		gzip_decompress,	8,	"gzip-8"},
 	{gzip_compress,		gzip_decompress,	9,	"gzip-9"},
+	{zle_compress,		zle_decompress,		64,	"zle"},
 };
 
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
+enum zio_compress
+zio_compress_select(enum zio_compress child, enum zio_compress parent)
 {
 	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
 	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
@@ -69,80 +68,65 @@
 	return (child);
 }
 
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
-    uint64_t *destsizep, uint64_t *destbufsizep)
+size_t
+zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 {
 	uint64_t *word, *word_end;
-	uint64_t ciosize, gapsize, destbufsize;
-	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-	char *dest;
-	uint_t allzero;
+	size_t c_len, d_len, r_len;
+	zio_compress_info_t *ci = &zio_compress_table[c];
 
-	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
-	ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+	ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
 
 	/*
 	 * If the data is all zeroes, we don't even need to allocate
-	 * a block for it.  We indicate this by setting *destsizep = 0.
+	 * a block for it.  We indicate this by returning zero size.
 	 */
-	allzero = 1;
-	word = src;
-	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
-	while (word < word_end) {
-		if (*word++ != 0) {
-			allzero = 0;
+	word_end = (uint64_t *)((char *)src + s_len);
+	for (word = src; word < word_end; word++)
+		if (*word != 0)
 			break;
-		}
-	}
-	if (allzero) {
-		*destp = NULL;
-		*destsizep = 0;
-		*destbufsizep = 0;
-		return (1);
-	}
 
-	if (cpfunc == ZIO_COMPRESS_EMPTY)
+	if (word == word_end)
 		return (0);
 
+	if (c == ZIO_COMPRESS_EMPTY)
+		return (s_len);
+
 	/* Compress at least 12.5% */
-	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
-	if (destbufsize == 0)
-		return (0);
-	dest = zio_buf_alloc(destbufsize);
+	d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
+	if (d_len == 0)
+		return (s_len);
+
+	c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
+
+	if (c_len > d_len)
+		return (s_len);
 
-	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
-	    (size_t)destbufsize, ci->ci_level);
-	if (ciosize > destbufsize) {
-		zio_buf_free(dest, destbufsize);
-		return (0);
+	/*
+	 * Cool.  We compressed at least as much as we were hoping to.
+	 * For both security and repeatability, pad out the last sector.
+	 */
+	r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
+	if (r_len > c_len) {
+		bzero((char *)dst + c_len, r_len - c_len);
+		c_len = r_len;
 	}
 
-	/* Cool.  We compressed at least as much as we were hoping to. */
+	ASSERT3U(c_len, <=, d_len);
+	ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
 
-	/* For security, make sure we don't write random heap crap to disk */
-	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
-	if (gapsize != 0) {
-		bzero(dest + ciosize, gapsize);
-		ciosize += gapsize;
-	}
-
-	ASSERT3U(ciosize, <=, destbufsize);
-	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
-	*destp = dest;
-	*destsizep = ciosize;
-	*destbufsizep = destbufsize;
-
-	return (1);
+	return (c_len);
 }
 
 int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
-	void *dest, uint64_t destsize)
+zio_decompress_data(enum zio_compress c, void *src, void *dst,
+    size_t s_len, size_t d_len)
 {
-	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	zio_compress_info_t *ci = &zio_compress_table[c];
 
-	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+		return (EINVAL);
 
-	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+	return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
 }
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zio_inject.c
--- a/usr/src/uts/common/fs/zfs/zio_inject.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c	Sun Nov 01 14:14:46 2009 -0800
@@ -43,8 +43,8 @@
 #include <sys/arc.h>
 #include <sys/zio_impl.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
 #include <sys/fs/zfs.h>
 
 uint32_t zio_injection_enabled;
@@ -70,8 +70,9 @@
 	/*
 	 * Check for a match against the MOS, which is based on type
 	 */
-	if (zb->zb_objset == 0 && record->zi_objset == 0 &&
-	    record->zi_object == 0) {
+	if (zb->zb_objset == DMU_META_OBJSET &&
+	    record->zi_objset == DMU_META_OBJSET &&
+	    record->zi_object == DMU_META_DNODE_OBJECT) {
 		if (record->zi_type == DMU_OT_NONE ||
 		    type == record->zi_type)
 			return (record->zi_freq == 0 ||
@@ -357,7 +358,7 @@
 			VERIFY(handler->zi_record.zi_timer == 0 ||
 			    handler->zi_record.zi_timer -
 			    handler->zi_record.zi_duration >=
-			    spa->spa_syncing_txg);
+			    spa_syncing_txg(spa));
 		}
 	}
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zle.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/zle.c	Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding.  This is a fast and simple algorithm to eliminate
+ * runs of zeroes.  Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values.  If b >= n then the next (256 - b + 1) bytes are zero.
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	uchar_t *src = s_start;
+	uchar_t *dst = d_start;
+	uchar_t *s_end = src + s_len;
+	uchar_t *d_end = dst + d_len;
+
+	while (src < s_end && dst < d_end - 1) {
+		uchar_t *first = src;
+		uchar_t *len = dst++;
+		if (src[0] == 0) {
+			uchar_t *last = src + (256 - n);
+			while (src < MIN(last, s_end) && src[0] == 0)
+				src++;
+			*len = src - first - 1 + n;
+		} else {
+			uchar_t *last = src + n;
+			if (d_end - dst < n)
+				break;
+			while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+				*dst++ = *src++;
+			if (src[0])
+				*dst++ = *src++;
+			*len = src - first - 1;
+		}
+	}
+	return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	uchar_t *src = s_start;
+	uchar_t *dst = d_start;
+	uchar_t *s_end = src + s_len;
+	uchar_t *d_end = dst + d_len;
+
+	while (src < s_end && dst < d_end) {
+		int len = 1 + *src++;
+		if (len <= n) {
+			while (len-- != 0)
+				*dst++ = *src++;
+		} else {
+			len -= n;
+			while (len-- != 0)
+				*dst++ = 0;
+		}
+	}
+	return (dst == d_end ? 0 : -1);
+}
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/fs/zfs/zvol.c
--- a/usr/src/uts/common/fs/zfs/zvol.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Sun Nov 01 14:14:46 2009 -0800
@@ -246,8 +246,8 @@
 
 /*ARGSUSED*/
 static int
-zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct maparg *ma = arg;
 	zvol_extent_t *ze;
@@ -361,25 +361,32 @@
 {
 	objset_t *os = zv->zv_objset;
 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
-	uint64_t off = lr->lr_offset;
-	uint64_t len = lr->lr_length;
+	uint64_t offset, length;
 	dmu_tx_t *tx;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
-	/* If it's a dmu_sync() block get the data and write the whole block */
-	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
-		zil_get_replay_data(dmu_objset_zil(os), lr);
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	/* If it's a dmu_sync() block, write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+		if (length < blocksize) {
+			offset -= offset % blocksize;
+			length = blocksize;
+		}
+	}
 
 	tx = dmu_tx_create(os);
-	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 	} else {
-		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 		dmu_tx_commit(tx);
 	}
 
@@ -882,14 +889,16 @@
 }
 
 static void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
+zvol_get_done(zgd_t *zgd, int error)
 {
-	zgd_t *zgd = (zgd_t *)vzgd;
-	rl_t *rl = zgd->zgd_rl;
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
 
-	dmu_buf_rele(db, vzgd);
-	zfs_range_unlock(rl);
-	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+	zfs_range_unlock(zgd->zgd_rl);
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
@@ -901,15 +910,20 @@
 {
 	zvol_state_t *zv = arg;
 	objset_t *os = zv->zv_objset;
+	uint64_t object = ZVOL_OBJ;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;	/* length of user data */
+	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
-	rl_t *rl;
 	zgd_t *zgd;
-	uint64_t boff; 			/* block starting offset */
-	int dlen = lr->lr_length;	/* length of user data */
 	int error;
 
-	ASSERT(zio);
-	ASSERT(dlen != 0);
+	ASSERT(zio != NULL);
+	ASSERT(size != 0);
+
+	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_zilog = zv->zv_zilog;
+	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
@@ -918,49 +932,30 @@
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
-	if (buf != NULL) /* immediate write */
-		return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf,
-		    DMU_READ_NO_PREFETCH));
-
-	zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_zilog = zv->zv_zilog;
-	zgd->zgd_bp = &lr->lr_blkptr;
+	if (buf != NULL) {	/* immediate write */
+		error = dmu_read(os, object, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+	} else {
+		size = zv->zv_volblocksize;
+		offset = P2ALIGN(offset, size);
+		error = dmu_buf_hold(os, object, offset, zgd, &db);
+		if (error == 0) {
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
 
-	/*
-	 * Lock the range of the block to ensure that when the data is
-	 * written out and its checksum is being calculated that no other
-	 * thread can change the block.
-	 */
-	boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
-	rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
-	    RL_READER);
-	zgd->zgd_rl = rl;
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
 
-	VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zvol_get_done, zgd);
 
-	error = dmu_sync(zio, db, &lr->lr_blkptr,
-	    lr->lr_common.lrc_txg, zvol_get_done, zgd);
-	if (error == 0) {
-		/*
-		 * dmu_sync() can compress a block of zeros to a null blkptr
-		 * but the block size still needs to be passed through to
-		 * replay.
-		 */
-		BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
-		zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
+			if (error == 0)
+				return (0);
+		}
 	}
 
-	/*
-	 * If we get EINPROGRESS, then we need to wait for a
-	 * write IO initiated by dmu_sync() to complete before
-	 * we can release this dbuf.  We will finish everything
-	 * up in the zvol_get_done() callback.
-	 */
-	if (error == EINPROGRESS)
-		return (0);
-	dmu_buf_rele(db, zgd);
-	zfs_range_unlock(rl);
-	kmem_free(zgd, sizeof (zgd_t));
+	zvol_get_done(zgd, error);
+
 	return (error);
 }
 
@@ -984,12 +979,8 @@
 	if (zil_disable)
 		return;
 
-	if (zilog->zl_replay) {
-		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
-		    zilog->zl_replaying_seq;
+	if (zil_replaying(zilog, tx))
 		return;
-	}
 
 	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 	    ? 0 : zvol_immediate_write_sz;
@@ -1024,8 +1015,7 @@
 		lr = (lr_write_t *)&itx->itx_lr;
 		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
-			kmem_free(itx, offsetof(itx_t, itx_lr) +
-			    itx->itx_lr.lrc_reclen);
+			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			write_state = WR_NEED_COPY;
@@ -1037,7 +1027,7 @@
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = len;
-		lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
+		lr->lr_blkoff = 0;
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zv;
@@ -1791,7 +1781,8 @@
 		dmu_tx_abort(tx);
 		return (error);
 	}
-	(void) dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx);
+	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
+		zv->zv_volblocksize = vbs;
 	dmu_tx_commit(tx);
 
 	return (0);
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/sys/avl.h
--- a/usr/src/uts/common/sys/avl.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/sys/avl.h	Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_AVL_H
 #define	_AVL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * This is a private header file.  Applications should not directly include
  * this file.
@@ -163,7 +161,7 @@
  * node   - node that has the value being looked for
  * where  - position for use with avl_nearest() or avl_insert(), may be NULL
  */
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
 
 /*
  * Insert a node into the tree.
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/common/sys/fs/zfs.h
--- a/usr/src/uts/common/sys/fs/zfs.h	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/sys/fs/zfs.h	Sun Nov 01 14:14:46 2009 -0800
@@ -119,6 +119,7 @@
 	ZFS_PROP_LOGBIAS,
 	ZFS_PROP_UNIQUE,		/* not exposed to the user */
 	ZFS_PROP_OBJSETID,		/* not exposed to the user */
+	ZFS_PROP_DEDUP,
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 
@@ -155,6 +156,8 @@
 	ZPOOL_PROP_FAILUREMODE,
 	ZPOOL_PROP_LISTSNAPS,
 	ZPOOL_PROP_AUTOEXPAND,
+	ZPOOL_PROP_DEDUPDITTO,
+	ZPOOL_PROP_DEDUPRATIO,
 	ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -197,6 +200,7 @@
 boolean_t zfs_prop_userquota(const char *name);
 int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
 int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
 boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
 
 /*
@@ -209,6 +213,7 @@
 boolean_t zpool_prop_readonly(zpool_prop_t);
 int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
 int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
 
 /*
  * Definitions for the Delegation.
@@ -296,14 +301,16 @@
 #define	SPA_VERSION_17			17ULL
 #define	SPA_VERSION_18			18ULL
 #define	SPA_VERSION_19			19ULL
+#define	SPA_VERSION_20			20ULL
+#define	SPA_VERSION_21			21ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_19
-#define	SPA_VERSION_STRING		"19"
+#define	SPA_VERSION			SPA_VERSION_21
+#define	SPA_VERSION_STRING		"21"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -344,6 +351,8 @@
 #define	SPA_VERSION_RAIDZ3		SPA_VERSION_17
 #define	SPA_VERSION_USERREFS		SPA_VERSION_18
 #define	SPA_VERSION_HOLES		SPA_VERSION_19
+#define	SPA_VERSION_ZLE_COMPRESSION	SPA_VERSION_20
+#define	SPA_VERSION_DEDUP		SPA_VERSION_21
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -559,7 +568,6 @@
 	uint64_t	vs_alloc;		/* space allocated	*/
 	uint64_t	vs_space;		/* total capacity	*/
 	uint64_t	vs_dspace;		/* deflated capacity	*/
-	uint64_t	vs_defer;		/* in-core deferred	*/
 	uint64_t	vs_rsize;		/* replaceable dev size */
 	uint64_t	vs_ops[ZIO_TYPES];	/* operation count	*/
 	uint64_t	vs_bytes[ZIO_TYPES];	/* bytes read/written	*/
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/intel/zfs/spa_boot.c
--- a/usr/src/uts/intel/zfs/spa_boot.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/intel/zfs/spa_boot.c	Sun Nov 01 14:14:46 2009 -0800
@@ -20,12 +20,11 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/sunddi.h>
 
diff -r 8aac17999e4d -r e2081f502306 usr/src/uts/sparc/zfs/spa_boot.c
--- a/usr/src/uts/sparc/zfs/spa_boot.c	Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/sparc/zfs/spa_boot.c	Sun Nov 01 14:14:46 2009 -0800
@@ -20,12 +20,11 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/bootconf.h>