changeset 1775:e51e26b432c0

6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
6410700 zdb should support reading raw blocks out of storage pool
6410709 ztest: spa config can change before pool export
author billm
date Mon, 10 Apr 2006 05:03:38 -0700
parents 274a4306dfe0
children 779af7da6661
files usr/src/cmd/mdb/common/modules/zfs/zfs.c usr/src/cmd/zdb/zdb.c usr/src/cmd/zpool/zpool_main.c usr/src/cmd/ztest/ztest.c usr/src/uts/common/fs/zfs/arc.c usr/src/uts/common/fs/zfs/dbuf.c usr/src/uts/common/fs/zfs/dmu.c usr/src/uts/common/fs/zfs/dmu_objset.c usr/src/uts/common/fs/zfs/dsl_pool.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/arc.h usr/src/uts/common/fs/zfs/sys/dmu.h usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/sys/zio_impl.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_mirror.c usr/src/uts/common/fs/zfs/vdev_raidz.c usr/src/uts/common/fs/zfs/vdev_root.c usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/fs/zfs/zio_checksum.c usr/src/uts/common/sys/fs/zfs.h
diffstat 27 files changed, 1177 insertions(+), 525 deletions(-)
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Mon Apr 10 05:03:38 2006 -0700
@@ -437,20 +437,28 @@
 		zct[i].ci_name = local_strdup(buf);
 	}
 
-	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+	/*
+	 * Super-ick warning:  This code is also duplicated in
+	 * cmd/zdb.c .   Yeah, I hate code replication, too.
+	 */
+	for (i = 0; i < BP_GET_NDVAS(&bp); i++) {
 		dva_t *dva = &bp.blk_dva[i];
-		mdb_printf("DVA[%d]: GANG: %-5s  GRID: %2x  ASIZE: %5x  "
-		    "vdev %llu  offset %llx\n",
-		    i,
-		    DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
-		    DVA_GET_GRID(dva),
-		    DVA_GET_ASIZE(dva),
-		    DVA_GET_VDEV(dva),
-		    DVA_GET_OFFSET(dva));
+
+		mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
+		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
+		mdb_printf("DVA[%d]:       GANG: %-5s  GRID:  %04x\t"
+		    "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
+		    DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
+		mdb_printf("DVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", i,
+		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp),
+		    BP_SHOULD_BYTESWAP(&bp) ? "e" : "",
+		    !DVA_GET_GANG(dva) && BP_GET_LEVEL(&bp) != 0 ? "i" : "",
+		    DVA_GET_GANG(dva) ? "g" : "",
+		    BP_GET_COMPRESS(&bp) != 0 ? "d" : "");
 	}
 	mdb_printf("LSIZE:  %-16llx\t\tPSIZE: %llx\n",
 	    BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
-	mdb_printf("ENDIAN: %-6s  TYPE: %s\n",
+	mdb_printf("ENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
 	    BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
 	    doti[BP_GET_TYPE(&bp)].ot_name);
 	mdb_printf("BIRTH:  %-16llx   LEVEL: %-2d\tFILL:  %llx\n",
--- a/usr/src/cmd/zdb/zdb.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Mon Apr 10 05:03:38 2006 -0700
@@ -27,6 +27,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <ctype.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
@@ -84,8 +85,9 @@
 	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
 	    "dataset [object...]\n"
 	    "       %s -C [pool]\n"
-	    "       %s -l dev\n",
-	    cmdname, cmdname, cmdname);
+	    "       %s -l dev\n"
+	    "       %s -R pool:vdev:offset:size[:flags]\n"
+	    cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "	-u uberblock\n");
 	(void) fprintf(stderr, "	-d datasets\n");
@@ -102,6 +104,8 @@
 	(void) fprintf(stderr, "	-U use zpool.cache in /tmp\n");
 	(void) fprintf(stderr, "	-B objset:object:level:blkid -- "
 	    "simulate bad block\n");
+	(void) fprintf(stderr, "        -R read and display block from a "
+	    "device\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
@@ -523,20 +527,41 @@
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 }
 
+static void
+sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+{
+	dva_t *dva = bp->blk_dva;
+	int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+	int i;
+
+	blkbuf[0] = '\0';
+
+	for (i = 0; i < ndvas; i++)
+		(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
+		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+	(void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+	    (u_longlong_t)BP_GET_LSIZE(bp),
+	    (u_longlong_t)BP_GET_PSIZE(bp),
+	    (u_longlong_t)bp->blk_fill,
+	    (u_longlong_t)bp->blk_birth);
+}
+
 /* ARGSUSED */
 static int
 zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
 {
 	zbookmark_t *zb = &bc->bc_bookmark;
 	blkptr_t *bp = &bc->bc_blkptr;
-	dva_t *dva = &bp->blk_dva[0];
 	void *data = bc->bc_data;
 	dnode_phys_t *dnp = bc->bc_dnode;
-	char buffer[300];
+	char blkbuf[BP_SPRINTF_LEN + 80];
 	int l;
 
 	if (bc->bc_errno) {
-		(void) sprintf(buffer,
+		(void) sprintf(blkbuf,
 		    "Error %d reading <%llu, %llu, %lld, %llu>: ",
 		    bc->bc_errno,
 		    (u_longlong_t)zb->zb_objset,
@@ -581,37 +606,28 @@
 		ASSERT3U(fill, ==, bp->blk_fill);
 	}
 
-	(void) sprintf(buffer, "%16llx ",
+	(void) sprintf(blkbuf, "%16llx ",
 	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
 
 	ASSERT(zb->zb_level >= 0);
 
 	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
 		if (l == zb->zb_level) {
-			(void) sprintf(buffer + strlen(buffer), "L%llx",
+			(void) sprintf(blkbuf + strlen(blkbuf), "L%llx",
 			    (u_longlong_t)zb->zb_level);
 		} else {
-			(void) sprintf(buffer + strlen(buffer), " ");
+			(void) sprintf(blkbuf + strlen(blkbuf), " ");
 		}
 	}
 
 out:
 	if (bp->blk_birth == 0) {
-		(void) sprintf(buffer + strlen(buffer), "<hole>");
-		(void) printf("%s\n", buffer);
+		(void) sprintf(blkbuf + strlen(blkbuf), "<hole>");
+		(void) printf("%s\n", blkbuf);
 	} else {
-		// XXBP - Need to print number of active BPs here
-		(void) sprintf(buffer + strlen(buffer),
-		    "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
-		    (u_longlong_t)DVA_GET_VDEV(dva),
-		    (u_longlong_t)DVA_GET_OFFSET(dva),
-		    (u_longlong_t)BP_GET_LSIZE(bp),
-		    (u_longlong_t)BP_GET_PSIZE(bp),
-		    (u_longlong_t)DVA_GET_ASIZE(dva),
-		    (u_longlong_t)bp->blk_fill,
-		    (u_longlong_t)bp->blk_birth);
-
-		(void) printf("%s\n", buffer);
+		sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp,
+		    dump_opt['d'] > 5 ? 1 : 0);
+		(void) printf("%s\n", blkbuf);
 	}
 
 	return (bc->bc_errno ? ERESTART : 0);
@@ -762,18 +778,12 @@
 	(void) printf("\n");
 
 	while (bplist_iterate(&bpl, &itor, bp) == 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+
 		ASSERT(bp->blk_birth != 0);
-		// XXBP - Do we want to see all DVAs, or just one?
-		(void) printf("\tItem %3llu: vdev=%llu off=%llx "
-		    "%llxL/%llxP/%llxA F=%llu B=%llu\n",
-		    (u_longlong_t)itor - 1,
-		    (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]),
-		    (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]),
-		    (u_longlong_t)BP_GET_LSIZE(bp),
-		    (u_longlong_t)BP_GET_PSIZE(bp),
-		    (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]),
-		    (u_longlong_t)bp->blk_fill,
-		    (u_longlong_t)bp->blk_birth);
+		sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+		(void) printf("\tItem %3llu: %s\n",
+		    (u_longlong_t)itor - 1, blkbuf);
 	}
 
 	bplist_close(&bpl);
@@ -1228,45 +1238,73 @@
 static int
 zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
 {
-	dva_t *dva = &bp->blk_dva[0];
-	uint64_t vdev = DVA_GET_VDEV(dva);
-	uint64_t offset = DVA_GET_OFFSET(dva);
-	uint64_t size = DVA_GET_ASIZE(dva);
+	dva_t *dva = bp->blk_dva;
 	vdev_t *vd;
 	metaslab_t *msp;
 	space_map_t *allocmap, *freemap;
 	int error;
+	int d;
+	blkptr_t blk = *bp;
 
-	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
-		return (ENXIO);
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+		uint64_t offset = DVA_GET_OFFSET(&dva[d]);
+		uint64_t size = DVA_GET_ASIZE(&dva[d]);
+
+		if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
+			return (ENXIO);
+
+		if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+			return (ENXIO);
+
+		msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+		allocmap = &msp->ms_allocmap[0];
+		freemap = &msp->ms_freemap[0];
 
-	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
-		return (ENXIO);
+		/* Prepare our copy of the bp in case we need to read GBHs */
+		if (DVA_GET_GANG(&dva[d])) {
+			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+			DVA_SET_ASIZE(&blk.blk_dva[d], size);
+			DVA_SET_GANG(&blk.blk_dva[d], 0);
+		}
+
+		mutex_enter(&msp->ms_lock);
+		if (space_map_contains(freemap, offset, size)) {
+			mutex_exit(&msp->ms_lock);
+			return (EAGAIN);	/* allocated more than once */
+		}
 
-	if (DVA_GET_GANG(dva)) {
+		if (!space_map_contains(allocmap, offset, size)) {
+			mutex_exit(&msp->ms_lock);
+			return (ESTALE);	/* not allocated at all */
+		}
+
+		space_map_remove(allocmap, offset, size);
+		space_map_add(freemap, offset, size);
+
+		mutex_exit(&msp->ms_lock);
+	}
+
+	if (BP_IS_GANG(bp)) {
 		zio_gbh_phys_t gbh;
-		blkptr_t blk = *bp;
 		int g;
 
 		/* LINTED - compile time assert */
 		ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
-		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-		DVA_SET_GANG(&blk.blk_dva[0], 0);
-		DVA_SET_ASIZE(&blk.blk_dva[0], size);
+
 		BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
 		BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
 		BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
-		error = zio_wait(zio_read(NULL, spa, &blk,
-		    &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
-		    ZIO_PRIORITY_SYNC_READ,
+		error = zio_wait(zio_read(NULL, spa, &blk, &gbh,
+		    SPA_GANGBLOCKSIZE, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb));
 		if (error)
 			return (error);
 		if (BP_SHOULD_BYTESWAP(&blk))
 			byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
 		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
-			if (gbh.zg_blkptr[g].blk_birth == 0)
+			if (BP_IS_HOLE(&gbh.zg_blkptr[g]))
 				break;
 			error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb);
 			if (error)
@@ -1274,26 +1312,6 @@
 		}
 	}
 
-	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	allocmap = &msp->ms_allocmap[0];
-	freemap = &msp->ms_freemap[0];
-
-	mutex_enter(&msp->ms_lock);
-	if (space_map_contains(freemap, offset, size)) {
-		mutex_exit(&msp->ms_lock);
-		return (EAGAIN);	/* allocated more than once */
-	}
-
-	if (!space_map_contains(allocmap, offset, size)) {
-		mutex_exit(&msp->ms_lock);
-		return (ESTALE);	/* not allocated at all */
-	}
-
-	space_map_remove(allocmap, offset, size);
-	space_map_add(freemap, offset, size);
-
-	mutex_exit(&msp->ms_lock);
-
 	return (0);
 }
 
@@ -1448,7 +1466,7 @@
 
 	zcb->zcb_readfails = 0;
 
-	ASSERT(bp->blk_birth != 0);
+	ASSERT(!BP_IS_HOLE(bp));
 
 	zdb_count_block(spa, zcb, bp, type);
 
@@ -1511,13 +1529,13 @@
 		    spa->spa_sync_bplist_obj));
 
 		while (bplist_iterate(bpl, &itor, &blk) == 0) {
-			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
 			if (dump_opt['b'] >= 4) {
 				char blkbuf[BP_SPRINTF_LEN];
 				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
 				(void) printf("[%s] %s\n",
 				    "deferred free", blkbuf);
 			}
+			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
 		}
 
 		bplist_close(bpl);
@@ -1703,6 +1721,321 @@
 		exit(rc);
 }
 
+#define	ZDB_FLAG_CHECKSUM	0x0001
+#define	ZDB_FLAG_DECOMPRESS	0x0002
+#define	ZDB_FLAG_BSWAP		0x0004
+#define	ZDB_FLAG_GBH		0x0008
+#define	ZDB_FLAG_INDIRECT	0x0010
+#define	ZDB_FLAG_PHYS		0x0020
+#define	ZDB_FLAG_RAW		0x0040
+#define	ZDB_FLAG_PRINT_BLKPTR	0x0080
+
+int flagbits[256];
+
+static void
+zdb_print_blkptr(blkptr_t *bp, int flags)
+{
+	dva_t *dva = bp->blk_dva;
+	int d;
+
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+	/*
+	 * Super-ick warning:  This code is also duplicated in
+	 * cmd/mdb/common/modules/zfs/zfs.c .  Yeah, I hate code
+	 * replication, too.
+	 */
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		(void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
+		    DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]));
+		(void) printf("\tDVA[%d]:       GANG: %-5s  GRID:  %04llx\t"
+		    "ASIZE: %llx\n", d,
+		    DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
+		    DVA_GET_GRID(&dva[d]), DVA_GET_ASIZE(&dva[d]));
+		(void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d,
+		    DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]),
+		    BP_GET_PSIZE(bp),
+		    BP_SHOULD_BYTESWAP(bp) ? "e" : "",
+		    !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
+		    "i" : "",
+		    DVA_GET_GANG(&dva[d]) ? "g" : "",
+		    BP_GET_COMPRESS(bp) != 0 ? "d" : "");
+	}
+	(void) printf("\tLSIZE:  %-16llx\t\tPSIZE: %llx\n",
+	    BP_GET_LSIZE(bp), BP_GET_PSIZE(bp));
+	(void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
+	    BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
+	    dmu_ot[BP_GET_TYPE(bp)].ot_name);
+	(void) printf("\tBIRTH:  %-16llx   LEVEL: %-2llu\tFILL:  %llx\n",
+	    (u_longlong_t)bp->blk_birth, BP_GET_LEVEL(bp),
+	    (u_longlong_t)bp->blk_fill);
+	(void) printf("\tCKFUNC: %-16s\t\tCOMP:  %s\n",
+	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name);
+	(void) printf("\tCKSUM:  %llx:%llx:%llx:%llx\n",
+	    (u_longlong_t)bp->blk_cksum.zc_word[0],
+	    (u_longlong_t)bp->blk_cksum.zc_word[1],
+	    (u_longlong_t)bp->blk_cksum.zc_word[2],
+	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+	int i;
+
+	for (i = 0; i < nbps; i++)
+		zdb_print_blkptr(&bp[i], flags);
+}
+
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+}
+
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array(buf, size);
+	(void) write(1, buf, size);
+}
+
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+	uint64_t *d = (uint64_t *)buf;
+	int nwords = size / sizeof (uint64_t);
+	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+	int i, j;
+	char *hdr, *c;
+
+
+	if (do_bswap)
+		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
+	else
+		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
+
+	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
+
+	for (i = 0; i < nwords; i += 2) {
+		(void) printf("%06llx:  %016llx  %016llx  ",
+		    (u_longlong_t)(i * sizeof (uint64_t)),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+		c = (char *)&d[i];
+		for (j = 0; j < 2 * sizeof (uint64_t); j++)
+			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
+		(void) printf("\n");
+	}
+}
+
+/*
+ * There are two acceptable formats:
+ *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
+ *	child[.child]*    - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the hierarchy.  For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, char *path)
+{
+	char *s, *p, *q;
+	int i;
+
+	if (vdev == NULL)
+		return (NULL);
+
+	/* First, assume the x.x.x.x format */
+	i = (int)strtoul(path, &s, 10);
+	if (s == path || (s && *s != '.' && *s != '\0'))
+		goto name;
+	if (i < 0 || i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (*s == '\0')
+		return (vdev);
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Read a block from a pool and print it out.  The syntax of the
+ * block descriptor is:
+ *
+ *	pool:vdev_specifier:offset:size[:flags]
+ *
+ *	pool           - The name of the pool you wish to read from
+ *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ *	offset         - offset, in hex, in bytes
+ *	size           - Amount of data to read, in hex, in bytes
+ *	flags          - A string of characters specifying options
+ *		 b: Decode a blkptr at given offset within block
+ *		*c: Calculate and display checksums
+ *		*d: Decompress data before dumping
+ *		 e: Byteswap data before dumping
+ *		*g: Display data as a gang block header
+ *		*i: Display as an indirect block
+ *		 p: Do I/O to physical offset
+ *		 r: Dump raw data to stdout
+ *
+ *              * = not yet implemented
+ */
+static void
+zdb_read_block(char *thing, spa_t **spap)
+{
+	spa_t *spa = *spap;
+	int flags = 0;
+	uint64_t offset = 0, size = 0, blkptr_offset = 0;
+	zio_t *zio;
+	vdev_t *vd;
+	void *buf;
+	char *s, *p, *dup, *spa_name, *vdev, *flagstr;
+	int i, error, zio_flags;
+
+	dup = strdup(thing);
+	s = strtok(dup, ":");
+	spa_name = s ? s : "";
+	s = strtok(NULL, ":");
+	vdev = s ? s : "";
+	s = strtok(NULL, ":");
+	offset = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	size = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	flagstr = s ? s : "";
+
+	s = NULL;
+	if (size == 0)
+		s = "size must not be zero";
+	if (!IS_P2ALIGNED(size, DEV_BSIZE))
+		s = "size must be a multiple of sector size";
+	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+		s = "offset must be a multiple of sector size";
+	if (s) {
+		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
+		free(dup);
+		return;
+	}
+
+	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+		for (i = 0; flagstr[i]; i++) {
+			int bit = flagbits[flagstr[i]];
+
+			if (bit == 0) {
+				(void) printf("***Invalid flag: %c\n",
+				    flagstr[i]);
+				continue;
+			}
+			flags |= bit;
+
+			/* If it's not something with an argument, keep going */
+			if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
+				continue;
+
+			p = &flagstr[i + 1];
+			if (bit == ZDB_FLAG_PRINT_BLKPTR)
+				blkptr_offset = strtoull(p, &p, 16);
+			if (*p != ':' && *p != '\0') {
+				(void) printf("***Invalid flag arg: '%s'\n", s);
+				free(dup);
+				return;
+			}
+		}
+	}
+
+	if (spa == NULL || spa->spa_name == NULL ||
+	    strcmp(spa->spa_name, spa_name)) {
+		if (spa && spa->spa_name)
+			spa_close(spa, (void *)zdb_read_block);
+		error = spa_open(spa_name, spap, (void *)zdb_read_block);
+		if (error)
+			fatal("Failed to open pool '%s': errno = %d\n",
+			    spa_name, error);
+		spa = *spap;
+	}
+
+	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+	if (vd == NULL) {
+		(void) printf("***Invalid vdev: %s\n", vdev);
+		free(dup);
+		return;
+	} else {
+		if (vd->vdev_path)
+			(void) printf("Found vdev: %s\n", vd->vdev_path);
+		else
+			(void) printf("Found vdev type: %s\n",
+			    vd->vdev_ops->vdev_op_type);
+	}
+
+	buf = umem_alloc(size, UMEM_NOFAIL);
+
+	zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK;
+
+	if (flags & ZDB_FLAG_PHYS)
+		zio_flags |= ZIO_FLAG_PHYSICAL;
+
+	zio = zio_root(spa, NULL, NULL, 0);
+	/* XXX todo - cons up a BP so RAID-Z will be happy */
+	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
+	    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+	error = zio_wait(zio);
+
+	if (error) {
+		(void) printf("Read of %s failed, error: %d\n", thing, error);
+		goto out;
+	}
+
+	if (flags & ZDB_FLAG_PRINT_BLKPTR)
+		zdb_print_blkptr((blkptr_t *)(void *)
+		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+	else if (flags & ZDB_FLAG_RAW)
+		zdb_dump_block_raw(buf, size, flags);
+	else if (flags & ZDB_FLAG_INDIRECT)
+		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
+		    flags);
+	else if (flags & ZDB_FLAG_GBH)
+		zdb_dump_gbh(buf, flags);
+	else
+		zdb_dump_block(thing, buf, size, flags);
+
+out:
+	umem_free(buf, size);
+	free(dup);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -1721,7 +2054,7 @@
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
+	while ((c = getopt(argc, argv, "udibcsvCLO:B:UlR")) != -1) {
 		switch (c) {
 		case 'u':
 		case 'd':
@@ -1731,6 +2064,7 @@
 		case 's':
 		case 'C':
 		case 'l':
+		case 'R':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
@@ -1801,7 +2135,7 @@
 	}
 
 	for (c = 0; c < 256; c++) {
-		if (dump_all && c != 'L' && c != 'l')
+		if (dump_all && c != 'L' && c != 'l' && c != 'R')
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
@@ -1823,6 +2157,27 @@
 		return (0);
 	}
 
+	if (dump_opt['R']) {
+		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+		flagbits['c'] = ZDB_FLAG_CHECKSUM;
+		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+		flagbits['e'] = ZDB_FLAG_BSWAP;
+		flagbits['g'] = ZDB_FLAG_GBH;
+		flagbits['i'] = ZDB_FLAG_INDIRECT;
+		flagbits['p'] = ZDB_FLAG_PHYS;
+		flagbits['r'] = ZDB_FLAG_RAW;
+
+		spa = NULL;
+		while (argv[0]) {
+			zdb_read_block(argv[0], &spa);
+			argv++;
+			argc--;
+		}
+		if (spa)
+			spa_close(spa, (void *)zdb_read_block);
+		return (0);
+	}
+
 	if (dump_opt['C'])
 		dump_config(argv[0]);
 
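
Putting the -R pieces together: zdb_read_block() tokenizes the descriptor on ':', opens the pool if it is not already open, resolves the vdev with zdb_vdev_lookup(), reads the raw range via zio_vdev_child_io() under a zio_root(), and then dispatches on the flag bits. Hypothetical invocations (the pool name, offset, and size are made up; offset and size are hex and must be sector-aligned, per the checks in zdb_read_block()):

	# zdb -R tank:0:11a400:200:r > /tmp/block.raw	(raw dump; the "Found vdev" line also lands on stdout)
	# zdb -R tank:0.0:11a400:200:e			(hex dump, byteswapping the data first)
	# zdb -R tank:/dev/dsk/c1t0d0s0:11a400:200:b	(decode a blkptr_t at offset 0 in the block)

The vdev can be named by full path, leaf name, or the dotted child-index form, per the zdb_vdev_lookup() comment. The starred flags (c, d, g, i) are documented as not yet implemented, though g and i already have dispatch paths in zdb_read_block().
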
--- a/usr/src/cmd/zpool/zpool_main.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2783,8 +2783,9 @@
 
 	ret = zpool_upgrade(zhp);
 	if (ret == 0)
-		(void) printf(gettext("Successfully upgraded '%s'\n"),
-		    zpool_get_name(zhp));
+		(void) printf(gettext("Successfully upgraded '%s' "
+		    "from version %llu to version %llu\n"), zpool_get_name(zhp),
+		    (u_longlong_t)version, (u_longlong_t)ZFS_VERSION);
 
 	return (ret != 0);
 }
@@ -2848,8 +2849,10 @@
 		(void) printf(gettext("VER  DESCRIPTION\n"));
 		(void) printf("---  -----------------------------------------"
 		    "---------------\n");
-		(void) printf(gettext(" 1   Initial ZFS version.\n\n"));
-		(void) printf(gettext("For more information on a particular "
+		(void) printf(gettext(" 1   Initial ZFS version.\n"));
+		(void) printf(gettext(" 2   Ditto blocks "
+		    "(replicated metadata)\n"));
+		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
 		    "version/N\n\n");
--- a/usr/src/cmd/ztest/ztest.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2825,9 +2825,6 @@
 	if (error)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
-	ASSERT(spa->spa_config != NULL);
-
-	VERIFY(nvlist_dup(spa->spa_config, &config, 0) == 0);
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
@@ -2836,7 +2833,7 @@
 	/*
 	 * Export it.
 	 */
-	error = spa_export(oldname);
+	error = spa_export(oldname, &config);
 	if (error)
 		fatal(0, "spa_export('%s') = %d", oldname, error);
 
--- a/usr/src/uts/common/fs/zfs/arc.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2186,7 +2186,7 @@
 }
 
 int
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t arc_flags, zbookmark_t *zb)
@@ -2205,7 +2205,7 @@
 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
 	hdr->b_acb = acb;
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
-	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
+	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
 
 	if (arc_flags & ARC_WAIT)
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2029,7 +2029,9 @@
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
-	(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+
+	(void) arc_write(zio, os->os_spa, checksum, compress,
+	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
 	    db->db_blkptr, *data, dbuf_write_done, db,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
 	/*
--- a/usr/src/uts/common/fs/zfs/dmu.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Mon Apr 10 05:03:38 2006 -0700
@@ -82,8 +82,6 @@
 	dmu_buf_impl_t *db;
 	int err;
 
-	/* dataset_verify(dd); */
-
 	err = dnode_hold(os->os, object, FTAG, &dn);
 	if (err)
 		return (err);
@@ -1425,7 +1423,8 @@
 dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
     blkptr_t *bp, uint64_t txg)
 {
-	dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
+	objset_impl_t *osi = os->os;
+	dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool;
 	tx_state_t *tx = &dp->dp_tx;
 	dmu_buf_impl_t *db;
 	blkptr_t *blk;
@@ -1508,7 +1507,7 @@
 		}
 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
 		if (!BP_IS_HOLE(blk)) {
-			(void) arc_free(NULL, os->os->os_spa, txg, blk,
+			(void) arc_free(NULL, osi->os_spa, txg, blk,
 			    NULL, NULL, ARC_WAIT);
 		}
 		kmem_free(blk, sizeof (blkptr_t));
@@ -1520,13 +1519,14 @@
 	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
 	blk->blk_birth = 0; /* mark as invalid */
 
-	zb.zb_objset = os->os->os_dsl_dataset->ds_object;
+	zb.zb_objset = osi->os_dsl_dataset->ds_object;
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
-	err = arc_write(NULL, os->os->os_spa,
-	    zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
-	    zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
+	err = arc_write(NULL, osi->os_spa,
+	    zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum),
+	    zio_compress_select(db->db_dnode->dn_compress, osi->os_compress),
+	    dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type),
 	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
 	ASSERT(err == 0);
@@ -1556,7 +1556,7 @@
 		 * XXX should we be ignoring the return code?
 		 */
 		if (!BP_IS_HOLE(blk)) {
-			(void) arc_free(NULL, os->os->os_spa, txg, blk,
+			(void) arc_free(NULL, osi->os_spa, txg, blk,
 			    NULL, NULL, ARC_WAIT);
 		}
 		kmem_free(blk, sizeof (blkptr_t));
@@ -1625,6 +1625,24 @@
 	dnode_rele(dn, FTAG);
 }
 
+/*
+ * XXX - eventually, this should take into account per-dataset (or
+ *       even per-object?) user requests for higher levels of replication.
+ */
+int
+dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot)
+{
+	int ncopies = 1;
+
+	if (dmu_ot[ot].ot_metadata)
+		ncopies++;
+	if (zb->zb_level != 0)
+		ncopies++;
+	if (zb->zb_objset == 0 && zb->zb_object == 0)
+		ncopies++;
+	return (MIN(ncopies, spa_max_replication(spa)));
+}
+
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
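
dmu_get_replication_level() above is the entire ditto-block policy: start at one copy, add one if the object type is metadata, one more for indirect (level > 0) blocks, one more for the meta-objset itself (zb_objset == 0, zb_object == 0), then clamp to spa_max_replication(). Worked outcomes, assuming spa_max_replication() returns 3 (illustrative):

	plain file data, level 0	->  1 copy
	file indirect block		->  2 copies  (zb_level != 0)
	dnode block, level 0		->  2 copies  (ot_metadata)
	MOS meta-dnode, indirect	->  1+1+1+1 = 4, clamped to 3

On a pool whose version predates ditto blocks, spa_max_replication() returns 1 (see spa_misc.c below), so every block stays single-copy regardless of type.
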
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Apr 10 05:03:38 2006 -0700
@@ -679,7 +679,9 @@
 	zb.zb_level = -1;
 	zb.zb_blkid = 0;
 	err = arc_write(NULL, os->os_spa, os->os_md_checksum,
-	    os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
+	    os->os_md_compress,
+	    dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
+	    tx->tx_txg, &os->os_rootbp, abuf, killer, os,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
 	ASSERT(err == 0);
 	VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Mon Apr 10 05:03:38 2006 -0700
@@ -232,7 +232,7 @@
 	uint64_t space, resv;
 
 	/*
-	 * Reserve about 1% (1/128), or at least 16MB, for allocation
+	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 	 * efficiency.
 	 * XXX The intent log is not accounted for, so it must fit
 	 * within this slop.
@@ -242,7 +242,7 @@
 	 * (e.g. make it possible to rm(1) files from a full pool).
 	 */
 	space = spa_get_space(dp->dp_spa);
-	resv = MAX(space >> 7, SPA_MINDEVSIZE >> 2);
+	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 	if (netfree)
 		resv >>= 1;
 
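
Worked numbers for the new reservation (illustrative): on a 1 TB pool, resv = space >> 6 = 16 GB; on a small pool the floor of SPA_MINDEVSIZE >> 1 = 32 MB (with SPA_MINDEVSIZE at 64 MB) wins; a net-free operation halves either figure. Both the fraction and the floor are exactly double the old 1/128-and-16MB values, presumably to leave headroom for the extra DVAs that ditto blocks now consume.
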
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Mon Apr 10 05:03:38 2006 -0700
@@ -352,14 +352,19 @@
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
-#define	METASLAB_ACTIVE_WEIGHT	(1ULL << 63)
+#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
+#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
+#define	METASLAB_ACTIVE_MASK		\
+	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define	METASLAB_SMO_BONUS_MULTIPLIER	2
 
 static uint64_t
 metaslab_weight(metaslab_t *msp)
 {
+	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo;
-	vdev_t *vd = msp->ms_group->mg_vd;
+	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -387,26 +392,27 @@
 	 * For locality, assign higher weight to metaslabs we've used before.
 	 */
 	if (smo->smo_object != 0)
-		weight *= 2;
-	ASSERT(weight >= space && weight <= 4 * space);
+		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+	ASSERT(weight >= space &&
+	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
 
 	/*
 	 * If this metaslab is one we're actively using, adjust its weight to
 	 * make it preferable to any inactive metaslab so we'll polish it off.
 	 */
-	weight |= (msp->ms_weight & METASLAB_ACTIVE_WEIGHT);
+	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (weight);
 }
 
 static int
-metaslab_activate(metaslab_t *msp)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	space_map_t *sm = &msp->ms_map;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
-	if (msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int error = space_map_load(sm, &metaslab_ff_ops,
 		    SM_FREE, &msp->ms_smo,
 		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
@@ -415,10 +421,10 @@
 			return (error);
 		}
 		metaslab_group_sort(msp->ms_group, msp,
-		    msp->ms_weight | METASLAB_ACTIVE_WEIGHT);
+		    msp->ms_weight | activation_weight);
 	}
 	ASSERT(sm->sm_loaded);
-	ASSERT(msp->ms_weight >= METASLAB_ACTIVE_WEIGHT);
+	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
@@ -426,8 +432,8 @@
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t size)
 {
-	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size - 1));
-	ASSERT(msp->ms_weight < METASLAB_ACTIVE_WEIGHT);
+	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 /*
@@ -571,7 +577,7 @@
 	 * future allocations have synced.  (If we unloaded it now and then
 	 * loaded a moment later, the map wouldn't reflect those allocations.)
 	 */
-	if (sm->sm_loaded && msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int evictable = 1;
 
 		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
@@ -616,7 +622,7 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	error = metaslab_activate(msp);
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 	if (error) {
 		mutex_exit(&msp->ms_lock);
 		return (error);
@@ -633,25 +639,76 @@
 	return (0);
 }
 
-static metaslab_t *
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp,
-	uint64_t txg)
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
+	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
+	uint64_t start = msp->ms_map.sm_start >> ms_shift;
+
+	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+		return (1ULL << 63);
+
+	if (offset < start)
+		return ((start - offset) << ms_shift);
+	if (offset > start)
+		return ((offset - start) << ms_shift);
+	return (0);
+}
+
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
+    uint64_t min_distance, dva_t *dva, int d)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	uint64_t activation_weight;
+	uint64_t target_distance;
+	int i;
+
+	activation_weight = METASLAB_WEIGHT_PRIMARY;
+	for (i = 0; i < d; i++)
+		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+			activation_weight = METASLAB_WEIGHT_SECONDARY;
 
 	for (;;) {
 		mutex_enter(&mg->mg_lock);
-		msp = avl_first(&mg->mg_metaslab_tree);
-		if (msp == NULL || msp->ms_weight < size) {
-			mutex_exit(&mg->mg_lock);
-			return (NULL);
+		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
+			if (msp->ms_weight < size) {
+				mutex_exit(&mg->mg_lock);
+				return (-1ULL);
+			}
+
+			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
+				break;
+
+			target_distance = min_distance +
+			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+
+			for (i = 0; i < d; i++)
+				if (metaslab_distance(msp, &dva[i]) <
+				    target_distance)
+					break;
+			if (i == d)
+				break;
 		}
 		mutex_exit(&mg->mg_lock);
+		if (msp == NULL)
+			return (-1ULL);
 
 		mutex_enter(&msp->ms_lock);
 
-		if (metaslab_activate(msp) != 0) {
+		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+			metaslab_passivate(msp,
+			    (msp->ms_weight & ~METASLAB_ACTIVE_MASK) /
+			    METASLAB_SMO_BONUS_MULTIPLIER);
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -659,7 +716,7 @@
 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
 			break;
 
-		metaslab_passivate(msp, size);
+		metaslab_passivate(msp, size - 1);
 
 		mutex_exit(&msp->ms_lock);
 	}
@@ -671,22 +728,24 @@
 
 	mutex_exit(&msp->ms_lock);
 
-	*offp = offset;
-	return (msp);
+	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  */
-int
-metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
+static int
+metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+    dva_t *hintdva, uint64_t txg)
 {
-	metaslab_t *msp;
 	metaslab_group_t *mg, *rotor;
 	metaslab_class_t *mc;
 	vdev_t *vd;
+	int dshift = 3;
+	int all_zero;
 	uint64_t offset = -1ULL;
 	uint64_t asize;
+	uint64_t distance;
 
 	mc = spa_metaslab_class_select(spa);
 
@@ -695,17 +754,50 @@
 	 * Note that there's no locking on mc_rotor or mc_allocated because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
+	 *
+	 * If we are doing ditto blocks, try to spread them across consecutive
+	 * vdevs.  If we're forced to reuse a vdev before we've allocated
+	 * all of our ditto blocks, then try and spread them out on that
+	 * vdev as much as possible.  If it turns out to not be possible,
+	 * gradually lower our standards until anything becomes acceptable.
+	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+	 * gives us hope of containing our fault domains to something we're
+	 * able to reason about.  Otherwise, any two top-level vdev failures
+	 * will guarantee the loss of data.  With consecutive allocation,
+	 * only two adjacent top-level vdev failures will result in data loss.
+	 *
+	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
+	 * ourselves on the same vdev as our gang block header.  That
+	 * way, we can hope for locality in vdev_cache, plus it makes our
+	 * fault domains something tractable.
 	 */
-	mg = rotor = mc->mc_rotor;
+	if (hintdva) {
+		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+		mg = vd->vdev_mg;
+	} else if (d != 0) {
+		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+		mg = vd->vdev_mg->mg_next;
+	} else {
+		mg = mc->mc_rotor;
+	}
+	rotor = mg;
+
+top:
+	all_zero = B_TRUE;
 	do {
 		vd = mg->mg_vd;
+
+		distance = vd->vdev_asize >> dshift;
+		if (distance <= (1ULL << vd->vdev_ms_shift))
+			distance = 0;
+		else
+			all_zero = B_FALSE;
+
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		msp = metaslab_group_alloc(mg, asize, &offset, txg);
-		if (msp != NULL) {
-			ASSERT(offset != -1ULL);
-
+		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
@@ -740,10 +832,10 @@
 				mc->mc_allocated = 0;
 			}
 
-			DVA_SET_VDEV(dva, vd->vdev_id);
-			DVA_SET_OFFSET(dva, offset);
-			DVA_SET_GANG(dva, 0);
-			DVA_SET_ASIZE(dva, asize);
+			DVA_SET_VDEV(&dva[d], vd->vdev_id);
+			DVA_SET_OFFSET(&dva[d], offset);
+			DVA_SET_GANG(&dva[d], 0);
+			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
@@ -751,13 +843,46 @@
 		mc->mc_allocated = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
-	DVA_SET_VDEV(dva, 0);
-	DVA_SET_OFFSET(dva, 0);
-	DVA_SET_GANG(dva, 0);
+	if (!all_zero) {
+		dshift++;
+		ASSERT(dshift < 64);
+		goto top;
+	}
+
+	bzero(&dva[d], sizeof (dva_t));
 
 	return (ENOSPC);
 }
 
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies,
+    uint64_t txg, blkptr_t *hintbp)
+{
+	int d, error;
+	dva_t *dva = bp->blk_dva;
+	dva_t *hintdva = hintbp->blk_dva;
+
+	ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa));
+	ASSERT(BP_GET_NDVAS(bp) == 0);
+	ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp));
+
+	for (d = 0; d < ncopies; d++) {
+		error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg);
+		if (error) {
+			for (d--; d >= 0; d--) {
+				ASSERT(DVA_IS_VALID(&dva[d]));
+				metaslab_free(spa, &dva[d], txg, B_TRUE);
+				bzero(&dva[d], sizeof (dva_t));
+			}
+			return (ENOSPC);
+		}
+	}
+	ASSERT(error == 0);
+	ASSERT(BP_GET_NDVAS(bp) == ncopies);
+
+	return (0);
+}
+
 /*
  * Free the block represented by DVA in the context of the specified
  * transaction group.
--- a/usr/src/uts/common/fs/zfs/spa.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Apr 10 05:03:38 2006 -0700
@@ -940,10 +940,13 @@
  * configuration from the cache afterwards.
  */
 static int
-spa_export_common(char *pool, int new_state)
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
 {
 	spa_t *spa;
 
+	if (oldconfig)
+		*oldconfig = NULL;
+
 	if (!(spa_mode & FWRITE))
 		return (EROFS);
 
@@ -1011,6 +1014,9 @@
 		spa_deactivate(spa);
 	}
 
+	if (oldconfig && spa->spa_config)
+		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		spa_remove(spa);
 		spa_config_sync();
@@ -1026,16 +1032,16 @@
 int
 spa_destroy(char *pool)
 {
-	return (spa_export_common(pool, POOL_STATE_DESTROYED));
+	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
 }
 
 /*
  * Export a storage pool.
  */
 int
-spa_export(char *pool)
+spa_export(char *pool, nvlist_t **oldconfig)
 {
-	return (spa_export_common(pool, POOL_STATE_EXPORTED));
+	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
 }
 
 /*
@@ -1045,7 +1051,7 @@
 int
 spa_reset(char *pool)
 {
-	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
+	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
 }
 
 
@@ -1497,7 +1503,7 @@
 
 	mutex_enter(&spa->spa_scrub_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-		vdev_t *vd = zio->io_vd;
+		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
 		spa->spa_scrub_errors++;
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_scrub_errors++;
@@ -1535,9 +1541,12 @@
 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
 {
 	blkptr_t *bp = &bc->bc_blkptr;
-	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+	vdev_t *vd = spa->spa_root_vdev;
+	dva_t *dva = bp->blk_dva;
+	int needs_resilver = B_FALSE;
+	int d;
 
-	if (bc->bc_errno || vd == NULL) {
+	if (bc->bc_errno) {
 		/*
 		 * We can't scrub this block, but we can continue to scrub
 		 * the rest of the pool.  Note the error and move along.
@@ -1546,43 +1555,52 @@
 		spa->spa_scrub_errors++;
 		mutex_exit(&spa->spa_scrub_lock);
 
-		if (vd != NULL) {
-			mutex_enter(&vd->vdev_stat_lock);
-			vd->vdev_stat.vs_scrub_errors++;
-			mutex_exit(&vd->vdev_stat_lock);
-		}
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_scrub_errors++;
+		mutex_exit(&vd->vdev_stat_lock);
 
 		return (ERESTART);
 	}
 
 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
 
-	/*
-	 * Keep track of how much data we've examined so that
-	 * zpool(1M) status can make useful progress reports.
-	 */
-	mutex_enter(&vd->vdev_stat_lock);
-	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
-	mutex_exit(&vd->vdev_stat_lock);
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
+
+		ASSERT(vd != NULL);
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
+		mutex_exit(&vd->vdev_stat_lock);
 
-	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
-		if (DVA_GET_GANG(&bp->blk_dva[0])) {
-			/*
-			 * Gang members may be spread across multiple vdevs,
-			 * so the best we can do is look at the pool-wide DTL.
-			 * XXX -- it would be better to change our allocation
-			 * policy to ensure that this can't happen.
-			 */
-			vd = spa->spa_root_vdev;
+		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+			if (DVA_GET_GANG(&dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best we can do is look at the
+				 * pool-wide DTL.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that this can't
+				 * happen.
+				 */
+				vd = spa->spa_root_vdev;
+			}
+			if (vdev_dtl_contains(&vd->vdev_dtl_map,
+			    bp->blk_birth, 1))
+				needs_resilver = B_TRUE;
 		}
-		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
-			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
-			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
-		}
-	} else {
+	}
+
+	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
-	}
+	else if (needs_resilver)
+		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
 
 	return (0);
 }
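
Bookkeeping-wise, the scrub callback now charges vs_scrub_examined per DVA against that DVA's own top-level vdev, and sets needs_resilver if any of those vdevs has the block's birth txg in its DTL. The repair I/O is still issued once per bp; presumably the normal self-healing read path rewrites whichever copies are damaged. Errors with no associated vdev are charged to the root vdev, matching the spa_scrub_io_done() change above.
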
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Apr 10 05:03:38 2006 -0700
@@ -52,60 +52,60 @@
  *
  * spa_namespace_lock (global mutex)
  *
- * 	This lock must be acquired to do any of the following:
+ *	This lock must be acquired to do any of the following:
  *
- * 		- Lookup a spa_t by name
- * 		- Add or remove a spa_t from the namespace
- * 		- Increase spa_refcount from non-zero
- * 		- Check if spa_refcount is zero
- * 		- Rename a spa_t
+ *		- Lookup a spa_t by name
+ *		- Add or remove a spa_t from the namespace
+ *		- Increase spa_refcount from non-zero
+ *		- Check if spa_refcount is zero
+ *		- Rename a spa_t
  *		- add/remove/attach/detach devices
- * 		- Held for the duration of create/destroy/import/export
+ *		- Held for the duration of create/destroy/import/export
  *
- * 	It does not need to handle recursion.  A create or destroy may
- * 	reference objects (files or zvols) in other pools, but by
- * 	definition they must have an existing reference, and will never need
- * 	to lookup a spa_t by name.
+ *	It does not need to handle recursion.  A create or destroy may
+ *	reference objects (files or zvols) in other pools, but by
+ *	definition they must have an existing reference, and will never need
+ *	to lookup a spa_t by name.
  *
  * spa_refcount (per-spa refcount_t protected by mutex)
  *
- * 	This reference count keep track of any active users of the spa_t.  The
- * 	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
- * 	the refcount is never really 'zero' - opening a pool implicitly keeps
- * 	some references in the DMU.  Internally we check against SPA_MINREF, but
- * 	present the image of a zero/non-zero value to consumers.
+ *	This reference count keeps track of any active users of the spa_t.  The
+ *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
+ *	the refcount is never really 'zero' - opening a pool implicitly keeps
+ *	some references in the DMU.  Internally we check against SPA_MINREF, but
+ *	present the image of a zero/non-zero value to consumers.
  *
  * spa_config_lock (per-spa crazy rwlock)
  *
- * 	This SPA special is a recursive rwlock, capable of being acquired from
- * 	asynchronous threads.  It has protects the spa_t from config changes,
- * 	and must be held in the following circumstances:
+ *	This SPA special is a recursive rwlock, capable of being acquired from
+ *	asynchronous threads.  It protects the spa_t from config changes,
+ *	and must be held in the following circumstances:
  *
- * 		- RW_READER to perform I/O to the spa
- * 		- RW_WRITER to change the vdev config
+ *		- RW_READER to perform I/O to the spa
+ *		- RW_WRITER to change the vdev config
  *
  * spa_config_cache_lock (per-spa mutex)
  *
- * 	This mutex prevents the spa_config nvlist from being updated.  No
+ *	This mutex prevents the spa_config nvlist from being updated.  No
  *      other locks are required to obtain this lock, although implicitly you
  *      must have the namespace lock or non-zero refcount to have any kind
  *      of spa_t pointer at all.
  *
  * The locking order is fairly straightforward:
  *
- * 		spa_namespace_lock	->	spa_refcount
+ *		spa_namespace_lock	->	spa_refcount
  *
- * 	The namespace lock must be acquired to increase the refcount from 0
- * 	or to check if it is zero.
+ *	The namespace lock must be acquired to increase the refcount from 0
+ *	or to check if it is zero.
  *
- * 		spa_refcount 		->	spa_config_lock
+ *		spa_refcount		->	spa_config_lock
  *
- * 	There must be at least one valid reference on the spa_t to acquire
- * 	the config lock.
+ *	There must be at least one valid reference on the spa_t to acquire
+ *	the config lock.
  *
- * 		spa_namespace_lock	->	spa_config_lock
+ *		spa_namespace_lock	->	spa_config_lock
  *
- * 	The namespace lock must always be taken before the config lock.
+ *	The namespace lock must always be taken before the config lock.
  *
  *
  * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
@@ -114,53 +114,53 @@
  * The namespace is manipulated using the following functions, all which require
  * the spa_namespace_lock to be held.
  *
- * 	spa_lookup()		Lookup a spa_t by name.
+ *	spa_lookup()		Lookup a spa_t by name.
  *
- * 	spa_add()		Create a new spa_t in the namespace.
+ *	spa_add()		Create a new spa_t in the namespace.
  *
- * 	spa_remove()		Remove a spa_t from the namespace.  This also
- * 				frees up any memory associated with the spa_t.
+ *	spa_remove()		Remove a spa_t from the namespace.  This also
+ *				frees up any memory associated with the spa_t.
  *
- * 	spa_next()		Returns the next spa_t in the system, or the
- * 				first if NULL is passed.
+ *	spa_next()		Returns the next spa_t in the system, or the
+ *				first if NULL is passed.
  *
- * 	spa_evict_all()		Shutdown and remove all spa_t structures in
- * 				the system.
+ *	spa_evict_all()		Shutdown and remove all spa_t structures in
+ *				the system.
  *
  *	spa_guid_exists()	Determine whether a pool/device guid exists.
  *
  * The spa_refcount is manipulated using the following functions:
  *
- * 	spa_open_ref()		Adds a reference to the given spa_t.  Must be
- * 				called with spa_namespace_lock held if the
- * 				refcount is currently zero.
+ *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
+ *				called with spa_namespace_lock held if the
+ *				refcount is currently zero.
  *
- * 	spa_close()		Remove a reference from the spa_t.  This will
- * 				not free the spa_t or remove it from the
- * 				namespace.  No locking is required.
+ *	spa_close()		Remove a reference from the spa_t.  This will
+ *				not free the spa_t or remove it from the
+ *				namespace.  No locking is required.
  *
- * 	spa_refcount_zero()	Returns true if the refcount is currently
- * 				zero.  Must be called with spa_namespace_lock
- * 				held.
+ *	spa_refcount_zero()	Returns true if the refcount is currently
+ *				zero.  Must be called with spa_namespace_lock
+ *				held.
  *
  * The spa_config_lock is manipulated using the following functions:
  *
- * 	spa_config_enter()	Acquire the config lock as RW_READER or
- * 				RW_WRITER.  At least one reference on the spa_t
- * 				must exist.
+ *	spa_config_enter()	Acquire the config lock as RW_READER or
+ *				RW_WRITER.  At least one reference on the spa_t
+ *				must exist.
  *
- * 	spa_config_exit()	Release the config lock.
+ *	spa_config_exit()	Release the config lock.
  *
- * 	spa_config_held()	Returns true if the config lock is currently
- * 				held in the given state.
+ *	spa_config_held()	Returns true if the config lock is currently
+ *				held in the given state.
  *
  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
  *
- * 	spa_vdev_enter()	Acquire the namespace lock and the config lock
+ *	spa_vdev_enter()	Acquire the namespace lock and the config lock
  *				for writing.
  *
- * 	spa_vdev_exit()		Release the config lock, wait for all I/O
- * 				to complete, sync the updated configs to the
+ *	spa_vdev_exit()		Release the config lock, wait for all I/O
+ *				to complete, sync the updated configs to the
  *				cache, and release the namespace lock.
  *
  * The spa_name() function also requires either the spa_namespace_lock
@@ -173,6 +173,7 @@
 kmutex_t spa_namespace_lock;
 static kcondvar_t spa_namespace_cv;
 static int spa_active_count;
+static int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 kmem_cache_t *spa_buffer_pool;
 int spa_mode;
@@ -617,8 +618,7 @@
 void
 sprintf_blkptr(char *buf, int len, blkptr_t *bp)
 {
-	/* XXBP - Need to see if we want all DVAs or not */
-	dva_t *dva = BP_IDENTITY(bp);
+	int d;
 
 	if (bp == NULL) {
 		(void) snprintf(buf, len, "<NULL>");
@@ -630,20 +630,27 @@
 		return;
 	}
 
-	(void) snprintf(buf, len, "[L%llu %s] vdev=%llu offset=%llx "
-	    "size=%llxL/%llxP/%llxA %s %s %s %s "
-	    "birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
+	(void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
 	    (u_longlong_t)BP_GET_LEVEL(bp),
 	    dmu_ot[BP_GET_TYPE(bp)].ot_name,
-	    (u_longlong_t)DVA_GET_VDEV(dva),
-	    (u_longlong_t)DVA_GET_OFFSET(dva),
 	    (u_longlong_t)BP_GET_LSIZE(bp),
-	    (u_longlong_t)BP_GET_PSIZE(bp),
-	    (u_longlong_t)DVA_GET_ASIZE(dva),
+	    (u_longlong_t)BP_GET_PSIZE(bp));
+
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		dva_t *dva = &bp->blk_dva[d];
+		(void) snprintf(buf + strlen(buf), len - strlen(buf),
+		    "DVA[%d]=<%llu:%llx:%llx> ", d,
+		    (u_longlong_t)DVA_GET_VDEV(dva),
+		    (u_longlong_t)DVA_GET_OFFSET(dva),
+		    (u_longlong_t)DVA_GET_ASIZE(dva));
+	}
+
+	(void) snprintf(buf + strlen(buf), len - strlen(buf),
+	    "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
 	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
 	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
 	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
-	    DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang",
+	    BP_IS_GANG(bp) ? "gang" : "contiguous",
 	    (u_longlong_t)bp->blk_birth,
 	    (u_longlong_t)bp->blk_fill,
 	    (u_longlong_t)bp->blk_cksum.zc_word[0],
@@ -796,8 +803,29 @@
 	/*
 	 * For now, the worst case is 512-byte RAID-Z blocks, in which
 	 * case the space requirement is exactly 2x; so just assume that.
+	 * Add to this the fact that we can have up to 3 DVAs per bp, and
+	 * we have to multiply by a total of 6x.
 	 */
-	return (lsize << 1);
+	return (lsize * 6);
+}
+
+uint64_t
+spa_version(spa_t *spa)
+{
+	return (spa->spa_ubsync.ub_version);
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+	/*
+	 * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+	 * handle BPs with more than one DVA allocated.  Set our max
+	 * replication level accordingly.
+	 */
+	if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+		return (1);
+	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
 
 /*
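
Worked numbers for the new spa_get_asize() (illustrative): a 128 KB logical block can now cost up to 128 KB x 2 (the 512-byte RAID-Z worst case) x 3 (SPA_DVAS_PER_BP) = 768 KB of allocated space, hence lsize * 6. The estimate is deliberately pessimal; real usage is governed by dmu_get_replication_level() and by spa_max_replication(), which reports 1 for pools below ZFS_VERSION_DITTO_BLOCKS so that older on-disk formats never grow a second DVA.
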
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Mon Apr 10 05:03:38 2006 -0700
@@ -75,7 +75,7 @@
 int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t arc_flags, zbookmark_t *zb);
-int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t arc_flags, zbookmark_t *zb);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Apr 10 05:03:38 2006 -0700
@@ -56,6 +56,8 @@
 struct dnode;
 struct drr_begin;
 struct drr_end;
+struct zbookmark;
+struct spa;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
@@ -263,6 +265,12 @@
     dmu_tx_t *tx);
 
 /*
+ * Decide how many copies of a given block we should make.  Can be from
+ * 1 to SPA_DVAS_PER_BP.
+ */
+int dmu_get_replication_level(struct spa *spa, struct zbookmark *zb,
+    dmu_object_type_t ot);
+/*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Mon Apr 10 05:03:38 2006 -0700
@@ -47,7 +47,8 @@
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
 
-extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
+extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
+    int ncopies, uint64_t txg, blkptr_t *hintbp);
 extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now);
 extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
 
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Apr 10 05:03:38 2006 -0700
@@ -234,6 +234,16 @@
 	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
+#define	BP_GET_NDVAS(bp)	\
+	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define	BP_COUNT_GANG(bp)	\
+	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+	DVA_GET_GANG(&(bp)->blk_dva[2]))
+
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
@@ -248,10 +258,10 @@
 	(zcp)->zc_word[3] = w3;			\
 }
 
+#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
+#define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
 
-#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
-
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
@@ -281,7 +291,7 @@
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
 
-#define	BP_SPRINTF_LEN	256
+#define	BP_SPRINTF_LEN	320
 
 #include <sys/dmu.h>
 
@@ -297,7 +307,7 @@
 extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
-extern int spa_export(char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig);
 extern int spa_reset(char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
@@ -387,6 +397,8 @@
 extern uint64_t spa_get_alloc(spa_t *spa);
 extern uint64_t spa_get_space(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_version(spa_t *spa);
+extern int spa_max_replication(spa_t *spa);
 extern int spa_busy(void);
 
 /* Miscellaneous support routines */
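
The new BP_GET_NDVAS macro above counts the DVAs actually in use by
double-negating each slot's allocated size: '!!' collapses any nonzero
ASIZE to 1.  A standalone illustration of the trick (the asize values
are made up):

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long long asize[3] = { 0x2000, 0x2000, 0 };
		int ndvas = !!asize[0] + !!asize[1] + !!asize[2];

		printf("%d DVAs in use\n", ndvas);	/* prints 2 */
		return (0);
	}

BP_COUNT_GANG works the same way, except that DVA_GET_GANG already
yields a single bit, so no '!!' is needed.
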
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Apr 10 05:03:38 2006 -0700
@@ -80,6 +80,7 @@
 extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
     boolean_t complete);
 extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Apr 10 05:03:38 2006 -0700
@@ -34,6 +34,7 @@
 #include <sys/avl.h>
 #include <sys/dkio.h>
 #include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -58,9 +59,8 @@
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
-#define	ZIO_GET_DVA(zio)	(&(zio)->io_bp->blk_dva[(zio)->io_dva_index])
 #define	ZIO_GET_IOSIZE(zio)	\
-	(DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \
+	(BP_IS_GANG((zio)->io_bp) ? \
 	SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
 
 typedef struct zio_gbh {
@@ -152,7 +152,6 @@
 
 typedef struct zio zio_t;
 typedef void zio_done_func_t(zio_t *zio);
-typedef struct zio_transform zio_transform_t;
 
 extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
 extern char *zio_type_name[ZIO_TYPES];
@@ -190,9 +189,9 @@
 	zio_t		*io_root;
 	spa_t		*io_spa;
 	zbookmark_t	io_bookmark;
-	int		io_checksum;
-	int		io_compress;
-	int		io_dva_index;
+	enum zio_checksum io_checksum;
+	enum zio_compress io_compress;
+	int		io_ndvas;
 	uint64_t	io_txg;
 	blkptr_t	*io_bp;
 	blkptr_t	io_bp_copy;
@@ -225,8 +224,8 @@
 
 	/* Internal pipeline state */
 	int		io_flags;
-	uint8_t		io_type;
-	uint8_t		io_stage;
+	enum zio_type	io_type;
+	enum zio_stage	io_stage;
 	uint8_t		io_stalled;
 	uint8_t		io_priority;
 	struct dk_callback io_dk_callback;
@@ -257,7 +256,7 @@
     int priority, int flags, zbookmark_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
-    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
     zio_done_func_t *done, void *private, int priority, int flags,
     zbookmark_t *zb);
 
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Mon Apr 10 05:03:38 2006 -0700
@@ -61,9 +61,6 @@
 
 	ZIO_STAGE_READY,			/* RWFCI */
 
-	ZIO_STAGE_DVA_TRANSLATE,		/* RW--- */
-
-	ZIO_STAGE_VDEV_IO_SETUP,		/* RW--I */
 	ZIO_STAGE_VDEV_IO_START,		/* RW--I */
 	ZIO_STAGE_VDEV_IO_DONE,			/* RW--I */
 	ZIO_STAGE_VDEV_IO_ASSESS,		/* RW--I */
@@ -88,8 +85,7 @@
 	(1U << ZIO_STAGE_READ_DECOMPRESS))
 
 #define	ZIO_VDEV_IO_PIPELINE					\
-	((1U << ZIO_STAGE_VDEV_IO_SETUP) |			\
-	(1U << ZIO_STAGE_VDEV_IO_START) |			\
+	((1U << ZIO_STAGE_VDEV_IO_START) |			\
 	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
 	(1U << ZIO_STAGE_VDEV_IO_ASSESS))
 
@@ -103,8 +99,7 @@
 	(1U << ZIO_STAGE_DONE))
 
 #define	ZIO_READ_PIPELINE					\
-	((1U << ZIO_STAGE_DVA_TRANSLATE) |			\
-	ZIO_READ_PHYS_PIPELINE)
+	ZIO_READ_PHYS_PIPELINE
 
 #define	ZIO_WRITE_PHYS_PIPELINE					\
 	((1U << ZIO_STAGE_OPEN) |				\
@@ -116,8 +111,7 @@
 	(1U << ZIO_STAGE_DONE))
 
 #define	ZIO_WRITE_COMMON_PIPELINE				\
-	((1U << ZIO_STAGE_DVA_TRANSLATE) |			\
-	ZIO_WRITE_PHYS_PIPELINE)
+	ZIO_WRITE_PHYS_PIPELINE
 
 #define	ZIO_WRITE_PIPELINE					\
 	((1U << ZIO_STAGE_WRITE_COMPRESS) |			\
@@ -193,6 +187,7 @@
 #define	ZIO_ERROR_PIPELINE_MASK					\
 	ZIO_WAIT_FOR_CHILDREN_PIPELINE
 
+typedef struct zio_transform zio_transform_t;
 struct zio_transform {
 	void		*zt_data;
 	uint64_t	zt_size;
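
With ZIO_STAGE_DVA_TRANSLATE and ZIO_STAGE_VDEV_IO_SETUP gone, every
pipeline definition shrinks by a bit, literally: a pipeline is just a
bitmask with one bit per stage, so adding or removing a stage for a
class of I/O means setting or clearing its bit.  A standalone sketch of
the technique (the stage numbers are invented for the example; only the
bitmask composition mirrors the real ZIO_*_PIPELINE definitions):

	#include <stdio.h>

	enum model_stage {
		MODEL_STAGE_OPEN,
		MODEL_STAGE_VDEV_IO_START,
		MODEL_STAGE_VDEV_IO_DONE,
		MODEL_STAGE_VDEV_IO_ASSESS,
		MODEL_STAGE_DONE
	};

	#define	MODEL_VDEV_IO_PIPELINE			\
		((1U << MODEL_STAGE_VDEV_IO_START) |	\
		(1U << MODEL_STAGE_VDEV_IO_DONE) |	\
		(1U << MODEL_STAGE_VDEV_IO_ASSESS))

	int
	main(void)
	{
		unsigned pipeline = (1U << MODEL_STAGE_OPEN) |
		    MODEL_VDEV_IO_PIPELINE | (1U << MODEL_STAGE_DONE);

		/* dropping a stage is clearing its bit */
		pipeline &= ~(1U << MODEL_STAGE_VDEV_IO_DONE);
		printf("pipeline mask: 0x%x\n", pipeline);
		return (0);
	}
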
--- a/usr/src/uts/common/fs/zfs/vdev.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Apr 10 05:03:38 2006 -0700
@@ -847,31 +847,16 @@
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
-	vdev_t *rvd = spa->spa_root_vdev;
-	int c;
 
 	ASSERT(spa_config_held(spa, RW_WRITER));
 
-	if (vd == rvd) {
-		for (c = 0; c < rvd->vdev_children; c++)
-			vdev_reopen(rvd->vdev_child[c]);
-		return;
-	}
-
-	/* only valid for top-level vdevs */
-	ASSERT3P(vd, ==, vd->vdev_top);
-
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Reassess root vdev's health.
 	 */
-	rvd->vdev_state = VDEV_STATE_HEALTHY;
-	for (c = 0; c < rvd->vdev_children; c++) {
-		uint64_t state = rvd->vdev_child[c]->vdev_state;
-		rvd->vdev_state = MIN(rvd->vdev_state, state);
-	}
+	vdev_propagate_state(spa->spa_root_vdev);
 }
 
 int
@@ -1741,6 +1726,39 @@
 	list_remove(&spa->spa_dirty_list, vd);
 }
 
+void
+vdev_propagate_state(vdev_t *vd)
+{
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	int degraded = 0, faulted = 0;
+	int corrupted = 0;
+	int c;
+	vdev_t *child;
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		child = vd->vdev_child[c];
+		if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+			faulted++;
+		else if (child->vdev_state == VDEV_STATE_DEGRADED)
+			degraded++;
+
+		if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+			corrupted++;
+	}
+
+	vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+	/*
+	 * Root special: if there is a toplevel vdev that cannot be
+	 * opened due to corrupted metadata, then propagate the root
+	 * vdev's aux state as 'corrupt' rather than 'insufficient
+	 * replicas'.
+	 */
+	if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+		vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+}
+
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
@@ -1810,36 +1828,6 @@
 	if (isopen)
 		return;
 
-	if (vd->vdev_parent != NULL) {
-		int c;
-		int degraded = 0, faulted = 0;
-		int corrupted = 0;
-		vdev_t *parent, *child;
-
-		parent = vd->vdev_parent;
-		for (c = 0; c < parent->vdev_children; c++) {
-			child = parent->vdev_child[c];
-			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
-				faulted++;
-			else if (child->vdev_state == VDEV_STATE_DEGRADED)
-				degraded++;
-
-			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
-				corrupted++;
-		}
-
-		vd->vdev_parent->vdev_ops->vdev_op_state_change(
-		    vd->vdev_parent, faulted, degraded);
-
-		/*
-		 * Root special: if this is a toplevel vdev that cannot be
-		 * opened due to corrupted metadata, then propagate the root
-		 * vdev's aux state as 'corrupt' rather than 'insufficient
-		 * replicas'.
-		 */
-		if (corrupted && vd == vd->vdev_top)
-			vdev_set_state(vd->vdev_spa->spa_root_vdev,
-			    B_FALSE, VDEV_STATE_CANT_OPEN,
-			    VDEV_AUX_CORRUPT_DATA);
-	}
+	if (vd->vdev_parent != NULL)
+		vdev_propagate_state(vd->vdev_parent);
 }
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Mon Apr 10 05:03:38 2006 -0700
@@ -35,25 +35,85 @@
  * Virtual device vector for mirroring.
  */
 
+typedef struct mirror_child {
+	vdev_t		*mc_vd;
+	uint64_t	mc_offset;
+	int		mc_error;
+	short		mc_tried;
+	short		mc_skipped;
+} mirror_child_t;
+
 typedef struct mirror_map {
-	int	mm_error;
-	short	mm_tried;
-	short	mm_skipped;
+	int		mm_children;
+	int		mm_replacing;
+	int		mm_preferred;
+	int		mm_root;
+	mirror_child_t	mm_child[1];
 } mirror_map_t;
 
 static mirror_map_t *
 vdev_mirror_map_alloc(zio_t *zio)
 {
-	zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children *
-	    sizeof (mirror_map_t), KM_SLEEP);
-	return (zio->io_vsd);
+	mirror_map_t *mm = NULL;
+	mirror_child_t *mc;
+	vdev_t *vd = zio->io_vd;
+	int c, d;
+
+	if (vd == NULL) {
+		dva_t *dva = zio->io_bp->blk_dva;
+		spa_t *spa = zio->io_spa;
+
+		c = BP_GET_NDVAS(zio->io_bp);
+
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+		mm->mm_children = c;
+		mm->mm_replacing = B_FALSE;
+		mm->mm_preferred = spa_get_random(c);
+		mm->mm_root = B_TRUE;
+
+		/*
+		 * Check the other, lower-index DVAs to see if they're on
+		 * the same vdev as the child we picked.  If one is, prefer
+		 * it, since it was likely allocated from the primary
+		 * metaslab in use at the time, and hence is more likely to
+		 * have locality with single-copy data.
+		 */
+		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
+			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
+				mm->mm_preferred = d;
+		}
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+		}
+	} else {
+		c = vd->vdev_children;
+
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+		mm->mm_children = c;
+		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops);
+		mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c);
+		mm->mm_root = B_FALSE;
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+			mc->mc_vd = vd->vdev_child[c];
+			mc->mc_offset = zio->io_offset;
+		}
+	}
+
+	zio->io_vsd = mm;
+	return (mm);
 }
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
-	kmem_free(zio->io_vsd,
-	    zio->io_vd->vdev_children * sizeof (mirror_map_t));
+	mirror_map_t *mm = zio->io_vsd;
+
+	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
 	zio->io_vsd = NULL;
 }
 
@@ -103,30 +163,31 @@
 static void
 vdev_mirror_child_done(zio_t *zio)
 {
-	mirror_map_t *mm = zio->io_private;
+	mirror_child_t *mc = zio->io_private;
 
-	mm->mm_error = zio->io_error;
-	mm->mm_tried = 1;
-	mm->mm_skipped = 0;
+	mc->mc_error = zio->io_error;
+	mc->mc_tried = 1;
+	mc->mc_skipped = 0;
 }
 
 static void
 vdev_mirror_scrub_done(zio_t *zio)
 {
-	mirror_map_t *mm = zio->io_private;
+	mirror_child_t *mc = zio->io_private;
 
 	if (zio->io_error == 0) {
 		zio_t *pio = zio->io_parent;
 		mutex_enter(&pio->io_lock);
+		ASSERT3U(zio->io_size, >=, pio->io_size);
 		bcopy(zio->io_data, pio->io_data, pio->io_size);
 		mutex_exit(&pio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
 
-	mm->mm_error = zio->io_error;
-	mm->mm_tried = 1;
-	mm->mm_skipped = 0;
+	mc->mc_error = zio->io_error;
+	mc->mc_tried = 1;
+	mc->mc_skipped = 0;
 }
 
 static void
@@ -144,60 +205,42 @@
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	vdev_t *vd = zio->io_vd;
-	vdev_t *cvd;
+	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
 	int i, c;
 
 	ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
 
 	/*
-	 * Select the child we'd like to read from absent any errors.
-	 * The current policy is to alternate sides at 8M granularity.
-	 * XXX -- investigate other policies for read distribution.
-	 */
-	c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
-
-	/*
-	 * If this is a replacing vdev, always try child 0 (the source) first.
-	 */
-	if (vd->vdev_ops == &vdev_replacing_ops)
-		c = 0;
-
-	/*
 	 * Try to find a child whose DTL doesn't contain the block to read.
 	 * If a child is known to be completely inaccessible (indicated by
 	 * vdev_is_dead() returning B_TRUE), don't even try.
 	 */
-	for (i = 0; i < vd->vdev_children; i++, c++) {
-		if (c >= vd->vdev_children)
+	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
+		if (c >= mm->mm_children)
 			c = 0;
-		if (mm[c].mm_tried || mm[c].mm_skipped)
+		mc = &mm->mm_child[c];
+		if (mc->mc_tried || mc->mc_skipped)
 			continue;
-		cvd = vd->vdev_child[c];
-		if (vdev_is_dead(cvd)) {
-			mm[c].mm_error = ENXIO;
-			mm[c].mm_tried = 1;	/* don't even try */
-			mm[c].mm_skipped = 1;
+		if (vdev_is_dead(mc->mc_vd)) {
+			mc->mc_error = ENXIO;
+			mc->mc_tried = 1;	/* don't even try */
+			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
+		if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
 			return (c);
-		mm[c].mm_error = ESTALE;
-		mm[c].mm_skipped = 1;
+		mc->mc_error = ESTALE;
+		mc->mc_skipped = 1;
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
-	 * If we don't have any sibling replicas to consult, look for
-	 * any child we haven't already tried before giving up.
+	 * Look for any child we haven't already tried before giving up.
 	 */
-	if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
-		for (c = 0; c < vd->vdev_children; c++) {
-			if (!mm[c].mm_tried)
-				return (c);
-		}
-	}
+	for (c = 0; c < mm->mm_children; c++)
+		if (!mm->mm_child[c].mc_tried)
+			return (c);
 
 	/*
 	 * Every child failed.  There's no place left to look.
@@ -208,28 +251,28 @@
 static void
 vdev_mirror_io_start(zio_t *zio)
 {
-	vdev_t *vd = zio->io_vd;
 	mirror_map_t *mm;
+	mirror_child_t *mc;
 	int c, children;
 
 	mm = vdev_mirror_map_alloc(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
-		    vd->vdev_ops != &vdev_replacing_ops) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
 			/*
 			 * For scrubbing reads we need to allocate a read
 			 * buffer for each child and issue reads to all
 			 * children.  If any child succeeds, it will copy its
 			 * data into zio->io_data in vdev_mirror_scrub_done.
 			 */
-			for (c = 0; c < vd->vdev_children; c++) {
+			for (c = 0; c < mm->mm_children; c++) {
+				mc = &mm->mm_child[c];
 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
-				    vd->vdev_child[c], zio->io_offset,
+				    mc->mc_vd, mc->mc_offset,
 				    zio_buf_alloc(zio->io_size), zio->io_size,
 				    zio->io_type, zio->io_priority,
-				    ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
-				    &mm[c]));
+				    ZIO_FLAG_CANFAIL,
+				    vdev_mirror_scrub_done, mc));
 			}
 			zio_wait_children_done(zio);
 			return;
@@ -248,23 +291,23 @@
 		 * first child happens to have a DTL entry here as well.
 		 * All other writes go to all children.
 		 */
-		if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
-		    vd->vdev_ops == &vdev_replacing_ops &&
-		    !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
+		if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
+		    !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
 		    zio->io_txg, 1)) {
-			c = vd->vdev_children - 1;
+			c = mm->mm_children - 1;
 			children = 1;
 		} else {
 			c = 0;
-			children = vd->vdev_children;
+			children = mm->mm_children;
 		}
 	}
 
 	while (children--) {
+		mc = &mm->mm_child[c];
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
-		    vd->vdev_child[c], zio->io_offset, zio->io_data,
-		    zio->io_size, zio->io_type, zio->io_priority,
-		    ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
+		    mc->mc_vd, mc->mc_offset,
+		    zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
+		    ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
 		c++;
 	}
 
@@ -274,20 +317,19 @@
 static void
 vdev_mirror_io_done(zio_t *zio)
 {
-	vdev_t *vd = zio->io_vd;
-	vdev_t *cvd;
 	mirror_map_t *mm = zio->io_vsd;
+	mirror_child_t *mc;
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
 
-	ASSERT(mm != NULL);
-
 	zio->io_error = 0;
 	zio->io_numerrors = 0;
 
-	for (c = 0; c < vd->vdev_children; c++) {
-		if (mm[c].mm_tried && mm[c].mm_error == 0) {
+	for (c = 0; c < mm->mm_children; c++) {
+		mc = &mm->mm_child[c];
+
+		if (mc->mc_tried && mc->mc_error == 0) {
 			good_copies++;
 			continue;
 		}
@@ -296,10 +338,10 @@
 		 * We preserve any EIOs because those may be worth retrying;
 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
 		 */
-		if (mm[c].mm_error) {
+		if (mc->mc_error) {
 			if (zio->io_error != EIO)
-				zio->io_error = mm[c].mm_error;
-			if (!mm[c].mm_skipped)
+				zio->io_error = mc->mc_error;
+			if (!mc->mc_skipped)
 				unexpected_errors++;
 			zio->io_numerrors++;
 		}
@@ -308,11 +350,12 @@
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as success.
+		 * XXX -- for a replacing vdev, we need to make sure the
+		 *	  new child succeeds.
 		 */
 		/* XXPOLICY */
 		if (good_copies != 0)
 			zio->io_error = 0;
-		ASSERT(mm != NULL);
 		vdev_mirror_map_free(zio);
 		zio_next_stage(zio);
 		return;
@@ -325,17 +368,16 @@
 	 */
 	/* XXPOLICY */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
-		ASSERT(c >= 0 && c < vd->vdev_children);
-		cvd = vd->vdev_child[c];
-		dprintf("%s: retrying i/o (err=%d) on child %s\n",
-		    vdev_description(zio->io_vd), zio->io_error,
-		    vdev_description(cvd));
+		ASSERT(c >= 0 && c < mm->mm_children);
+		mc = &mm->mm_child[c];
+		dprintf("retrying i/o (err=%d) on child %s\n",
+		    zio->io_error, vdev_description(mc->mc_vd));
 		zio->io_error = 0;
 		zio_vdev_io_redone(zio);
-		zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
-		    zio->io_offset, zio->io_data, zio->io_size,
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
-		    vdev_mirror_child_done, &mm[c]));
+		    vdev_mirror_child_done, mc));
 		zio_wait_children_done(zio);
 		return;
 	}
@@ -360,7 +402,7 @@
 		rio = zio_null(zio, zio->io_spa,
 		    vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
 
-		for (c = 0; c < vd->vdev_children; c++) {
+		for (c = 0; c < mm->mm_children; c++) {
 			/*
 			 * Don't rewrite known good children.
 			 * Not only is it unnecessary, it could
@@ -368,24 +410,23 @@
 			 * power while rewriting the only good copy,
 			 * there would be no good copies left!
 			 */
-			cvd = vd->vdev_child[c];
+			mc = &mm->mm_child[c];
 
-			if (mm[c].mm_error == 0) {
-				if (mm[c].mm_tried)
+			if (mc->mc_error == 0) {
+				if (mc->mc_tried)
 					continue;
-				if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
+				if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
 				    zio->io_txg, 1))
 					continue;
-				mm[c].mm_error = ESTALE;
+				mc->mc_error = ESTALE;
 			}
 
-			dprintf("%s resilvered %s @ 0x%llx error %d\n",
-			    vdev_description(vd),
-			    vdev_description(cvd),
-			    zio->io_offset, mm[c].mm_error);
+			dprintf("resilvered %s @ 0x%llx error %d\n",
+			    vdev_description(mc->mc_vd), mc->mc_offset,
+			    mc->mc_error);
 
-			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd,
-			    zio->io_offset, zio->io_data, zio->io_size,
+			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
+			    mc->mc_offset, zio->io_data, zio->io_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
 			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
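
The rewritten vdev_mirror_map_alloc() above is the heart of ditto-block
reads: when a zio has no explicit vdev (vd == NULL), the block's own
DVAs are treated as an ad hoc mirror.  The preferred-child selection
starts at a random copy and then walks down to the lowest-index DVA on
the same vdev.  A standalone model of just that selection step (vdev
ids invented for the example):

	#include <stdio.h>
	#include <stdlib.h>

	int
	main(void)
	{
		int dva_vdev[3] = { 0, 1, 0 };	/* copies 0 and 2 share vdev 0 */
		int ndvas = 3;
		int c = rand() % ndvas;		/* randomly chosen copy */
		int preferred = c;
		int d;

		/* prefer the lowest-index DVA on the same vdev as copy c */
		for (d = c - 1; d >= 0; d--)
			if (dva_vdev[d] == dva_vdev[c])
				preferred = d;

		printf("read from DVA[%d]\n", preferred);
		return (0);
	}

Lower-index DVAs come from the primary metaslab in use at allocation
time, so reading them keeps ditto reads close to single-copy data.
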
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Mon Apr 10 05:03:38 2006 -0700
@@ -272,12 +272,7 @@
 
 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children);
 
-	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
-		ASSERT3U(rm->rm_asize, ==,
-		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
-	} else {
-		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
-	}
+	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 
@@ -357,11 +352,10 @@
 	vdev_t *cvd;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
-	blkptr_t *bp = zio->io_bp;
 	int unexpected_errors = 0;
 	int c;
 
-	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
+	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
 	zio->io_error = 0;
 	zio->io_numerrors = 0;
--- a/usr/src/uts/common/fs/zfs/vdev_root.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c	Mon Apr 10 05:03:38 2006 -0700
@@ -35,12 +35,29 @@
  * Virtual device vector for the pool's root vdev.
  */
 
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata.  Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc.  Probably not a happy
+ * place to live.  When we get smarter, we can liberalize this policy.
+ * e.g., if we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine.  Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+/*ARGSUSED*/
+static int
+too_many_errors(vdev_t *vd, int numerrors)
+{
+	return (numerrors > 0);
+}
+
 static int
 vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
 	vdev_t *cvd;
 	int c, error;
 	int lasterror = 0;
+	int numerrors = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
@@ -52,17 +69,20 @@
 
 		if ((error = vdev_open(cvd)) != 0) {
 			lasterror = error;
+			numerrors++;
 			continue;
 		}
 	}
 
-	if (lasterror)
+	if (too_many_errors(vd, numerrors)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
 
 	*asize = 0;
 	*ashift = 0;
 
-	return (lasterror);
+	return (0);
 }
 
 static void
@@ -77,7 +97,7 @@
 static void
 vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
 {
-	if (faulted > 0)
+	if (too_many_errors(vd, faulted))
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 	else if (degraded != 0)
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Apr 10 05:03:38 2006 -0700
@@ -392,7 +392,7 @@
 static int
 zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
-	return (spa_export(zc->zc_name));
+	return (spa_export(zc->zc_name, NULL));
 }
 
 static int
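
spa_export() gains an optional out-parameter so a caller can capture the
pool's configuration at the moment of export, after which the spa_t is
gone.  The kernel ioctl path above doesn't need it and passes NULL.  A
sketch of a caller that does ('poolname' is a placeholder):

	nvlist_t *oldconfig = NULL;

	if (spa_export(poolname, &oldconfig) == 0 && oldconfig != NULL) {
		/* a stable snapshot of the config; caller must free it */
		nvlist_free(oldconfig);
	}
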
--- a/usr/src/uts/common/fs/zfs/zio.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Mon Apr 10 05:03:38 2006 -0700
@@ -248,8 +248,6 @@
 		zio->io_bp = bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
-		/* XXBP - Need to inherit this when it matters */
-		zio->io_dva_index = 0;
 	}
 	zio->io_done = done;
 	zio->io_private = private;
@@ -279,6 +277,7 @@
 		if (pio->io_child != NULL)
 			pio->io_child->io_sibling_prev = zio;
 		pio->io_child = zio;
+		zio->io_ndvas = pio->io_ndvas;
 		mutex_exit(&pio->io_lock);
 	}
 
@@ -310,7 +309,6 @@
     int priority, int flags, zbookmark_t *zb)
 {
 	zio_t *zio;
-	dva_t *dva;
 
 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
 
@@ -325,9 +323,6 @@
 	 */
 	zio->io_bp = &zio->io_bp_copy;
 
-	bp = zio->io_bp;
-	dva = ZIO_GET_DVA(zio);
-
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		uint64_t csize = BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(csize);
@@ -336,7 +331,7 @@
 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
 	}
 
-	if (DVA_GET_GANG(dva)) {
+	if (BP_IS_GANG(bp)) {
 		uint64_t gsize = SPA_GANGBLOCKSIZE;
 		void *gbuf = zio_buf_alloc(gsize);
 
@@ -348,7 +343,7 @@
 }
 
 zio_t *
-zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
     zio_done_func_t *done, void *private, int priority, int flags,
     zbookmark_t *zb)
@@ -371,6 +366,7 @@
 
 	zio->io_checksum = checksum;
 	zio->io_compress = compress;
+	zio->io_ndvas = ncopies;
 
 	if (compress != ZIO_COMPRESS_OFF)
 		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
@@ -380,6 +376,10 @@
 		BP_ZERO(bp);
 		BP_SET_LSIZE(bp, size);
 		BP_SET_PSIZE(bp, size);
+	} else {
+		/* Make sure someone doesn't change their mind on overwrites */
+		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
+		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
 	return (zio);
@@ -393,7 +393,6 @@
 {
 	zio_t *zio;
 
-	/* XXBP - We need to re-evaluate when to insert pipeline stages */
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
@@ -402,6 +401,9 @@
 	zio->io_checksum = checksum;
 	zio->io_compress = ZIO_COMPRESS_OFF;
 
+	if (pio != NULL)
+		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+
 	return (zio);
 }
 
@@ -441,7 +443,6 @@
 		return (zio_null(pio, spa, NULL, NULL, 0));
 	}
 
-	/* XXBP - We need to re-evaluate when to insert pipeline stages */
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
@@ -471,7 +472,6 @@
 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 	ASSERT3U(spa_first_txg(spa), <=, txg);
 
-	/* XXBP - We need to re-evaluate when to insert pipeline stages */
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
@@ -623,7 +623,7 @@
 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
 	    done, private, type, priority,
 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
-	    ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);
+	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
 
 	cio->io_vd = vd;
 	cio->io_offset = offset;
@@ -748,8 +748,13 @@
 		ASSERT(bp->blk_pad[2] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
-		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
+			if (zio->io_ndvas != 0)
+				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+			ASSERT(BP_COUNT_GANG(bp) == 0 ||
+			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+		}
 	}
 
 	if (vd != NULL)
@@ -902,6 +907,7 @@
 			BP_ZERO(bp);
 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
 		} else {
+			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_PSIZE(bp, csize);
 			BP_SET_COMPRESS(bp, compress);
@@ -946,7 +952,7 @@
 	 * By default, the pipeline assumes that we're dealing with a gang
 	 * block.  If we're not, strip out any gang-specific stages.
 	 */
-	if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
+	if (!BP_IS_GANG(zio->io_bp))
 		zio->io_pipeline &= ~ZIO_GANG_STAGES;
 
 	zio_next_stage(zio);
@@ -968,7 +974,7 @@
 	uint64_t gsize = SPA_GANGBLOCKSIZE;
 	void *gbuf = zio_buf_alloc(gsize);
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(bp));
 
 	zio_push_transform(zio, gbuf, gsize, gsize);
 
@@ -987,7 +993,7 @@
 	uint64_t gsize, gbufsize, loff, lsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 
 	zio_gang_byteswap(zio);
 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1019,7 +1025,7 @@
 	uint64_t gsize, gbufsize, loff, lsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
 
 	zio_gang_byteswap(zio);
@@ -1054,7 +1060,7 @@
 	uint64_t gsize, gbufsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 
 	zio_gang_byteswap(zio);
 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1079,7 +1085,7 @@
 	uint64_t gsize, gbufsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 
 	zio_gang_byteswap(zio);
 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1100,17 +1106,23 @@
 zio_write_allocate_gang_member_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
-	dva_t *cdva = ZIO_GET_DVA(zio);
-	dva_t *pdva = ZIO_GET_DVA(pio);
+	dva_t *cdva = zio->io_bp->blk_dva;
+	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
-
-	ASSERT(DVA_GET_GANG(pdva));
+	int d;
 
-	/* XXBP - Need to be careful here with multiple DVAs */
+	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
+	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
+	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+
 	mutex_enter(&pio->io_lock);
-	asize = DVA_GET_ASIZE(pdva);
-	asize += DVA_GET_ASIZE(cdva);
-	DVA_SET_ASIZE(pdva, asize);
+	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
+		ASSERT(DVA_GET_GANG(&pdva[d]));
+		asize = DVA_GET_ASIZE(&pdva[d]);
+		asize += DVA_GET_ASIZE(&cdva[d]);
+		DVA_SET_ASIZE(&pdva[d], asize);
+	}
 	mutex_exit(&pio->io_lock);
 }
 
@@ -1118,41 +1130,50 @@
 zio_write_allocate_gang_members(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
+	dva_t *dva = bp->blk_dva;
+	spa_t *spa = zio->io_spa;
 	zio_gbh_phys_t *gbh;
+	uint64_t txg = zio->io_txg;
 	uint64_t resid = zio->io_size;
 	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
 	uint64_t gsize, loff, lsize;
 	uint32_t gbps_left;
+	int ndvas = zio->io_ndvas;
+	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
 	int error;
-	int i;
+	int i, d;
 
 	gsize = SPA_GANGBLOCKSIZE;
 	gbps_left = SPA_GBH_NBLKPTRS;
 
-	error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
+	error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL);
 	if (error == ENOSPC)
 		panic("can't allocate gang block header");
 	ASSERT(error == 0);
 
-	DVA_SET_GANG(dva, 1);
+	for (d = 0; d < gbh_ndvas; d++)
+		DVA_SET_GANG(&dva[d], 1);
 
-	bp->blk_birth = zio->io_txg;
+	bp->blk_birth = txg;
 
 	gbh = zio_buf_alloc(gsize);
 	bzero(gbh, gsize);
 
+	/* For testing, occasionally force multi-level gang blocks */
+	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
+		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
+
 	for (loff = 0, i = 0; loff != zio->io_size;
 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
 		blkptr_t *gbp = &gbh->zg_blkptr[i];
-		dva = &gbp->blk_dva[0];
+		dva = gbp->blk_dva;
 
 		ASSERT(gbps_left != 0);
 		maxalloc = MIN(maxalloc, resid);
 
 		while (resid <= maxalloc * gbps_left) {
-			error = metaslab_alloc(zio->io_spa, maxalloc, dva,
-			    zio->io_txg);
+			error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
+			    txg, bp);
 			if (error == 0)
 				break;
 			ASSERT3U(error, ==, ENOSPC);
@@ -1166,9 +1187,9 @@
 			BP_SET_LSIZE(gbp, lsize);
 			BP_SET_PSIZE(gbp, lsize);
 			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
-			gbp->blk_birth = zio->io_txg;
-			zio_nowait(zio_rewrite(zio, zio->io_spa,
-			    zio->io_checksum, zio->io_txg, gbp,
+			gbp->blk_birth = txg;
+			zio_nowait(zio_rewrite(zio, spa,
+			    zio->io_checksum, txg, gbp,
 			    (char *)zio->io_data + loff, lsize,
 			    zio_write_allocate_gang_member_done, NULL,
 			    zio->io_priority, zio->io_flags,
@@ -1176,8 +1197,8 @@
 		} else {
 			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
 			ASSERT(lsize != SPA_MINBLOCKSIZE);
-			zio_nowait(zio_write_allocate(zio, zio->io_spa,
-			    zio->io_checksum, zio->io_txg, gbp,
+			zio_nowait(zio_write_allocate(zio, spa,
+			    zio->io_checksum, txg, gbp,
 			    (char *)zio->io_data + loff, lsize,
 			    zio_write_allocate_gang_member_done, NULL,
 			    zio->io_priority, zio->io_flags));
@@ -1189,6 +1210,12 @@
 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
 
 	zio_push_transform(zio, gbh, gsize, gsize);
+	/*
+	 * As much as we'd like this to be zio_wait_children_ready(),
+	 * our ASIZE isn't updated until the io_done callback, so we
+	 * have to wait for that to finish in order for our BP to
+	 * be stable.
+	 */
 	zio_wait_children_done(zio);
 }
 
@@ -1201,10 +1228,12 @@
 zio_dva_allocate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
 	int error;
 
 	ASSERT(BP_IS_HOLE(bp));
+	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+	ASSERT3U(zio->io_ndvas, >, 0);
+	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
 
 	/* For testing, make some blocks above a certain size be gang blocks */
 	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
@@ -1214,7 +1243,8 @@
 
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
-	error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);
+	error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
+	    zio->io_txg, NULL);
 
 	if (error == 0) {
 		bp->blk_birth = zio->io_txg;
@@ -1233,11 +1263,13 @@
 zio_dva_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
+	dva_t *dva = bp->blk_dva;
+	int d;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	metaslab_free(zio->io_spa, dva, zio->io_txg, B_FALSE);
+	for (d = 0; d < BP_GET_NDVAS(bp); d++)
+		metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE);
 
 	BP_ZERO(bp);
 
@@ -1248,31 +1280,17 @@
 zio_dva_claim(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
+	dva_t *dva = bp->blk_dva;
+	int error = 0;
+	int d;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);
-
-	zio_next_stage(zio);
-}
-
-static void
-zio_dva_translate(zio_t *zio)
-{
-	spa_t *spa = zio->io_spa;
-	dva_t *dva = ZIO_GET_DVA(zio);
-	uint64_t vdev = DVA_GET_VDEV(dva);
-	uint64_t offset = DVA_GET_OFFSET(dva);
-
-	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));
-
-	zio->io_offset = offset;
-
-	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
-		zio->io_error = ENXIO;
-	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
-		zio->io_error = EOVERFLOW;
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg);
+		if (error)
+			zio->io_error = error;
+	}
 
 	zio_next_stage(zio);
 }
@@ -1284,17 +1302,26 @@
  */
 
 static void
-zio_vdev_io_setup(zio_t *zio)
+zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	vdev_t *tvd = vd->vdev_top;
-	uint64_t align = 1ULL << tvd->vdev_ashift;
+	vdev_t *tvd = vd ? vd->vdev_top : NULL;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t align;
 
-	/* XXPOLICY */
+	if (vd == NULL) {
+		/* The mirror_ops handle multiple DVAs in a single BP */
+		vdev_mirror_ops.vdev_op_io_start(zio);
+		return;
+	}
+
+	align = 1ULL << tvd->vdev_ashift;
+
 	if (zio->io_retries == 0 && vd == tvd)
 		zio->io_flags |= ZIO_FLAG_FAILFAST;
 
-	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+	    vd->vdev_children == 0) {
 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
 		zio->io_offset += VDEV_LABEL_START_SIZE;
 	}
@@ -1312,15 +1339,6 @@
 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
 	}
 
-	zio_next_stage(zio);
-}
-
-static void
-zio_vdev_io_start(zio_t *zio)
-{
-	blkptr_t *bp = zio->io_bp;
-	uint64_t align = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
-
 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
 	ASSERT(P2PHASE(zio->io_size, align) == 0);
 	ASSERT(bp == NULL ||
@@ -1335,7 +1353,11 @@
 static void
 zio_vdev_io_done(zio_t *zio)
 {
-	vdev_io_done(zio);
+	if (zio->io_vd == NULL)
+		/* The mirror_ops handle multiple DVAs in a single BP */
+		vdev_mirror_ops.vdev_op_io_done(zio);
+	else
+		vdev_io_done(zio);
 }
 
 /* XXPOLICY */
@@ -1348,7 +1370,7 @@
 		return (B_FALSE);
 	if (zio->io_delegate_list != NULL)
 		return (B_FALSE);
-	if (vd != vd->vdev_top)
+	if (vd && vd != vd->vdev_top)
 		return (B_FALSE);
 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
 		return (B_FALSE);
@@ -1362,7 +1384,7 @@
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	vdev_t *tvd = vd->vdev_top;
+	vdev_t *tvd = vd ? vd->vdev_top : NULL;
 
 	ASSERT(zio->io_vsd == NULL);
 
@@ -1394,7 +1416,7 @@
 		/* XXPOLICY */
 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
-		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
+		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
 
 		dprintf("retry #%d for %s to %s offset %llx\n",
 		    zio->io_retries, zio_type_name[zio->io_type],
@@ -1404,8 +1426,8 @@
 		return;
 	}
 
-	if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
-	    zio->io_error != ECKSUM) {
+	if (zio->io_error != 0 && zio->io_error != ECKSUM &&
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
 		/*
 		 * Poor man's hotplug support.  Even if we're done retrying this
 		 * I/O, try to reopen the vdev to see if it's still attached.
@@ -1480,8 +1502,8 @@
 	zio_cksum_t zc;
 	zio_gbh_phys_t *gbh = zio->io_data;
 
+	ASSERT(BP_IS_GANG(zio->io_bp));
 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
 
 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
 
@@ -1518,9 +1540,11 @@
 void
 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
 {
-	zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
-	zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
-	zcp->zc_word[2] = zio->io_bp->blk_birth;
+	blkptr_t *bp = zio->io_bp;
+
+	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
+	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
+	zcp->zc_word[2] = bp->blk_birth;
 	zcp->zc_word[3] = 0;
 }
 
@@ -1552,8 +1576,6 @@
 	zio_dva_claim,
 	zio_gang_checksum_generate,
 	zio_ready,
-	zio_dva_translate,
-	zio_vdev_io_setup,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
@@ -1656,7 +1678,7 @@
 
 	BP_ZERO(bp);
 
-	error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);
+	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);
 
 	if (error == 0) {
 		BP_SET_CHECKSUM(bp, checksum);
@@ -1681,7 +1703,7 @@
 void
 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
 {
-	ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);
+	ASSERT(!BP_IS_GANG(bp));
 
 	dprintf_bp(bp, "txg %llu: ", txg);
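
One subtle invariant from the zio.c changes above is worth spelling out:
a gang header carries one more DVA than the data it fronts (gbh_ndvas =
MIN(ndvas + 1, spa_max_replication()) in
zio_write_allocate_gang_members()), so the overwrite assertion in
zio_write() must fold BP_IS_GANG() back in.  A standalone check with
illustrative numbers:

	#include <stdio.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		int max_replication = 3;	/* spa_max_replication() */
		int io_ndvas = 2;		/* copies the writer asked for */
		int bp_is_gang = 1;		/* existing block is a gang block */
		int bp_ndvas = 3;		/* DVAs already in the bp */

		if (MIN(io_ndvas + bp_is_gang, max_replication) == bp_ndvas)
			printf("overwrite matches the existing bp\n");
		return (0);
	}
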
 
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Mon Apr 10 05:03:38 2006 -0700
@@ -122,9 +122,8 @@
 zio_checksum_error(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
 	zio_cksum_t zc = bp->blk_cksum;
-	uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER :
+	uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
 	    BP_GET_CHECKSUM(bp);
 	int byteswap = BP_SHOULD_BYTESWAP(bp);
 	void *data = zio->io_data;
@@ -159,7 +158,7 @@
 		}
 		zc = expected_cksum;
 	} else {
-		ASSERT(!DVA_GET_GANG(dva));
+		ASSERT(!BP_IS_GANG(bp));
 		ci->ci_func[byteswap](data, size, &actual_cksum);
 	}
 
--- a/usr/src/uts/common/sys/fs/zfs.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Mon Apr 10 05:03:38 2006 -0700
@@ -109,7 +109,23 @@
 /*
  * On-disk format version.
  */
-#define	ZFS_VERSION			1ULL
+#define	ZFS_VERSION_1			1ULL
+#define	ZFS_VERSION_2			2ULL
+#define	ZFS_VERSION			ZFS_VERSION_2
+
+/*
+ * Symbolic names for the changes that caused a ZFS_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current ZFS_VERSION in your code, be sure
+ *       to use spa_version() since it reports the version of the
+ *       last synced uberblock.  Checking the in-flight version can
+ *       be dangerous in some cases.
+ */
+#define	ZFS_VERSION_INITIAL		ZFS_VERSION_1
+#define	ZFS_VERSION_DITTO_BLOCKS	ZFS_VERSION_2
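
The NOTE above is the crux of the versioning discipline: gate on-disk
behavior on spa_version(), the last-synced version, never on the
compiled-in ZFS_VERSION.  A sketch of the idiom, using a plausible shape
for the spa_max_replication() declared in spa.h earlier in this
changeset (not necessarily its exact body):

	int
	spa_max_replication(spa_t *spa)
	{
		/*
		 * Pools that predate ditto blocks must never see a
		 * block pointer with more than one DVA.
		 */
		if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
			return (1);
		return (SPA_DVAS_PER_BP);
	}
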
 
 /*
  * The following are configuration names used in the nvlist describing a pool's