changeset 1732:9e3ae798af31

6280668 pluggable block allocation policy
6399301 initial read of space maps is super slow
6407365 large-sector disk support in ZFS
6407366 ADVANCE_NOLOCK gathers MOS
6407367 three-way deadlock between db_mtx, dbuf_hash[], and ms_lock
author bonwick
date Sun, 02 Apr 2006 00:47:06 -0800
parents 1efa8b3d1296
children a7c3bc84e012
files usr/src/cmd/mdb/common/modules/zfs/zfs.c usr/src/cmd/zdb/zdb.c usr/src/cmd/ztest/ztest.c usr/src/uts/common/fs/zfs/dmu_traverse.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/space_map.c usr/src/uts/common/fs/zfs/sys/dmu_traverse.h usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/metaslab_impl.h usr/src/uts/common/fs/zfs/sys/space_map.h usr/src/uts/common/fs/zfs/sys/uberblock_impl.h usr/src/uts/common/fs/zfs/sys/vdev_impl.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_disk.c usr/src/uts/common/fs/zfs/vdev_label.c usr/src/uts/common/fs/zfs/vdev_mirror.c usr/src/uts/common/fs/zfs/vdev_raidz.c usr/src/uts/common/fs/zfs/vdev_root.c usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/fs/zfs/zio_checksum.c
diffstat 22 files changed, 999 insertions(+), 916 deletions(-)
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sun Apr 02 00:47:06 2006 -0800
@@ -405,7 +405,6 @@
 blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
 	blkptr_t bp;
-	dva_t *dva;
 	dmu_object_type_info_t *doti;
 	zio_compress_info_t *zct;
 	zio_checksum_info_t *zci;
@@ -439,17 +438,20 @@
 	}
 
 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
-		dva = &bp.blk_dva[i];
-		mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
-		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
-		mdb_printf("DVA[%d]:                    GRID:  %04x\t"
-		    "ASIZE: %llx\n", i, DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
+		dva_t *dva = &bp.blk_dva[i];
+		mdb_printf("DVA[%d]: GANG: %-5s  GRID: %2x  ASIZE: %5x  "
+		    "vdev %llu  offset %llx\n",
+		    i,
+		    DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
+		    DVA_GET_GRID(dva),
+		    DVA_GET_ASIZE(dva),
+		    DVA_GET_VDEV(dva),
+		    DVA_GET_OFFSET(dva));
 	}
 	mdb_printf("LSIZE:  %-16llx\t\tPSIZE: %llx\n",
 	    BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
-	mdb_printf("ENDIAN: %6s             GANG:  %-5s\tTYPE:  %s\n",
+	mdb_printf("ENDIAN: %-6s  TYPE: %s\n",
 	    BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
-	    DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
 	    doti[BP_GET_TYPE(&bp)].ot_name);
 	mdb_printf("BIRTH:  %-16llx   LEVEL: %-2d\tFILL:  %llx\n",
 	    bp.blk_birth, BP_GET_LEVEL(&bp), bp.blk_fill);
@@ -1146,7 +1148,7 @@
 	space_map_t ms_allocmap[TXG_SIZE];
 	space_map_t ms_freemap[TXG_SIZE];
 	space_map_t ms_map;
-	uint64_t ms_usable_space;
+	space_map_obj_t ms_smo;
 } mdb_metaslab_t;
 
 /*
@@ -1170,7 +1172,7 @@
 	uint64_t ms_allocmap[TXG_SIZE] = {0, 0, 0, 0};
 	uint64_t ms_freemap[TXG_SIZE] = {0, 0, 0, 0};
 	uint64_t ms_map = 0;
-	uint64_t ms_usable_space = 0;
+	uint64_t avail = 0;
 	int i, j;
 	int havecompressed = TRUE;
 	int shift = 20;
@@ -1282,7 +1284,7 @@
 			    GETMEMB(vdev_ms[j], struct metaslab,
 			    ms_map, ms.ms_map) ||
 			    GETMEMB(vdev_ms[j], struct metaslab,
-			    ms_usable_space, ms.ms_usable_space)) {
+			    ms_smo, ms.ms_smo)) {
 				return (DCMD_ERR);
 			}
 
@@ -1295,7 +1297,7 @@
 			ms_freemap[2] += ms.ms_freemap[2].sm_space;
 			ms_freemap[3] += ms.ms_freemap[3].sm_space;
 			ms_map += ms.ms_map.sm_space;
-			ms_usable_space += ms.ms_usable_space;
+			avail += ms.ms_map.sm_size - ms.ms_smo.smo_alloc;
 		}
 	}
 
@@ -1310,8 +1312,7 @@
 	    ms_freemap[2] >> shift, suffix,
 	    ms_freemap[3] >> shift, suffix);
 	mdb_printf("ms_map = %llu%s\n", ms_map >> shift, suffix);
-	mdb_printf("ms_usable_space = %llu%s\n",
-	    ms_usable_space >> shift, suffix);
+	mdb_printf("avail = %llu%s\n", avail >> shift, suffix);
 
 	return (DCMD_OK);
 }
--- a/usr/src/cmd/zdb/zdb.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/cmd/zdb/zdb.c	Sun Apr 02 00:47:06 2006 -0800
@@ -366,7 +366,7 @@
 dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 {
 	uint64_t alloc, offset, entry;
-	int mapshift = sm->sm_shift;
+	uint8_t mapshift = sm->sm_shift;
 	uint64_t mapstart = sm->sm_start;
 	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" };
 
@@ -412,7 +412,7 @@
 dump_metaslab(metaslab_t *msp)
 {
 	char freebuf[5];
-	space_map_obj_t *smo = msp->ms_smo;
+	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 
@@ -921,13 +921,13 @@
 	dnode_t *dn;
 	void *bonus = NULL;
 	size_t bsize = 0;
-	char iblk[6], dblk[6], lsize[6], psize[6], bonus_size[6], segsize[6];
+	char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6];
 	char aux[50];
 	int error;
 
 	if (*print_header) {
 		(void) printf("\n    Object  lvl   iblk   dblk  lsize"
-		    "  psize  type\n");
+		    "  asize  type\n");
 		*print_header = 0;
 	}
 
@@ -948,7 +948,7 @@
 	nicenum(doi.doi_data_block_size, dblk);
 	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
 	    lsize);
-	nicenum(doi.doi_physical_blks << 9, psize);
+	nicenum(doi.doi_physical_blks << 9, asize);
 	nicenum(doi.doi_bonus_size, bonus_size);
 
 	aux[0] = '\0';
@@ -963,7 +963,7 @@
 
 	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
 	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
-	    psize, dmu_ot[doi.doi_type].ot_name, aux);
+	    asize, dmu_ot[doi.doi_type].ot_name, aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
@@ -1214,11 +1214,9 @@
 		vd = rvd->vdev_child[c];
 		for (m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
-			space_map_t *sm = &msp->ms_allocmap[0];
 			mutex_enter(&msp->ms_lock);
-			error = space_map_load(sm, msp->ms_smo, SM_ALLOC,
-			    spa->spa_meta_objset, msp->ms_usable_end,
-			    sm->sm_size - msp->ms_usable_space);
+			error = space_map_load(&msp->ms_allocmap[0], NULL,
+			    SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset);
 			mutex_exit(&msp->ms_lock);
 			if (error)
 				fatal("%s bad space map #%d, error %d",
@@ -1314,7 +1312,7 @@
 }
 
 static void
-zdb_space_map_vacate(spa_t *spa)
+zdb_space_map_unload(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
@@ -1327,6 +1325,7 @@
 			mutex_enter(&msp->ms_lock);
 			space_map_vacate(&msp->ms_allocmap[0], zdb_leak,
 			    &msp->ms_allocmap[0]);
+			space_map_unload(&msp->ms_allocmap[0]);
 			space_map_vacate(&msp->ms_freemap[0], NULL, NULL);
 			mutex_exit(&msp->ms_lock);
 		}
@@ -1534,7 +1533,7 @@
 	th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
 	th->th_noread = zdb_noread;
 
-	traverse_add_pool(th, 0, spa_first_txg(spa));
+	traverse_add_pool(th, 0, spa_first_txg(spa) + TXG_CONCURRENT_STATES);
 
 	while (traverse_more(th) == EAGAIN)
 		continue;
@@ -1556,7 +1555,7 @@
 	 * Report any leaked segments.
 	 */
 	if (!dump_opt['L'])
-		zdb_space_map_vacate(spa);
+		zdb_space_map_unload(spa);
 
 	if (dump_opt['L'])
 		(void) printf("\n\n *** Live pool traversal; "
--- a/usr/src/cmd/ztest/ztest.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/cmd/ztest/ztest.c	Sun Apr 02 00:47:06 2006 -0800
@@ -110,10 +110,11 @@
 
 static uint64_t zopt_vdevs = 5;
 static uint64_t zopt_vdevtime;
+static int zopt_ashift = SPA_MINBLOCKSHIFT;
 static int zopt_mirrors = 2;
 static int zopt_raidz = 4;
 static size_t zopt_vdev_size = SPA_MINDEVSIZE;
-static int zopt_dirs = 7;
+static int zopt_datasets = 7;
 static int zopt_threads = 23;
 static uint64_t zopt_passtime = 60;	/* 60 seconds */
 static uint64_t zopt_killrate = 70;	/* 70% kill rate */
@@ -341,6 +342,7 @@
 	(void) printf("Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
 	    "\t[-s size_of_each_vdev (default: %s)]\n"
+	    "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
 	    "\t[-m mirror_copies (default: %d)]\n"
 	    "\t[-r raidz_disks (default: %d)]\n"
 	    "\t[-d datasets (default: %d)]\n"
@@ -351,17 +353,17 @@
 	    "\t[-p pool_name (default: %s)]\n"
 	    "\t[-f file directory for vdev files (default: %s)]\n"
 	    "\t[-V(erbose)] (use multiple times for ever more blather)\n"
-	    "\t[-E(xisting)] (use existing pool instead of creating new one\n"
-	    "\t[-I(mport)] (discover and import existing pools)\n"
+	    "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
 	    "\t[-T time] total run time (default: %llu sec)\n"
 	    "\t[-P passtime] time per pass (default: %llu sec)\n"
 	    "",
 	    cmdname,
 	    (u_longlong_t)zopt_vdevs,		/* -v */
 	    nice_vdev_size,			/* -s */
+	    zopt_ashift,			/* -a */
 	    zopt_mirrors,			/* -m */
 	    zopt_raidz,				/* -r */
-	    zopt_dirs,			/* -d */
+	    zopt_datasets,			/* -d */
 	    zopt_threads,			/* -t */
 	    nice_gang_bang,			/* -g */
 	    zopt_init,				/* -i */
@@ -404,14 +406,14 @@
 	zio_gang_bang = 32 << 10;
 
 	while ((opt = getopt(argc, argv,
-	    "v:s:m:r:c:d:t:g:i:k:p:f:VEIT:P:S")) != EOF) {
+	    "v:s:a:m:r:d:t:g:i:k:p:f:VET:P:")) != EOF) {
 		value = 0;
 		switch (opt) {
 		    case 'v':
 		    case 's':
+		    case 'a':
 		    case 'm':
 		    case 'r':
-		    case 'c':
 		    case 'd':
 		    case 't':
 		    case 'g':
@@ -428,6 +430,9 @@
 		    case 's':
 			zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
 			break;
+		    case 'a':
+			zopt_ashift = value;
+			break;
 		    case 'm':
 			zopt_mirrors = value;
 			break;
@@ -435,7 +440,7 @@
 			zopt_raidz = MAX(1, value);
 			break;
 		    case 'd':
-			zopt_dirs = MAX(1, value);
+			zopt_datasets = MAX(1, value);
 			break;
 		    case 't':
 			zopt_threads = MAX(1, value);
@@ -478,11 +483,20 @@
 	zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz >= 2 ? 2 : 1) - 1;
 }
 
+static uint64_t
+ztest_get_ashift(void)
+{
+	if (zopt_ashift == 0)
+		return (SPA_MINBLOCKSHIFT + ztest_random(3));
+	return (zopt_ashift);
+}
+
 static nvlist_t *
 make_vdev_file(size_t size)
 {
 	char dev_name[MAXPATHLEN];
 	uint64_t vdev;
+	uint64_t ashift = ztest_get_ashift();
 	int fd;
 	nvlist_t *file;
 
@@ -505,6 +519,7 @@
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
+	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	return (file);
 }
@@ -828,7 +843,6 @@
 	return (NULL);
 }
 
-
 /*
  * Verify that we can attach and detach devices.
  */
@@ -841,6 +855,7 @@
 	nvlist_t *root, *file;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t leaf, top;
+	uint64_t ashift = ztest_get_ashift();
 	size_t oldsize, newsize;
 	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
 	int replacing;
@@ -917,6 +932,8 @@
 		expected_error = EBUSY;
 	else if (newsize < oldsize)
 		expected_error = EOVERFLOW;
+	else if (ashift > oldvd->vdev_top->vdev_ashift)
+		expected_error = EDOM;
 	else
 		expected_error = 0;
 
@@ -940,6 +957,7 @@
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, newpath) == 0);
+	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
@@ -2691,6 +2709,7 @@
 	nvlist_t *file, *root;
 	int error;
 	uint64_t guid;
+	uint64_t ashift = ztest_get_ashift();
 	vdev_t *vd;
 
 	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
@@ -2701,6 +2720,7 @@
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
+	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
@@ -2714,7 +2734,11 @@
 		guid = vd->vdev_guid;
 	spa_config_exit(spa, FTAG);
 	error = spa_vdev_attach(spa, guid, root, B_TRUE);
-	if (error != 0 && error != EBUSY && error != ENOTSUP && error != ENODEV)
+	if (error != 0 &&
+	    error != EBUSY &&
+	    error != ENOTSUP &&
+	    error != ENODEV &&
+	    error != EDOM)
 		fatal(0, "spa_vdev_attach(in-place) = %d", error);
 
 	nvlist_free(file);
@@ -3032,8 +3056,8 @@
 		za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
 
 	for (t = 0; t < zopt_threads; t++) {
-		d = t % zopt_dirs;
-		if (t < zopt_dirs) {
+		d = t % zopt_datasets;
+		if (t < zopt_datasets) {
 			ztest_replay_t zr;
 			(void) rw_rdlock(&ztest_shared->zs_name_lock);
 			(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
@@ -3082,7 +3106,7 @@
 			fatal(0, "thr_join(%d) = %d", t, error);
 		if (za[t].za_th)
 			traverse_fini(za[t].za_th);
-		if (t < zopt_dirs) {
+		if (t < zopt_datasets) {
 			zil_close(za[t].za_zilog);
 			dmu_objset_close(za[t].za_os);
 		}
@@ -3105,7 +3129,7 @@
 	if (zs->zs_enospc_count != 0) {
 		(void) rw_rdlock(&ztest_shared->zs_name_lock);
 		(void) snprintf(name, 100, "%s/%s_%d", pool, pool,
-		    (int)ztest_random(zopt_dirs));
+		    (int)ztest_random(zopt_datasets));
 		if (zopt_verbose >= 3)
 			(void) printf("Destroying %s to free up space\n", name);
 		dmu_objset_find(name, ztest_destroy_cb, NULL,
@@ -3226,7 +3250,7 @@
 	if (zopt_verbose >= 1) {
 		(void) printf("%llu vdevs, %d datasets, %d threads,"
 		    " %llu seconds...\n",
-		    (u_longlong_t)zopt_vdevs, zopt_dirs, zopt_threads,
+		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
 		    (u_longlong_t)zopt_time);
 	}
 
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Sun Apr 02 00:47:06 2006 -0800
@@ -283,17 +283,19 @@
 	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
 		return (0);
 
-	if (bc->bc_errno == 0) {
+	/*
+	 * Debugging: verify that the order we visit things agrees with the
+	 * order defined by compare_bookmark().  We don't check this for
+	 * log blocks because there's no defined ordering for them; they're
+	 * always visited (or not) as part of visiting the objset_phys_t.
+	 */
+	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
 		zbookmark_t *zb = &bc->bc_bookmark;
 		zbookmark_t *szb = &zseg->seg_start;
 		zbookmark_t *ezb = &zseg->seg_end;
 		zbookmark_t *lzb = &th->th_lastcb;
 		dnode_phys_t *dnp = bc->bc_dnode;
 
-		/*
-		 * Debugging: verify that the order we visit things
-		 * agrees with the order defined by compare_bookmark().
-		 */
 		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
 		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
 		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
@@ -477,15 +479,14 @@
 	zbookmark_t *zb = &bc->bc_bookmark;
 	zseg_t *zseg = list_head(&th->th_seglist);
 
-	if (bp->blk_birth <= zseg->seg_mintxg ||
-	    bp->blk_birth >= zseg->seg_maxtxg)
+	if (bp->blk_birth <= zseg->seg_mintxg)
 		return;
 
 	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
 		zb->zb_object = 0;
 		zb->zb_blkid = bp->blk_cksum.zc_word[3];
 		bc->bc_blkptr = *bp;
-		(void) th->th_func(bc, th->th_spa, th->th_arg);
+		(void) traverse_callback(th, zseg, bc);
 	}
 }
 
@@ -502,15 +503,14 @@
 		lr_write_t *lr = (lr_write_t *)lrc;
 		blkptr_t *bp = &lr->lr_blkptr;
 
-		if (bp->blk_birth <= zseg->seg_mintxg ||
-		    bp->blk_birth >= zseg->seg_maxtxg)
+		if (bp->blk_birth <= zseg->seg_mintxg)
 			return;
 
 		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
 			zb->zb_object = lr->lr_foid;
 			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
 			bc->bc_blkptr = *bp;
-			(void) th->th_func(bc, th->th_spa, th->th_arg);
+			(void) traverse_callback(th, zseg, bc);
 		}
 	}
 }
@@ -589,6 +589,20 @@
 
 		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
 
+		/*
+		 * If we're traversing an open snapshot, we know that it
+		 * can't be deleted (because it's open) and it can't change
+		 * (because it's a snapshot).  Therefore, once we've gotten
+		 * from the uberblock down to the snapshot's objset_phys_t,
+		 * we no longer need to synchronize with spa_sync(); we're
+		 * traversing a completely static block tree from here on.
+		 */
+		if (th->th_advance & ADVANCE_NOLOCK) {
+			ASSERT(th->th_locked);
+			rw_exit(spa_traverse_rwlock(th->th_spa));
+			th->th_locked = 0;
+		}
+
 		rc = traverse_read(th, bc, &dsp->ds_bp, dn);
 
 		if (rc != 0) {
@@ -669,7 +683,7 @@
 		/*
 		 * Give spa_sync() a chance to run.
 		 */
-		if (spa_traverse_wanted(th->th_spa)) {
+		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
 			th->th_syncs++;
 			return (EAGAIN);
 		}
@@ -723,14 +737,15 @@
 
 	save_txg = zseg->seg_mintxg;
 
-	if (!(th->th_advance & ADVANCE_NOLOCK))
-		rw_enter(rw, RW_READER);
+	rw_enter(rw, RW_READER);
+	th->th_locked = 1;
 
 	rc = traverse_segment(th, zseg, mosbp);
 	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
 
-	if (!(th->th_advance & ADVANCE_NOLOCK))
+	if (th->th_locked)
 		rw_exit(rw);
+	th->th_locked = 0;
 
 	zseg->seg_mintxg = save_txg;
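
The ADVANCE_NOLOCK handling added above takes the pool's traverse rwlock unconditionally, then releases it early (clearing th_locked) once the walk has descended from the uberblock into an open snapshot's objset, because that subtree can no longer be deleted or modified. Below is a minimal, self-contained sketch of the same "drop the reader lock at the first immutable subtree" pattern using POSIX rwlocks; the type and function names are illustrative, not the ZFS ones.

#include <pthread.h>

typedef struct traverse {
	pthread_rwlock_t *rwlock;	/* stands in for spa_traverse_rwlock() */
	int		  nolock;	/* caller asked for ADVANCE_NOLOCK */
	int		  locked;	/* analogous to th_locked */
} traverse_t;

static void
traverse_enter(traverse_t *th)
{
	pthread_rwlock_rdlock(th->rwlock);	/* always start out locked */
	th->locked = 1;
}

static void
traverse_reached_static_subtree(traverse_t *th)
{
	/*
	 * From here on the block tree is immutable (an open snapshot),
	 * so the writer (spa_sync() in the real code) need not wait for us.
	 */
	if (th->nolock && th->locked) {
		pthread_rwlock_unlock(th->rwlock);
		th->locked = 0;
	}
}

static void
traverse_exit(traverse_t *th)
{
	if (th->locked)
		pthread_rwlock_unlock(th->rwlock);
	th->locked = 0;
}

int
main(void)
{
	pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
	traverse_t th = { &lock, 1, 0 };

	traverse_enter(&th);
	/* ... walk pool metadata while synchronized with the writer ... */
	traverse_reached_static_subtree(&th);
	/* ... walk the immutable snapshot blocks without the lock ... */
	traverse_exit(&th);
	return (0);
}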
 
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Sun Apr 02 00:47:06 2006 -0800
@@ -161,18 +161,18 @@
 	kmem_free(mg, sizeof (metaslab_group_t));
 }
 
-void
-metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == NULL);
 	msp->ms_group = mg;
-	msp->ms_weight = weight;
+	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 }
 
-void
+static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	mutex_enter(&mg->mg_lock);
@@ -182,9 +182,11 @@
 	mutex_exit(&mg->mg_lock);
 }
 
-void
+static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
@@ -195,277 +197,32 @@
 
 /*
  * ==========================================================================
- * Metaslabs
+ * The first-fit block allocator
  * ==========================================================================
  */
-void
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
-	uint64_t start, uint64_t size, uint64_t txg)
+static void
+metaslab_ff_load(space_map_t *sm)
 {
-	vdev_t *vd = mg->mg_vd;
-	metaslab_t *msp;
-	int fm;
-
-	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
-
-	msp->ms_smo = smo;
-
-	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
-	    &msp->ms_lock);
-
-	for (fm = 0; fm < TXG_SIZE; fm++) {
-		space_map_create(&msp->ms_allocmap[fm], start, size,
-		    vd->vdev_ashift, &msp->ms_lock);
-		space_map_create(&msp->ms_freemap[fm], start, size,
-		    vd->vdev_ashift, &msp->ms_lock);
-	}
-
-	/*
-	 * If we're opening an existing pool (txg == 0) or creating
-	 * a new one (txg == TXG_INITIAL), all space is available now.
-	 * If we're adding space to an existing pool, the new space
-	 * does not become available until after this txg has synced.
-	 * We enforce this by assigning an initial weight of 0 to new space.
-	 *
-	 * (Transactional allocations for this txg would actually be OK;
-	 * it's intent log allocations that cause trouble.  If we wrote
-	 * a log block in this txg and lost power, the log replay would be
-	 * based on the DVA translations that had been synced in txg - 1.
-	 * Those translations would not include this metaslab's vdev.)
-	 */
-	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);
-
-	if (txg == 0) {
-		/*
-		 * We're opening the pool.  Make the metaslab's
-		 * free space available immediately.
-		 */
-		vdev_space_update(vd, size, smo->smo_alloc);
-		metaslab_sync_done(msp, 0);
-	} else {
-		/*
-		 * We're adding a new metaslab to an already-open pool.
-		 * Declare all of the metaslab's space to be free.
-		 *
-		 * Note that older transaction groups cannot allocate
-		 * from this metaslab until its existence is committed,
-		 * because we set ms_last_alloc to the current txg.
-		 */
-		smo->smo_alloc = 0;
-		msp->ms_usable_space = size;
-		mutex_enter(&msp->ms_lock);
-		space_map_add(&msp->ms_map, start, size);
-		msp->ms_map_incore = 1;
-		mutex_exit(&msp->ms_lock);
-
-		/* XXX -- we'll need a call to picker_init here */
-		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
-		msp->ms_last_alloc = txg;
-		vdev_dirty(vd, VDD_ADD, txg);
-		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
-	}
-
-	*mspp = msp;
-}
-
-void
-metaslab_fini(metaslab_t *msp)
-{
-	int fm;
-	metaslab_group_t *mg = msp->ms_group;
-
-	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
-	    -msp->ms_smo->smo_alloc);
-
-	metaslab_group_remove(mg, msp);
-
-	/* XXX -- we'll need a call to picker_fini here */
-
-	mutex_enter(&msp->ms_lock);
-
-	space_map_vacate(&msp->ms_map, NULL, NULL);
-	msp->ms_map_incore = 0;
-	space_map_destroy(&msp->ms_map);
-
-	for (fm = 0; fm < TXG_SIZE; fm++) {
-		space_map_destroy(&msp->ms_allocmap[fm]);
-		space_map_destroy(&msp->ms_freemap[fm]);
-	}
-
-	mutex_exit(&msp->ms_lock);
-
-	kmem_free(msp, sizeof (metaslab_t));
+	ASSERT(sm->sm_ppd == NULL);
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
 }
 
-/*
- * Write a metaslab to disk in the context of the specified transaction group.
- */
-void
-metaslab_sync(metaslab_t *msp, uint64_t txg)
+static void
+metaslab_ff_unload(space_map_t *sm)
 {
-	vdev_t *vd = msp->ms_group->mg_vd;
-	spa_t *spa = vd->vdev_spa;
-	objset_t *os = spa->spa_meta_objset;
-	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
-	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_obj_t *smo = msp->ms_smo;
-	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
-	uint64_t alloc_delta;
-	dmu_buf_t *db;
-	dmu_tx_t *tx;
-
-	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);
-
-	mutex_enter(&msp->ms_lock);
-
-	if (*dirty & MSD_ADD)
-		vdev_space_update(vd, msp->ms_map.sm_size, 0);
-
-	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
-		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
-
-		if (smo->smo_object == 0) {
-			ASSERT(smo->smo_objsize == 0);
-			ASSERT(smo->smo_alloc == 0);
-			smo->smo_object = dmu_object_alloc(os,
-			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
-			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
-			ASSERT(smo->smo_object != 0);
-			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
-			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
-			    sizeof (uint64_t), &smo->smo_object, tx);
-		}
-
-		alloc_delta = allocmap->sm_space - freemap->sm_space;
-		vdev_space_update(vd, 0, alloc_delta);
-		smo->smo_alloc += alloc_delta;
-
-		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
-		    (*dirty & MSD_CONDENSE) == 0) {
-			space_map_t *sm = &msp->ms_map;
-			space_map_t *tsm;
-			int i;
-
-			ASSERT(msp->ms_map_incore);
-
-			space_map_merge(freemap, freed_map);
-			space_map_vacate(allocmap, NULL, NULL);
-
-			/*
-			 * Write out the current state of the allocation
-			 * world.  The current metaslab is full, minus
-			 * stuff that's been freed this txg (freed_map),
-			 * minus allocations from txgs in the future.
-			 */
-			space_map_add(sm, sm->sm_start, sm->sm_size);
-			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
-				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
-				space_map_iterate(tsm, space_map_remove, sm);
-			}
-			space_map_iterate(freed_map, space_map_remove, sm);
-
-			space_map_write(sm, smo, os, tx);
-
-			ASSERT(sm->sm_space == 0);
-			ASSERT(freemap->sm_space == 0);
-			ASSERT(allocmap->sm_space == 0);
-
-			*dirty |= MSD_CONDENSE;
-		} else {
-			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
-			space_map_sync(freemap, freed_map, smo, SM_FREE,
-			    os, tx);
-		}
-
-		VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db));
-		dmu_buf_will_dirty(db, tx);
-		ASSERT3U(db->db_size, ==, sizeof (*smo));
-		bcopy(smo, db->db_data, db->db_size);
-		dmu_buf_rele(db, FTAG);
-
-		dmu_tx_commit(tx);
-	}
-
-	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);
-
-	mutex_exit(&msp->ms_lock);
-
-	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+	sm->sm_ppd = NULL;
 }
 
-/*
- * Called after a transaction group has completely synced to mark
- * all of the metaslab's free space as usable.
- */
-void
-metaslab_sync_done(metaslab_t *msp, uint64_t txg)
-{
-	uint64_t weight;
-	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
-	space_map_obj_t *smo = msp->ms_smo;
-
-	dprintf("%s offset %llx txg %llu\n",
-	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);
-
-	mutex_enter(&msp->ms_lock);
-
-	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);
-
-	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
-	msp->ms_usable_end = smo->smo_objsize;
-
-	weight = msp->ms_usable_space;
-
-	if (txg != 0) {
-		space_map_t *freed_map =
-		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-
-		/* XXX -- we'll need a call to picker_fini here */
-
-		/* If we're empty, don't bother sticking around */
-		if (msp->ms_usable_space == 0) {
-			space_map_vacate(&msp->ms_map, NULL, NULL);
-			msp->ms_map_incore = 0;
-			ASSERT3U(freed_map->sm_space, ==, 0);
-			weight = 0;
-		} else {
-			/* Add the freed blocks to the available space map */
-			if (msp->ms_map_incore)
-				space_map_merge(freed_map, &msp->ms_map);
-			else
-				space_map_vacate(freed_map, NULL, NULL);
-			weight += msp->ms_map.sm_size;
-		}
-
-		if (msp->ms_last_alloc == txg)
-			/* Safe to use for allocation now */
-			msp->ms_last_alloc = 0;
-
-		*dirty = 0;
-	}
-
-	mutex_exit(&msp->ms_lock);
-
-	metaslab_group_sort(msp->ms_group, msp, weight);
-}
-
-/*
- * The first-fit block picker.  No picker_init or picker_fini,
- * this is just an experiment to see how it feels to separate out
- * the block selection policy from the map updates.
- * Note: the 'cursor' argument is a form of PPD.
- */
 static uint64_t
-metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 {
 	avl_tree_t *t = &sm->sm_root;
 	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 	space_seg_t *ss, ssearch;
 	avl_index_t where;
-	int tried_once = 0;
 
-again:
 	ssearch.ss_start = *cursor;
 	ssearch.ss_end = *cursor + size;
 
@@ -483,35 +240,351 @@
 		ss = AVL_NEXT(t, ss);
 	}
 
-	/* If we couldn't find a block after cursor, search again */
-	if (tried_once == 0) {
-		tried_once = 1;
-		*cursor = 0;
-		goto again;
+	/*
+	 * If we know we've searched the whole map (*cursor == 0), give up.
+	 * Otherwise, reset the cursor to the beginning and try again.
+	 */
+	if (*cursor == 0)
+		return (-1ULL);
+
+	*cursor = 0;
+	return (metaslab_ff_alloc(sm, size));
+}
+
+/* ARGSUSED */
+static void
+metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_ff_ops = {
+	metaslab_ff_load,
+	metaslab_ff_unload,
+	metaslab_ff_alloc,
+	metaslab_ff_claim,
+	metaslab_ff_free
+};
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+metaslab_t *
+metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+	uint64_t start, uint64_t size, uint64_t txg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_t *msp;
+
+	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+
+	msp->ms_smo_syncing = *smo;
+
+	/*
+	 * We create the main space map here, but we don't create the
+	 * allocmaps and freemaps until metaslab_sync_done().  This serves
+	 * two purposes: it allows metaslab_sync_done() to detect the
+	 * addition of new space; and for debugging, it ensures that we'd
+	 * data fault on any attempt to use this metaslab before it's ready.
+	 */
+	space_map_create(&msp->ms_map, start, size,
+	    vd->vdev_ashift, &msp->ms_lock);
+
+	metaslab_group_add(mg, msp);
+
+	/*
+	 * If we're opening an existing pool (txg == 0) or creating
+	 * a new one (txg == TXG_INITIAL), all space is available now.
+	 * If we're adding space to an existing pool, the new space
+	 * does not become available until after this txg has synced.
+	 */
+	if (txg <= TXG_INITIAL)
+		metaslab_sync_done(msp, 0);
+
+	if (txg != 0) {
+		/*
+		 * The vdev is dirty, but the metaslab isn't -- it just needs
+		 * to have metaslab_sync_done() invoked from vdev_sync_done().
+		 * [We could just dirty the metaslab, but that would cause us
+		 * to allocate a space map object for it, which is wasteful
+		 * and would mess up the locality logic in metaslab_weight().]
+		 */
+		ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
+		vdev_dirty(vd, 0, NULL, txg);
+		vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
 	}
 
-	return (-1ULL);
+	return (msp);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+	metaslab_group_t *mg = msp->ms_group;
+	int t;
+
+	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
+	    -msp->ms_smo.smo_alloc);
+
+	metaslab_group_remove(mg, msp);
+
+	mutex_enter(&msp->ms_lock);
+
+	space_map_unload(&msp->ms_map);
+	space_map_destroy(&msp->ms_map);
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		space_map_destroy(&msp->ms_allocmap[t]);
+		space_map_destroy(&msp->ms_freemap[t]);
+	}
+
+	mutex_exit(&msp->ms_lock);
+
+	kmem_free(msp, sizeof (metaslab_t));
 }
 
+#define	METASLAB_ACTIVE_WEIGHT	(1ULL << 63)
+
 static uint64_t
-metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
+metaslab_weight(metaslab_t *msp)
+{
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	vdev_t *vd = msp->ms_group->mg_vd;
+	uint64_t weight, space;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * The baseline weight is the metaslab's free space.
+	 */
+	space = sm->sm_size - smo->smo_alloc;
+	weight = space;
+
+	/*
+	 * Modern disks have uniform bit density and constant angular velocity.
+	 * Therefore, the outer recording zones are faster (higher bandwidth)
+	 * than the inner zones by the ratio of outer to inner track diameter,
+	 * which is typically around 2:1.  We account for this by assigning
+	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+	 * In effect, this means that we'll select the metaslab with the most
+	 * free bandwidth rather than simply the one with the most free space.
+	 */
+	weight = 2 * weight -
+	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
+	ASSERT(weight >= space && weight <= 2 * space);
+
+	/*
+	 * For locality, assign higher weight to metaslabs we've used before.
+	 */
+	if (smo->smo_object != 0)
+		weight *= 2;
+	ASSERT(weight >= space && weight <= 4 * space);
+
+	/*
+	 * If this metaslab is one we're actively using, adjust its weight to
+	 * make it preferable to any inactive metaslab so we'll polish it off.
+	 */
+	weight |= (msp->ms_weight & METASLAB_ACTIVE_WEIGHT);
+
+	return (weight);
+}
+
+static int
+metaslab_activate(metaslab_t *msp)
 {
 	space_map_t *sm = &msp->ms_map;
-	vdev_t *vd = msp->ms_group->mg_vd;
-	uint64_t offset;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(msp->ms_map_incore);
-	ASSERT(sm->sm_space != 0);
-	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);
+
+	if (msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+		int error = space_map_load(sm, &metaslab_ff_ops,
+		    SM_FREE, &msp->ms_smo,
+		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
+		if (error) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (error);
+		}
+		metaslab_group_sort(msp->ms_group, msp,
+		    msp->ms_weight | METASLAB_ACTIVE_WEIGHT);
+	}
+	ASSERT(sm->sm_loaded);
+	ASSERT(msp->ms_weight >= METASLAB_ACTIVE_WEIGHT);
+
+	return (0);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t size)
+{
+	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size - 1));
+	ASSERT(msp->ms_weight < METASLAB_ACTIVE_WEIGHT);
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	objset_t *mos = spa->spa_meta_objset;
+	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
+	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
+	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo_syncing;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	int t;
+
+	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+	/*
+	 * The only state that can actually be changing concurrently with
+	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
+	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
+	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
+	 * We drop it whenever we call into the DMU, because the DMU
+	 * can call down to us (e.g. via zio_free()) at any time.
+	 */
+	mutex_enter(&msp->ms_lock);
+
+	if (smo->smo_object == 0) {
+		ASSERT(smo->smo_objsize == 0);
+		ASSERT(smo->smo_alloc == 0);
+		mutex_exit(&msp->ms_lock);
+		smo->smo_object = dmu_object_alloc(mos,
+		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+		ASSERT(smo->smo_object != 0);
+		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+		    (sm->sm_start >> vd->vdev_ms_shift),
+		    sizeof (uint64_t), &smo->smo_object, tx);
+		mutex_enter(&msp->ms_lock);
+	}
+
+	space_map_walk(freemap, space_map_add, freed_map);
+
+	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
+	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
+		/*
+		 * The in-core space map representation is twice as compact
+		 * as the on-disk one, so it's time to condense the latter
+		 * by generating a pure allocmap from first principles.
+		 *
+		 * This metaslab is 100% allocated,
+		 * minus the content of the in-core map (sm),
+		 * minus what's been freed this txg (freed_map),
+		 * minus allocations from txgs in the future
+		 * (because they haven't been committed yet).
+		 */
+		space_map_vacate(allocmap, NULL, NULL);
+		space_map_vacate(freemap, NULL, NULL);
+
+		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
 
-	offset = metaslab_pick_block(sm, size,
-	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
-	if (offset != -1ULL) {
-		space_map_remove(sm, offset, size);
-		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+		space_map_walk(sm, space_map_remove, allocmap);
+		space_map_walk(freed_map, space_map_remove, allocmap);
+
+		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
+			    space_map_remove, allocmap);
+
+		mutex_exit(&msp->ms_lock);
+		space_map_truncate(smo, mos, tx);
+		mutex_enter(&msp->ms_lock);
 	}
-	return (offset);
+
+	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
+	space_map_sync(freemap, SM_FREE, smo, mos, tx);
+
+	mutex_exit(&msp->ms_lock);
+
+	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	ASSERT3U(db->db_size, ==, sizeof (*smo));
+	bcopy(smo, db->db_data, db->db_size);
+	dmu_buf_rele(db, FTAG);
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+	space_map_obj_t *smo = &msp->ms_smo;
+	space_map_obj_t *smosync = &msp->ms_smo_syncing;
+	space_map_t *sm = &msp->ms_map;
+	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	metaslab_group_t *mg = msp->ms_group;
+	vdev_t *vd = mg->mg_vd;
+	int t;
+
+	mutex_enter(&msp->ms_lock);
+
+	/*
+	 * If this metaslab is just becoming available, initialize its
+	 * allocmaps and freemaps and add its capacity to the vdev.
+	 */
+	if (freed_map->sm_size == 0) {
+		for (t = 0; t < TXG_SIZE; t++) {
+			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
+			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+			space_map_create(&msp->ms_freemap[t], sm->sm_start,
+			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+		}
+		vdev_space_update(vd, sm->sm_size, 0);
+	}
+
+	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
+
+	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
+	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+
+	/*
+	 * If there's a space_map_load() in progress, wait for it to complete
+	 * so that we have a consistent view of the in-core space map.
+	 * Then, add everything we freed in this txg to the map.
+	 */
+	space_map_load_wait(sm);
+	space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+
+	*smo = *smosync;
+
+	/*
+	 * If the map is loaded but no longer active, evict it as soon as all
+	 * future allocations have synced.  (If we unloaded it now and then
+	 * loaded a moment later, the map wouldn't reflect those allocations.)
+	 */
+	if (sm->sm_loaded && msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+		int evictable = 1;
+
+		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
+				evictable = 0;
+
+		if (evictable)
+			space_map_unload(sm);
+	}
+
+	metaslab_group_sort(mg, msp, metaslab_weight(msp));
+
+	mutex_exit(&msp->ms_lock);
 }
 
 /*
@@ -526,11 +599,8 @@
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
-	objset_t *os = spa->spa_meta_objset;
 	vdev_t *vd;
 	metaslab_t *msp;
-	space_map_t *sm;
-	space_map_obj_t *smo;
 	int error;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
@@ -540,123 +610,69 @@
 		return (ENXIO);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	sm = &msp->ms_map;
-	smo = msp->ms_smo;
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 
 	mutex_enter(&msp->ms_lock);
 
-	if (msp->ms_map_incore == 0) {
-		error = space_map_load(sm, smo, SM_FREE, os,
-		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
-		ASSERT(error == 0);
-		if (error) {
-			mutex_exit(&msp->ms_lock);
-			return (error);
-		}
-		msp->ms_map_incore = 1;
-		/* XXX -- we'll need a call to picker_init here */
-		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
+	error = metaslab_activate(msp);
+	if (error) {
+		mutex_exit(&msp->ms_lock);
+		return (error);
 	}
 
-	space_map_remove(sm, offset, size);
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		vdev_dirty(vd, VDD_METASLAB, msp, txg);
 
-	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
-		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
-		msp->ms_last_alloc = txg;
-		vdev_dirty(vd, VDD_ALLOC, txg);
-		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
-	}
+	space_map_claim(&msp->ms_map, offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
 
 	mutex_exit(&msp->ms_lock);
 
 	return (0);
 }
 
-static int
-metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
-{
-	/*
-	 * Enforce segregation across transaction groups.
-	 */
-	/* XXX -- We should probably not assume we know what ms_weight means */
-	if (msp->ms_last_alloc == txg)
-		return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);
-
-	if (msp->ms_last_alloc != 0)
-		return (0);
-
-	if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
-		return (1);
-
-	/* XXX -- the weight test should be in terms of MINFREE */
-	return (msp->ms_usable_space >= size && msp->ms_weight >= size);
-}
-
 static metaslab_t *
-metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp,
+	uint64_t txg)
 {
-	metaslab_t *msp;
-	avl_tree_t *t = &mg->mg_metaslab_tree;
-
-	mutex_enter(&mg->mg_lock);
-	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
-		if (metaslab_usable(msp, size, txg))
-			break;
-	mutex_exit(&mg->mg_lock);
-
-	return (msp);
-}
+	metaslab_t *msp = NULL;
+	uint64_t offset = -1ULL;
 
-static metaslab_t *
-metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
-    uint64_t *offp, uint64_t txg)
-{
-	metaslab_t *msp;
-	int error;
+	for (;;) {
+		mutex_enter(&mg->mg_lock);
+		msp = avl_first(&mg->mg_metaslab_tree);
+		if (msp == NULL || msp->ms_weight < size) {
+			mutex_exit(&mg->mg_lock);
+			return (NULL);
+		}
+		mutex_exit(&mg->mg_lock);
 
-	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
-		space_map_obj_t *smo = msp->ms_smo;
 		mutex_enter(&msp->ms_lock);
-		if (!metaslab_usable(msp, size, txg)) {
+
+		if (metaslab_activate(msp) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
-		if (msp->ms_map_incore == 0) {
-			error = space_map_load(&msp->ms_map, smo, SM_FREE,
-			    spa->spa_meta_objset, msp->ms_usable_end,
-			    msp->ms_map.sm_size - msp->ms_usable_space);
-			ASSERT(error == 0);
-			if (error) {
-				mutex_exit(&msp->ms_lock);
-				metaslab_group_sort(mg, msp, 0);
-				continue;
-			}
-			msp->ms_map_incore = 1;
-			/* XXX -- we'll need a call to picker_init here */
-			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
-		}
-		*offp = metaslab_getblock(msp, size, txg);
-		if (*offp != -1ULL) {
-			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
-				vdev_t *vd = mg->mg_vd;
-				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
-				msp->ms_last_alloc = txg;
-				vdev_dirty(vd, VDD_ALLOC, txg);
-				(void) txg_list_add(&vd->vdev_ms_list,
-				    msp, txg);
-			}
-			mutex_exit(&msp->ms_lock);
-			return (msp);
-		}
+
+		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+			break;
+
+		metaslab_passivate(msp, size);
+
 		mutex_exit(&msp->ms_lock);
-		metaslab_group_sort(msp->ms_group, msp, size - 1);
 	}
 
-	return (NULL);
+	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+	mutex_exit(&msp->ms_lock);
+
+	*offp = offset;
+	return (msp);
 }
 
 /*
@@ -686,7 +702,7 @@
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
+		msp = metaslab_group_alloc(mg, asize, &offset, txg);
 		if (msp != NULL) {
 			ASSERT(offset != -1ULL);
 
@@ -716,8 +732,6 @@
 				 */
 				mg->mg_bias = ((su - vu) *
 				    (int64_t)mg->mg_aliquot) / (1024 * 4);
-
-				dprintf("bias = %lld\n", mg->mg_bias);
 			}
 
 			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
@@ -737,8 +751,6 @@
 		mc->mc_allocated = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
-	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);
-
 	DVA_SET_VDEV(dva, 0);
 	DVA_SET_OFFSET(dva, 0);
 	DVA_SET_GANG(dva, 0);
@@ -751,7 +763,7 @@
  * transaction group.
  */
 void
-metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
+metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
@@ -783,13 +795,15 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
-		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
-		vdev_dirty(vd, VDD_FREE, txg);
-		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
+	if (now) {
+		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+		    offset, size);
+		space_map_free(&msp->ms_map, offset, size);
+	} else {
+		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+			vdev_dirty(vd, VDD_METASLAB, msp, txg);
+		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
 	}
 
-	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
-
 	mutex_exit(&msp->ms_lock);
 }
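
The metaslab.c rewrite above is the core of the pluggable allocation policy: block selection now sits behind a space_map_ops_t vtable (load, unload, alloc, claim, free), and the stock policy, metaslab_ff_alloc(), does first-fit with one cursor per power-of-two alignment class, wrapping around at most once. The sketch below illustrates that cursor-per-alignment first-fit policy in isolation; it is not the ZFS code -- a small sorted array of hypothetical free segments stands in for the AVL tree of space_seg_t entries, and the helper names are made up.

#include <stdint.h>
#include <stdio.h>

typedef struct seg { uint64_t start, end; } seg_t;	/* [start, end) */

static seg_t free_segs[] = {		/* hypothetical free space */
	{ 0x0000, 0x2000 },
	{ 0x5000, 0x5800 },
	{ 0x9000, 0x10000 },
};
static const int nsegs = sizeof (free_segs) / sizeof (free_segs[0]);
static uint64_t cursor[64];		/* one cursor per alignment class */

static int
highbit64(uint64_t x)
{
	int h = 0;
	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

static uint64_t
ff_alloc(uint64_t size)
{
	uint64_t align = size & -size;	/* largest power of two dividing size */
	uint64_t *cp = &cursor[highbit64(align) - 1];
	int i;

	for (;;) {
		for (i = 0; i < nsegs; i++) {
			/* round the candidate offset up to the alignment */
			uint64_t start = (free_segs[i].start + align - 1) & -align;
			if (start < *cp)
				start = (*cp + align - 1) & -align;
			if (start + size <= free_segs[i].end) {
				*cp = start + size;	/* remember where we stopped */
				return (start);
			}
		}
		if (*cp == 0)		/* already searched the whole map */
			return (-1ULL);
		*cp = 0;		/* wrap around once and retry */
	}
}

int
main(void)
{
	printf("alloc 0x1000 -> %llx\n", (unsigned long long)ff_alloc(0x1000));
	printf("alloc 0x1000 -> %llx\n", (unsigned long long)ff_alloc(0x1000));
	printf("alloc 0x4000 -> %llx\n", (unsigned long long)ff_alloc(0x4000));
	return (0);
}

Keeping a separate cursor per alignment class is what the sm_ppd array allocated in metaslab_ff_load() provides: small and large allocations no longer reset each other's search position.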
--- a/usr/src/uts/common/fs/zfs/spa.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Sun Apr 02 00:47:06 2006 -0800
@@ -341,8 +341,7 @@
 	 * If the vdev guid sum doesn't match the uberblock, we have an
 	 * incomplete configuration.
 	 */
-	if (rvd->vdev_guid_sum != ub->ub_guid_sum && (mosconfig ||
-	    state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT)) {
+	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_GUID_SUM);
 		error = ENXIO;
@@ -842,8 +841,10 @@
 
 	/*
 	 * Pass off the heavy lifting to spa_load().
+	 * Pass TRUE for mosconfig because the user-supplied config
+	 * is actually the one to trust when doing an import.
 	 */
-	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_FALSE);
+	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
 
 	if (error) {
 		spa_unload(spa);
@@ -898,8 +899,10 @@
 
 	/*
 	 * Pass off the heavy lifting to spa_load().
+	 * Pass TRUE for mosconfig because the user-supplied config
+	 * is actually the one to trust when doing an import.
 	 */
-	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_FALSE);
+	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
@@ -1163,7 +1166,11 @@
 	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
-	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
+	/*
+	 * The new device cannot have a higher alignment requirement
+	 * than the top-level vdev.
+	 */
+	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
 
 	/*
@@ -1228,8 +1235,7 @@
 	/*
 	 * Mark newvd's DTL dirty in this txg.
 	 */
-	vdev_dirty(tvd, VDD_DTL, txg);
-	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
+	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
 
@@ -1356,12 +1362,11 @@
 
 	/*
 	 * If the device we just detached was smaller than the others,
-	 * it may be possible to add metaslabs (i.e. grow the pool).  We ignore
-	 * the error here because the detach still succeeded - we just weren't
-	 * able to reinitialize the metaslabs.  This pool is in for a world of
-	 * hurt, in any case.
+	 * it may be possible to add metaslabs (i.e. grow the pool).
+	 * vdev_metaslab_init() can't fail because the existing metaslabs
+	 * are already in core, so there's nothing to read from disk.
 	 */
-	(void) vdev_metaslab_init(tvd, txg);
+	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
 
 	vdev_config_dirty(tvd);
 
@@ -1372,11 +1377,10 @@
 	 * But first make sure we're not on any *other* txg's DTL list,
 	 * to prevent vd from being accessed after it's freed.
 	 */
-	vdev_dirty(tvd, VDD_DTL, txg);
-	vd->vdev_detached = B_TRUE;
 	for (t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
-	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+	vd->vdev_detached = B_TRUE;
+	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);
 
@@ -1798,10 +1802,13 @@
 	if (rvd->vdev_dtl_map.sm_space == 0) {
 		/*
 		 * The pool-wide DTL is empty.
-		 * If this is a resilver, there's nothing to do.
+		 * If this is a resilver, there's nothing to do except
+		 * check whether any in-progress replacements have completed.
 		 */
-		if (type == POOL_SCRUB_RESILVER)
+		if (type == POOL_SCRUB_RESILVER) {
 			type = POOL_SCRUB_NONE;
+			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+		}
 	} else {
 		/*
 		 * The pool-wide DTL is non-empty.
--- a/usr/src/uts/common/fs/zfs/space_map.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/space_map.c	Sun Apr 02 00:47:06 2006 -0800
@@ -28,6 +28,7 @@
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
+#include <sys/zio.h>
 #include <sys/space_map.h>
 
 /*
@@ -54,22 +55,24 @@
 }
 
 void
-space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint64_t shift,
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
 	kmutex_t *lp)
 {
+	bzero(sm, sizeof (*sm));
+
 	avl_create(&sm->sm_root, space_map_seg_compare,
 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+
 	sm->sm_start = start;
-	sm->sm_end = start + size;
 	sm->sm_size = size;
 	sm->sm_shift = shift;
-	sm->sm_space = 0;
 	sm->sm_lock = lp;
 }
 
 void
 space_map_destroy(space_map_t *sm)
 {
+	ASSERT(!sm->sm_loaded && !sm->sm_loading);
 	VERIFY3U(sm->sm_space, ==, 0);
 	avl_destroy(&sm->sm_root);
 }
@@ -85,7 +88,7 @@
 	ASSERT(MUTEX_HELD(sm->sm_lock));
 	VERIFY(size != 0);
 	VERIFY3U(start, >=, sm->sm_start);
-	VERIFY3U(end, <=, sm->sm_end);
+	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
 	VERIFY(sm->sm_space + size <= sm->sm_size);
 	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
@@ -201,7 +204,7 @@
 }
 
 void
-space_map_iterate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 {
 	space_seg_t *ss;
 
@@ -210,12 +213,6 @@
 }
 
 void
-space_map_merge(space_map_t *src, space_map_t *dest)
-{
-	space_map_vacate(src, space_map_add, dest);
-}
-
-void
 space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
 {
 	avl_tree_t *t = &sm->sm_root;
@@ -266,25 +263,57 @@
 	}
 }
 
+/*
+ * Wait for any in-progress space_map_load() to complete.
+ */
+void
+space_map_load_wait(space_map_t *sm)
+{
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	while (sm->sm_loading)
+		cv_wait(&sm->sm_load_cv, sm->sm_lock);
+}
+
+/*
+ * Note: space_map_load() will drop sm_lock across dmu_read() calls.
+ * The caller must be OK with this.
+ */
 int
-space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype,
-	objset_t *os, uint64_t end, uint64_t space)
+space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
+	space_map_obj_t *smo, objset_t *os)
 {
 	uint64_t *entry, *entry_map, *entry_map_end;
 	uint64_t bufsize, size, offset;
 	uint64_t mapstart = sm->sm_start;
+	uint64_t end = smo->smo_objsize;
+	uint64_t space = smo->smo_alloc;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY3U(sm->sm_space, ==, 0);
+
+	space_map_load_wait(sm);
 
-	bufsize = MIN(end, SPACE_MAP_CHUNKSIZE);
-	entry_map = kmem_alloc(bufsize, KM_SLEEP);
+	if (sm->sm_loaded)
+		return (0);
+
+	sm->sm_loading = B_TRUE;
+
+	ASSERT(sm->sm_ops == NULL);
+	VERIFY3U(sm->sm_space, ==, 0);
 
 	if (maptype == SM_FREE) {
 		space_map_add(sm, sm->sm_start, sm->sm_size);
 		space = sm->sm_size - space;
 	}
 
+	bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+	entry_map = zio_buf_alloc(bufsize);
+
+	mutex_exit(sm->sm_lock);
+	if (end > bufsize)
+		dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+	mutex_enter(sm->sm_lock);
+
 	for (offset = 0; offset < end; offset += bufsize) {
 		size = MIN(end - offset, bufsize);
 		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
@@ -292,8 +321,11 @@
 
 		dprintf("object=%llu  offset=%llx  size=%llx\n",
 		    smo->smo_object, offset, size);
-		VERIFY(0 == dmu_read(os, smo->smo_object, offset, size,
-		    entry_map));
+
+		mutex_exit(sm->sm_lock);
+		VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
+		    entry_map), ==, 0);
+		mutex_enter(sm->sm_lock);
 
 		entry_map_end = entry_map + (size / sizeof (uint64_t));
 		for (entry = entry_map; entry < entry_map_end; entry++) {
@@ -310,14 +342,65 @@
 	}
 	VERIFY3U(sm->sm_space, ==, space);
 
-	kmem_free(entry_map, bufsize);
+	zio_buf_free(entry_map, bufsize);
+
+	sm->sm_loading = B_FALSE;
+	sm->sm_loaded = B_TRUE;
+	sm->sm_ops = ops;
+
+	cv_broadcast(&sm->sm_load_cv);
+
+	if (ops != NULL)
+		ops->smop_load(sm);
 
 	return (0);
 }
 
 void
-space_map_sync(space_map_t *sm, space_map_t *dest, space_map_obj_t *smo,
-    uint8_t maptype, objset_t *os, dmu_tx_t *tx)
+space_map_unload(space_map_t *sm)
+{
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	if (sm->sm_loaded && sm->sm_ops != NULL)
+		sm->sm_ops->smop_unload(sm);
+
+	sm->sm_loaded = B_FALSE;
+	sm->sm_ops = NULL;
+
+	space_map_vacate(sm, NULL, NULL);
+}
+
+uint64_t
+space_map_alloc(space_map_t *sm, uint64_t size)
+{
+	uint64_t start;
+
+	start = sm->sm_ops->smop_alloc(sm, size);
+	if (start != -1ULL)
+		space_map_remove(sm, start, size);
+	return (start);
+}
+
+void
+space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	sm->sm_ops->smop_claim(sm, start, size);
+	space_map_remove(sm, start, size);
+}
+
+void
+space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	space_map_add(sm, start, size);
+	sm->sm_ops->smop_free(sm, start, size);
+}
+
+/*
+ * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ */
+void
+space_map_sync(space_map_t *sm, uint8_t maptype,
+	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
 	void *cookie = NULL;
@@ -335,9 +418,14 @@
 	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
 	    sm->sm_space);
 
+	if (maptype == SM_ALLOC)
+		smo->smo_alloc += sm->sm_space;
+	else
+		smo->smo_alloc -= sm->sm_space;
+
 	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
-	bufsize = MIN(bufsize, SPACE_MAP_CHUNKSIZE);
-	entry_map = kmem_alloc(bufsize, KM_SLEEP);
+	bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
+	entry_map = zio_buf_alloc(bufsize);
 	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
 	entry = entry_map;
 
@@ -350,9 +438,6 @@
 		size = ss->ss_end - ss->ss_start;
 		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 
-		if (dest)
-			space_map_add(dest, ss->ss_start, size);
-
 		sm->sm_space -= size;
 		size >>= sm->sm_shift;
 
@@ -360,8 +445,10 @@
 			run_len = MIN(size, SM_RUN_MAX);
 
 			if (entry == entry_map_end) {
+				mutex_exit(sm->sm_lock);
 				dmu_write(os, smo->smo_object, smo->smo_objsize,
 				    bufsize, entry_map, tx);
+				mutex_enter(sm->sm_lock);
 				smo->smo_objsize += bufsize;
 				entry = entry_map;
 			}
@@ -378,30 +465,23 @@
 
 	if (entry != entry_map) {
 		size = (entry - entry_map) * sizeof (uint64_t);
+		mutex_exit(sm->sm_lock);
 		dmu_write(os, smo->smo_object, smo->smo_objsize,
 		    size, entry_map, tx);
+		mutex_enter(sm->sm_lock);
 		smo->smo_objsize += size;
 	}
 
-	kmem_free(entry_map, bufsize);
+	zio_buf_free(entry_map, bufsize);
 
 	VERIFY3U(sm->sm_space, ==, 0);
 }
 
 void
-space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os,
-    dmu_tx_t *tx)
+space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 {
-	uint64_t oldsize = smo->smo_objsize;
-
-	VERIFY(0 == dmu_free_range(os, smo->smo_object, 0,
-	    smo->smo_objsize, tx));
+	VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
 
 	smo->smo_objsize = 0;
-
-	VERIFY3U(sm->sm_space, ==, smo->smo_alloc);
-	space_map_sync(sm, NULL, smo, SM_ALLOC, os, tx);
-
-	dprintf("write sm object %llu from %llu to %llu bytes in txg %llu\n",
-	    smo->smo_object, oldsize, smo->smo_objsize, dmu_tx_get_txg(tx));
+	smo->smo_alloc = 0;
 }
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Sun Apr 02 00:47:06 2006 -0800
@@ -82,7 +82,8 @@
 	spa_t		*th_spa;
 	blkptr_cb_t	*th_func;
 	void		*th_arg;
-	int		th_advance;
+	uint16_t	th_advance;
+	uint16_t	th_locked;
 	int		th_zio_flags;
 	list_t		th_seglist;
 	traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sun Apr 02 00:47:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -42,14 +41,14 @@
 typedef struct metaslab_class metaslab_class_t;
 typedef struct metaslab_group metaslab_group_t;
 
-extern void metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-    metaslab_t **mspp, uint64_t offset, uint64_t size, uint64_t txg);
+extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+    uint64_t start, uint64_t size, uint64_t txg);
 extern void metaslab_fini(metaslab_t *msp);
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
 
 extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
-extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg);
+extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now);
 extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
 
 extern metaslab_class_t *metaslab_class_create(void);
@@ -60,11 +59,6 @@
 extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
     vdev_t *vd);
 extern void metaslab_group_destroy(metaslab_group_t *mg);
-extern void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp,
-    uint64_t weight);
-extern void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp);
-extern void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp,
-    uint64_t weight);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Sun Apr 02 00:47:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -56,68 +55,25 @@
 };
 
 /*
- * Each metaslab's free block list is kept in its own DMU object in the
- * metaslab freelist dataset.  To minimize space consumption, the list
- * is circular.
- *
- * Allocations and frees can happen in multiple transaction groups at
- * the same time, which makes it a bit challening to keep the metaslab
- * consistent.  For example, we cannot allow frees from different
- * transaction groups to be interleaved in the metaslab's free block list.
- *
- * We address this in several ways:
- *
- *	We don't allow allocations from the same metaslab in concurrent
- *	transaction groups.  metaslab_alloc() enforces this by checking
- *	the ms_last_alloc field, which specifies the last txg in which
- *	the metaslab was used for allocations.
- *
- *	We can't segregate frees this way because we can't choose which
- *	DVAs someone wants to free.  So we keep separate in-core freelists
- *	for each active transaction group.  This in-core data is only
- *	written to the metaslab's on-disk freelist in metaslab_sync(),
- *	which solves the interleave problem: we only append frees from
- *	the syncing txg to the on-disk freelist, so the appends all occur
- *	in txg order.
- *
- *	We cannot allow a block which was freed in a given txg to be
- *	allocated again until that txg has closed; otherwise, if we
- *	failed to sync that txg and had to roll back to txg - 1,
- *	changes in txg + 1 could have overwritten the data.  Therefore,
- *	we partition the free blocks into "available" and "limbo" states.
- *	A block is available if the txg in which it was freed has closed;
- *	until then, the block is in limbo.  Each time metaslab_sync() runs,
- *	if first adds any limbo blocks to the avail list, clears the limbo
- *	list, and starts writing the new limbo blocks (i.e. the ones that
- *	were freed in the syncing txg).
+ * Each metaslab's free space is tracked in a space map object in the MOS,
+ * which is only updated in syncing context.  Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map object.
+ * When the txg is done syncing, metaslab_sync_done() updates ms_smo
+ * to ms_smo_syncing.  Everything in ms_smo is always safe to allocate.
  */
-
 struct metaslab {
 	kmutex_t	ms_lock;	/* metaslab lock		*/
-	space_map_obj_t	*ms_smo;	/* space map object		*/
-	uint64_t	ms_last_alloc;	/* txg of last alloc		*/
-	uint64_t	ms_usable_end;	/* end of free_obj at last sync	*/
-	uint64_t	ms_usable_space; /* usable space at last sync	*/
-	metaslab_group_t *ms_group;	/* metaslab group		*/
-	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
-	uint64_t	ms_weight;	/* weight vs. others in group	*/
-	uint8_t		ms_dirty[TXG_SIZE];	/* per-txg dirty flags	*/
+	space_map_obj_t	ms_smo;		/* synced space map object	*/
+	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
 	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
 	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
+	space_map_t	ms_map;		/* in-core free space map	*/
+	uint64_t	ms_weight;	/* weight vs. others in group	*/
+	metaslab_group_t *ms_group;	/* metaslab group		*/
+	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
-	space_map_t	ms_map;		/* in-core free space map	*/
-	uint8_t		ms_map_incore;  /* space map contents are valid */
-	uint64_t	ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD	*/
 };
 
-/*
- * ms_dirty[] flags
- */
-#define	MSD_ALLOC	0x01	/* allocated from in this txg		*/
-#define	MSD_FREE	0x02	/* freed to in this txg			*/
-#define	MSD_ADD		0x04	/* added to the pool in this txg	*/
-#define	MSD_CONDENSE	0x08	/* condensed in this txg		*/
-
 #ifdef	__cplusplus
 }
 #endif
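
The comment above replaces the old per-txg freelist scheme with a simpler
model: each txg's allocmap and freemap are appended to a syncing copy of the
space map object, and the synced copy becomes visible only when the txg
finishes syncing.  A hedged sketch of that flow, assuming
<sys/metaslab_impl.h>, <sys/space_map.h> and <sys/dmu.h>; this is
illustrative only, not the metaslab.c code from these hunks:

	#include <sys/metaslab_impl.h>
	#include <sys/space_map.h>
	#include <sys/dmu.h>

	/*
	 * Illustrative only: append this txg's allocs and frees to the
	 * syncing space map object ...
	 */
	static void
	metaslab_sync_sketch(metaslab_t *msp, uint64_t txg, objset_t *mos,
	    dmu_tx_t *tx)
	{
		int t = txg & TXG_MASK;

		/* entries land in ms_smo_syncing, never in the synced ms_smo */
		space_map_sync(&msp->ms_allocmap[t], SM_ALLOC,
		    &msp->ms_smo_syncing, mos, tx);
		space_map_sync(&msp->ms_freemap[t], SM_FREE,
		    &msp->ms_smo_syncing, mos, tx);
	}

	/*
	 * ... and once the txg is fully on disk, adopt the syncing copy so
	 * that everything described by ms_smo is again safe to allocate.
	 */
	static void
	metaslab_sync_done_sketch(metaslab_t *msp)
	{
		msp->ms_smo = msp->ms_smo_syncing;
	}
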
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h	Sun Apr 02 00:47:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,13 +35,20 @@
 extern "C" {
 #endif
 
+typedef struct space_map_ops space_map_ops_t;
+
 typedef struct space_map {
-	avl_tree_t	sm_root;	/* Root of the AVL tree */
-	uint64_t	sm_start;	/* Start of map (inclusive) */
-	uint64_t	sm_end;		/* End of map (exclusive) */
-	uint64_t	sm_size;	/* Size of map (end - start) */
-	uint64_t	sm_shift;	/* Unit shift */
-	uint64_t	sm_space;	/* Sum of all segments in the map */
+	avl_tree_t	sm_root;	/* AVL tree of map segments */
+	uint64_t	sm_space;	/* sum of all segments in the map */
+	uint64_t	sm_start;	/* start of map */
+	uint64_t	sm_size;	/* size of map */
+	uint8_t		sm_shift;	/* unit shift */
+	uint8_t		sm_pad[3];	/* unused */
+	uint8_t		sm_loaded;	/* map loaded? */
+	uint8_t		sm_loading;	/* map loading? */
+	kcondvar_t	sm_load_cv;	/* map load completion */
+	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
+	void		*sm_ppd;	/* picker-private data */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
 
@@ -58,6 +64,14 @@
 	uint64_t	smo_alloc;	/* space allocated from the map */
 } space_map_obj_t;
 
+struct space_map_ops {
+	void	(*smop_load)(space_map_t *sm);
+	void	(*smop_unload)(space_map_t *sm);
+	uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
+	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
+	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+};
+
 /*
  * debug entry
  *
@@ -112,29 +126,33 @@
  */
 #define	SPACE_MAP_BLOCKSHIFT	12
 
-#define	SPACE_MAP_CHUNKSIZE	(1<<20)
-
 typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
 
 extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
-    uint64_t shift, kmutex_t *lp);
+    uint8_t shift, kmutex_t *lp);
 extern void space_map_destroy(space_map_t *sm);
 extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
 extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
-extern void space_map_iterate(space_map_t *sm,
+extern void space_map_walk(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
-extern void space_map_merge(space_map_t *dest, space_map_t *src);
 extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_union(space_map_t *smd, space_map_t *sms);
 
-extern int space_map_load(space_map_t *sm, space_map_obj_t *smo,
-    uint8_t maptype, objset_t *os, uint64_t end, uint64_t space);
-extern void space_map_sync(space_map_t *sm, space_map_t *dest,
-    space_map_obj_t *smo, uint8_t maptype, objset_t *os, dmu_tx_t *tx);
-extern void space_map_write(space_map_t *sm, space_map_obj_t *smo,
+extern void space_map_load_wait(space_map_t *sm);
+extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
+    uint8_t maptype, space_map_obj_t *smo, objset_t *os);
+extern void space_map_unload(space_map_t *sm);
+
+extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
+extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_sync(space_map_t *sm, uint8_t maptype,
+    space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
+extern void space_map_truncate(space_map_obj_t *smo,
     objset_t *os, dmu_tx_t *tx);
 
 #ifdef	__cplusplus
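
space_map_ops_t is the hook for the pluggable block allocation policy: a
picker supplies load/unload/alloc/claim/free callbacks, keeps its private
state in sm_ppd, and the space_map_alloc()/space_map_claim()/space_map_free()
entry points presumably dispatch through sm_ops once the map has been loaded
with space_map_load(sm, ops, maptype, smo, os).  A minimal first-fit sketch
of the alloc hook only; the names are illustrative, and the real picker
implementation is not shown in these hunks:

	#include <sys/space_map.h>

	/*
	 * Illustrative first-fit picker: return the start of the first
	 * segment large enough for the request.  A real picker would also
	 * implement the load/unload/claim/free hooks and track a cursor in
	 * sm->sm_ppd; the failure value (-1ULL) is an assumption.
	 */
	static uint64_t
	sketch_ff_alloc(space_map_t *sm, uint64_t size)
	{
		space_seg_t *ss;

		for (ss = avl_first(&sm->sm_root); ss != NULL;
		    ss = AVL_NEXT(&sm->sm_root, ss))
			if (ss->ss_end - ss->ss_start >= size)
				return (ss->ss_start);

		return (-1ULL);
	}

	static space_map_ops_t sketch_ff_ops = {
		NULL,			/* smop_load */
		NULL,			/* smop_unload */
		sketch_ff_alloc,	/* smop_alloc */
		NULL,			/* smop_claim */
		NULL,			/* smop_free */
	};

A map loaded with &sketch_ff_ops would then satisfy space_map_alloc(sm, size)
through sketch_ff_alloc().
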
--- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h	Sun Apr 02 00:47:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -45,13 +44,9 @@
  * version mismatch.  If the ub_magic field is moved, applications that
  * expect the magic number in the first word won't work.
  */
-
-#define	UBERBLOCK_SHIFT		(10)
-#define	UBERBLOCK_SIZE		(1ULL << UBERBLOCK_SHIFT)
-
 #define	UBERBLOCK_MAGIC		0x00bab10c		/* oo-ba-bloc!	*/
-
 #define	UBERBLOCK_VERSION	1ULL
+#define	UBERBLOCK_SHIFT		10			/* up to 1K	*/
 
 struct uberblock {
 	uint64_t	ub_magic;	/* UBERBLOCK_MAGIC		*/
@@ -62,13 +57,6 @@
 	blkptr_t	ub_rootbp;	/* MOS objset_phys_t		*/
 };
 
-typedef struct uberblock_phys {
-	uberblock_t	ubp_uberblock;
-	char		ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
-	    sizeof (zio_block_tail_t)];
-	zio_block_tail_t ubp_zbt;
-} uberblock_phys_t;
-
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Sun Apr 02 00:47:06 2006 -0800
@@ -147,12 +147,9 @@
 	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
 	metaslab_group_t *vdev_mg;	/* metaslab group		*/
 	metaslab_t	**vdev_ms;	/* metaslab array		*/
-	space_map_obj_t	*vdev_smo;	/* metaslab space map array	*/
 	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
-	uint8_t		vdev_dirty[TXG_SIZE]; /* per-txg dirty flags	*/
-	uint8_t		vdev_is_dirty;	/* on config dirty list?	*/
 	uint8_t		vdev_reopen_wanted; /* async reopen wanted?	*/
 	list_node_t	vdev_dirty_node; /* config dirty list		*/
 
@@ -163,13 +160,13 @@
 	space_map_obj_t	vdev_dtl;	/* dirty time log on-disk state	*/
 	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
+	uint64_t	vdev_offline;	/* device taken offline?	*/
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
 	uint64_t	vdev_fault_arg; /* fault injection paramater	*/
 	int		vdev_fault_mask; /* zio types to fault		*/
 	uint8_t		vdev_fault_mode; /* fault injection mode	*/
 	uint8_t		vdev_cache_active; /* vdev_cache and vdev_queue	*/
-	uint8_t		vdev_offline;	/* device taken offline?	*/
 	uint8_t		vdev_tmpoffline; /* device taken offline temporarily? */
 	uint8_t		vdev_detached;	/* device detached?		*/
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
@@ -185,14 +182,21 @@
 	 * incorrect.
 	 */
 	kmutex_t	vdev_dtl_lock;	/* vdev_dtl_{map,resilver}	*/
-	kmutex_t	vdev_dirty_lock; /* vdev_dirty[]		*/
 	kmutex_t	vdev_stat_lock;	/* vdev_stat			*/
 };
 
 #define	VDEV_SKIP_SIZE		(8 << 10)
 #define	VDEV_BOOT_HEADER_SIZE	(8 << 10)
 #define	VDEV_PHYS_SIZE		(112 << 10)
-#define	VDEV_UBERBLOCKS		((128 << 10) >> UBERBLOCK_SHIFT)
+#define	VDEV_UBERBLOCK_RING	(128 << 10)
+
+#define	VDEV_UBERBLOCK_SHIFT(vd)	\
+	MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+#define	VDEV_UBERBLOCK_COUNT(vd)	\
+	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
+	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 
 #define	VDEV_BOOT_MAGIC		0x2f5b007b10c	/* ZFS boot block	*/
 #define	VDEV_BOOT_VERSION	1		/* version number	*/
@@ -211,13 +215,19 @@
 } vdev_phys_t;
 
 typedef struct vdev_label {
-	char			vl_pad[VDEV_SKIP_SIZE];		/*   8K	*/
-	vdev_boot_header_t	vl_boot_header;			/*   8K	*/
-	vdev_phys_t		vl_vdev_phys;			/* 112K	*/
-	uberblock_phys_t	vl_uberblock[VDEV_UBERBLOCKS];	/* 128K	*/
+	char		vl_pad[VDEV_SKIP_SIZE];			/*   8K	*/
+	vdev_boot_header_t vl_boot_header;			/*   8K	*/
+	vdev_phys_t	vl_vdev_phys;				/* 112K	*/
+	char		vl_uberblock[VDEV_UBERBLOCK_RING];	/* 128K	*/
 } vdev_label_t;							/* 256K total */
 
 /*
+ * vdev_dirty() flags
+ */
+#define	VDD_METASLAB	0x01
+#define	VDD_DTL		0x02
+
+/*
  * Size and offset of embedded boot loader region on each label.
  * The total size of the first two labels plus the boot area is 4MB.
  */
@@ -225,14 +235,6 @@
 #define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M	*/
 
 /*
- * vdev_dirty[] flags
- */
-#define	VDD_ALLOC	0x01	/* allocated from in this txg		*/
-#define	VDD_FREE	0x02	/* freed to in this txg			*/
-#define	VDD_ADD		0x04	/* added to the pool in this txg	*/
-#define	VDD_DTL		0x08	/* dirty time log entry in this txg	*/
-
-/*
  * Size of label regions at the start and end of each leaf device.
  */
 #define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
@@ -264,7 +266,7 @@
 extern int vdev_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
-extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
 
 /*
  * Available vdev types.
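
The fixed VDEV_UBERBLOCKS count is gone: the uberblock ring is still 128K per
label, but each slot is now sized by the top-level vdev's ashift, so
large-sector devices get fewer, larger uberblocks.  Working the macros above
through two common cases:

	vdev_ashift = 9 (512-byte sectors):
		VDEV_UBERBLOCK_SHIFT(vd) = MAX(9, 10)  = 10   -> 1K slots
		VDEV_UBERBLOCK_COUNT(vd) = (128 << 10) >> 10  = 128 slots

	vdev_ashift = 12 (4K sectors):
		VDEV_UBERBLOCK_SHIFT(vd) = MAX(12, 10) = 12   -> 4K slots
		VDEV_UBERBLOCK_COUNT(vd) = (128 << 10) >> 12  = 32 slots

Either way vl_uberblock stays 128K, so the 256K label layout is unchanged.
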
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Sun Apr 02 00:47:06 2006 -0800
@@ -125,6 +125,7 @@
 
 #define	ZIO_FLAG_RESILVER		0x01000
 #define	ZIO_FLAG_SCRUB			0x02000
+#define	ZIO_FLAG_SUBBLOCK		0x04000
 
 #define	ZIO_FLAG_NOBOOKMARK		0x10000
 
--- a/usr/src/uts/common/fs/zfs/vdev.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Sun Apr 02 00:47:06 2006 -0800
@@ -77,7 +77,7 @@
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize)
 {
-	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
+	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 	uint64_t csize;
 	uint64_t c;
 
@@ -299,7 +299,6 @@
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 
-	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
@@ -328,13 +327,12 @@
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+	space_map_unload(&vd->vdev_dtl_map);
 	space_map_destroy(&vd->vdev_dtl_map);
 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
 	space_map_destroy(&vd->vdev_dtl_scrub);
 	mutex_exit(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_dtl_lock);
-	mutex_destroy(&vd->vdev_dirty_lock);
 
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
@@ -352,7 +350,7 @@
 {
 	vdev_ops_t *ops;
 	char *type;
-	uint64_t guid = 0, offline = 0;
+	uint64_t guid = 0;
 	vdev_t *vd;
 
 	ASSERT(spa_config_held(spa, RW_WRITER));
@@ -401,6 +399,11 @@
 	    &vd->vdev_not_present);
 
 	/*
+	 * Get the alignment requirement.
+	 */
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 */
 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
@@ -408,24 +411,18 @@
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
-		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
-		    &vd->vdev_ashift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 	}
 
 	/*
-	 * If we're a leaf vdev, try to load the DTL object
-	 * and the offline state.
+	 * If we're a leaf vdev, try to load the DTL object and offline state.
 	 */
-	vd->vdev_offline = B_FALSE;
 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 		    &vd->vdev_dtl.smo_object);
-
-		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &offline)
-		    == 0)
-			vd->vdev_offline = offline;
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+		    &vd->vdev_offline);
 	}
 
 	/*
@@ -447,7 +444,7 @@
 	 */
 	vdev_close(vd);
 
-	ASSERT(!vd->vdev_is_dirty);
+	ASSERT(!list_link_active(&vd->vdev_dirty_node));
 
 	/*
 	 * Free all children.
@@ -499,13 +496,13 @@
 	svd->vdev_ms_count = 0;
 
 	tvd->vdev_mg = svd->vdev_mg;
-	tvd->vdev_mg->mg_vd = tvd;
 	tvd->vdev_ms = svd->vdev_ms;
-	tvd->vdev_smo = svd->vdev_smo;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_ms = NULL;
-	svd->vdev_smo = NULL;
+
+	if (tvd->vdev_mg != NULL)
+		tvd->vdev_mg->mg_vd = tvd;
 
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
@@ -520,11 +517,9 @@
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
-		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
-		svd->vdev_dirty[t] = 0;
 	}
 
-	if (svd->vdev_is_dirty) {
+	if (list_link_active(&svd->vdev_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
@@ -560,16 +555,17 @@
 	ASSERT(spa_config_held(spa, RW_WRITER));
 
 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+	mvd->vdev_asize = cvd->vdev_asize;
+	mvd->vdev_ashift = cvd->vdev_ashift;
+	mvd->vdev_state = cvd->vdev_state;
+
 	vdev_remove_child(pvd, cvd);
 	vdev_add_child(pvd, mvd);
 	cvd->vdev_id = mvd->vdev_children;
 	vdev_add_child(mvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
-	mvd->vdev_asize = cvd->vdev_asize;
-	mvd->vdev_ashift = cvd->vdev_ashift;
-	mvd->vdev_state = cvd->vdev_state;
-
 	if (mvd == mvd->vdev_top)
 		vdev_top_transfer(cvd, mvd);
 
@@ -590,6 +586,7 @@
 	ASSERT(mvd->vdev_children == 1);
 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
 	    mvd->vdev_ops == &vdev_replacing_ops);
+	cvd->vdev_ashift = mvd->vdev_ashift;
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
@@ -608,13 +605,13 @@
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
+	objset_t *mos = spa->spa_meta_objset;
 	metaslab_class_t *mc = spa_metaslab_class_select(spa);
-	uint64_t c;
+	uint64_t m;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
-	space_map_obj_t *smo = vd->vdev_smo;
-	metaslab_t **mspp = vd->vdev_ms;
-	int ret;
+	metaslab_t **mspp;
+	int error;
 
 	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
 		return (0);
@@ -623,77 +620,43 @@
 
 	ASSERT(oldc <= newc);
 
-	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
-	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+	if (vd->vdev_mg == NULL)
+		vd->vdev_mg = metaslab_group_create(mc, vd);
+
+	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+	if (oldc != 0) {
+		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+	}
+
+	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 
-	if (vd->vdev_mg == NULL) {
+	for (m = oldc; m < newc; m++) {
+		space_map_obj_t smo = { 0, 0, 0 };
 		if (txg == 0) {
-			dmu_buf_t *db;
-			uint64_t *ms_array;
-
-			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
-			    KM_SLEEP);
-
-			if ((ret = dmu_read(spa->spa_meta_objset,
-			    vd->vdev_ms_array, 0,
-			    newc * sizeof (uint64_t), ms_array)) != 0) {
-				kmem_free(ms_array, newc * sizeof (uint64_t));
-				goto error;
-			}
-
-			for (c = 0; c < newc; c++) {
-				if (ms_array[c] == 0)
-					continue;
-				if ((ret = dmu_bonus_hold(
-				    spa->spa_meta_objset, ms_array[c],
-				    FTAG, &db)) != 0) {
-					kmem_free(ms_array,
-					    newc * sizeof (uint64_t));
-					goto error;
-				}
-				ASSERT3U(db->db_size, ==, sizeof (*smo));
-				bcopy(db->db_data, &vd->vdev_smo[c],
-				    db->db_size);
-				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
-				    ms_array[c]);
+			uint64_t object = 0;
+			error = dmu_read(mos, vd->vdev_ms_array,
+			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
+			if (error)
+				return (error);
+			if (object != 0) {
+				dmu_buf_t *db;
+				error = dmu_bonus_hold(mos, object, FTAG, &db);
+				if (error)
+					return (error);
+				ASSERT3U(db->db_size, ==, sizeof (smo));
+				bcopy(db->db_data, &smo, db->db_size);
+				ASSERT3U(smo.smo_object, ==, object);
 				dmu_buf_rele(db, FTAG);
 			}
-			kmem_free(ms_array, newc * sizeof (uint64_t));
 		}
-		vd->vdev_mg = metaslab_group_create(mc, vd);
-	}
-
-	for (c = 0; c < oldc; c++) {
-		vd->vdev_smo[c] = smo[c];
-		vd->vdev_ms[c] = mspp[c];
-		mspp[c]->ms_smo = &vd->vdev_smo[c];
-	}
-
-	for (c = oldc; c < newc; c++)
-		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
-		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
-
-	if (oldc != 0) {
-		kmem_free(smo, oldc * sizeof (*smo));
-		kmem_free(mspp, oldc * sizeof (*mspp));
+		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
+		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
 	}
 
 	return (0);
-
-error:
-	/*
-	 * On error, undo any partial progress we may have made, and restore the
-	 * old metaslab values.
-	 */
-	kmem_free(vd->vdev_smo, newc * sizeof (*smo));
-	kmem_free(vd->vdev_ms, newc * sizeof (*mspp));
-
-	vd->vdev_smo = smo;
-	vd->vdev_ms = mspp;
-	vd->vdev_ms_count = oldc;
-
-	return (ret);
 }
 
 void
@@ -704,15 +667,11 @@
 
 	if (vd->vdev_ms != NULL) {
 		for (m = 0; m < count; m++)
-			metaslab_fini(vd->vdev_ms[m]);
+			if (vd->vdev_ms[m] != NULL)
+				metaslab_fini(vd->vdev_ms[m]);
 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 	}
-
-	if (vd->vdev_smo != NULL) {
-		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
-		vd->vdev_smo = NULL;
-	}
 }
 
 /*
@@ -726,7 +685,7 @@
 	int c;
 	uint64_t osize = 0;
 	uint64_t asize, psize;
-	uint64_t ashift = -1ULL;
+	uint64_t ashift = 0;
 
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
@@ -793,7 +752,7 @@
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 	} else {
-		if (osize < SPA_MINDEVSIZE -
+		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
@@ -808,14 +767,15 @@
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
+		 * For testing purposes, a higher ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
-		vd->vdev_ashift = ashift;
+		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
 	} else {
 		/*
 		 * Make sure the alignment requirement hasn't increased.
 		 */
-		if (ashift > vd->vdev_ashift) {
+		if (ashift > vd->vdev_top->vdev_ashift) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (EINVAL);
@@ -965,17 +925,18 @@
 }
 
 void
-vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
-	vdev_t *tvd = vd->vdev_top;
+	ASSERT(vd == vd->vdev_top);
+	ASSERT(ISP2(flags));
 
-	mutex_enter(&tvd->vdev_dirty_lock);
-	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
-		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
-		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
-		    tvd, txg);
-	}
-	mutex_exit(&tvd->vdev_dirty_lock);
+	if (flags & VDD_METASLAB)
+		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+	if (flags & VDD_DTL)
+		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
 void
@@ -1031,11 +992,8 @@
 		if (scrub_done)
 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
 		mutex_exit(&vd->vdev_dtl_lock);
-		if (txg != 0) {
-			vdev_t *tvd = vd->vdev_top;
-			vdev_dirty(tvd, VDD_DTL, txg);
-			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
-		}
+		if (txg != 0)
+			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 		return;
 	}
 
@@ -1068,6 +1026,7 @@
 {
 	spa_t *spa = vd->vdev_spa;
 	space_map_obj_t *smo = &vd->vdev_dtl;
+	objset_t *mos = spa->spa_meta_objset;
 	dmu_buf_t *db;
 	int error;
 
@@ -1076,16 +1035,15 @@
 	if (smo->smo_object == 0)
 		return (0);
 
-	if ((error = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object,
-	    FTAG, &db)) != 0)
+	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
 		return (error);
+
 	ASSERT3U(db->db_size, ==, sizeof (*smo));
 	bcopy(db->db_data, smo, db->db_size);
 	dmu_buf_rele(db, FTAG);
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
-	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
+	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (error);
@@ -1097,10 +1055,9 @@
 	spa_t *spa = vd->vdev_spa;
 	space_map_obj_t *smo = &vd->vdev_dtl;
 	space_map_t *sm = &vd->vdev_dtl_map;
+	objset_t *mos = spa->spa_meta_objset;
 	space_map_t smsync;
 	kmutex_t smlock;
-	avl_tree_t *t = &sm->sm_root;
-	space_seg_t *ss;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 
@@ -1111,28 +1068,26 @@
 
 	if (vd->vdev_detached) {
 		if (smo->smo_object != 0) {
-			int err = dmu_object_free(spa->spa_meta_objset,
-			    smo->smo_object, tx);
+			int err = dmu_object_free(mos, smo->smo_object, tx);
 			ASSERT3U(err, ==, 0);
 			smo->smo_object = 0;
 		}
 		dmu_tx_commit(tx);
+		dprintf("detach %s committed in txg %llu\n",
+		    vdev_description(vd), txg);
 		return;
 	}
 
 	if (smo->smo_object == 0) {
 		ASSERT(smo->smo_objsize == 0);
 		ASSERT(smo->smo_alloc == 0);
-		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
+		smo->smo_object = dmu_object_alloc(mos,
 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
 		ASSERT(smo->smo_object != 0);
 		vdev_config_dirty(vd->vdev_top);
 	}
 
-	VERIFY(0 == dmu_free_range(spa->spa_meta_objset, smo->smo_object,
-	    0, smo->smo_objsize, tx));
-
 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
 
 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
@@ -1141,21 +1096,18 @@
 	mutex_enter(&smlock);
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
-		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
+	space_map_walk(sm, space_map_add, &smsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
-	smo->smo_objsize = 0;
-	smo->smo_alloc = smsync.sm_space;
+	space_map_truncate(smo, mos, tx);
+	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
 
-	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
 	space_map_destroy(&smsync);
 
 	mutex_exit(&smlock);
 	mutex_destroy(&smlock);
 
-	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object,
-	    FTAG, &db));
+	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	ASSERT3U(db->db_size, ==, sizeof (*smo));
 	bcopy(smo, db->db_data, db->db_size);
@@ -1297,45 +1249,30 @@
 }
 
 void
-vdev_add_sync(vdev_t *vd, uint64_t txg)
-{
-	spa_t *spa = vd->vdev_spa;
-	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
-	ASSERT(vd == vd->vdev_top);
-
-	if (vd->vdev_ms_array == 0)
-		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
-		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
-
-	ASSERT(vd->vdev_ms_array != 0);
-
-	vdev_config_dirty(vd);
-
-	dmu_tx_commit(tx);
-}
-
-void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *lvd;
 	metaslab_t *msp;
-	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
-	uint8_t dirty = *dirtyp;
-
-	mutex_enter(&vd->vdev_dirty_lock);
-	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
-	mutex_exit(&vd->vdev_dirty_lock);
+	dmu_tx_t *tx;
 
 	dprintf("%s txg %llu pass %d\n",
 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
 
-	if (dirty & VDD_ADD)
-		vdev_add_sync(vd, txg);
+	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
+		ASSERT(vd == vd->vdev_top);
+		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+		ASSERT(vd->vdev_ms_array != 0);
+		vdev_config_dirty(vd);
+		dmu_tx_commit(tx);
+	}
 
-	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
+	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
 		metaslab_sync(msp, txg);
+		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+	}
 
 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
 		vdev_dtl_sync(lvd, txg);
@@ -1425,36 +1362,37 @@
 
 	dprintf("OFFLINE: %s\n", vdev_description(vd));
 
-	/* vdev is already offlined, do nothing */
-	if (vd->vdev_offline)
-		return (spa_vdev_exit(spa, NULL, txg, 0));
-
 	/*
-	 * If this device's top-level vdev has a non-empty DTL,
-	 * don't allow the device to be offlined.
-	 *
-	 * XXX -- we should make this more precise by allowing the offline
-	 * as long as the remaining devices don't have any DTL holes.
+	 * If the device isn't already offline, try to offline it.
 	 */
-	if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
-		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+	if (!vd->vdev_offline) {
+		/*
+		 * If this device's top-level vdev has a non-empty DTL,
+		 * don't allow the device to be offlined.
+		 *
+		 * XXX -- make this more precise by allowing the offline
+		 * as long as the remaining devices don't have any DTL holes.
+		 */
+		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+			return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
-	/*
-	 * Set this device to offline state and reopen its top-level vdev.
-	 * If this action results in the top-level vdev becoming unusable,
-	 * undo it and fail the request.
-	 */
-	vd->vdev_offline = B_TRUE;
-	vdev_reopen(vd->vdev_top);
-	if (vdev_is_dead(vd->vdev_top)) {
-		vd->vdev_offline = B_FALSE;
+		/*
+		 * Offline this device and reopen its top-level vdev.
+		 * If this action results in the top-level vdev becoming
+		 * unusable, undo it and fail the request.
+		 */
+		vd->vdev_offline = B_TRUE;
 		vdev_reopen(vd->vdev_top);
-		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+		if (vdev_is_dead(vd->vdev_top)) {
+			vd->vdev_offline = B_FALSE;
+			vdev_reopen(vd->vdev_top);
+			return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+		}
 	}
 
 	vd->vdev_tmpoffline = istmp;
-	if (!istmp)
-		vdev_config_dirty(vd->vdev_top);
+
+	vdev_config_dirty(vd->vdev_top);
 
 	return (spa_vdev_exit(spa, NULL, txg, 0));
 }
@@ -1613,11 +1551,9 @@
 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
 		}
 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
-			vdev_t *tvd = vd->vdev_top;
 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
 				return;
-			vdev_dirty(tvd, VDD_DTL, txg);
-			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
 		}
@@ -1788,10 +1724,8 @@
 	} else {
 		ASSERT(vd == vd->vdev_top);
 
-		if (!vd->vdev_is_dirty) {
+		if (!list_link_active(&vd->vdev_dirty_node))
 			list_insert_head(&spa->spa_dirty_list, vd);
-			vd->vdev_is_dirty = B_TRUE;
-		}
 	}
 }
 
@@ -1803,10 +1737,8 @@
 	ASSERT(spa_config_held(spa, RW_WRITER) ||
 	    dsl_pool_sync_context(spa_get_dsl(spa)));
 
-	ASSERT(vd->vdev_is_dirty);
-
+	ASSERT(list_link_active(&vd->vdev_dirty_node));
 	list_remove(&spa->spa_dirty_list, vd);
-	vd->vdev_is_dirty = B_FALSE;
 }
 
 /*
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c	Sun Apr 02 00:47:06 2006 -0800
@@ -48,6 +48,7 @@
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 {
 	vdev_disk_t *dvd;
+	struct dk_minfo dkm;
 	int error;
 
 	/*
@@ -153,27 +154,25 @@
 		return (EINVAL);
 	}
 
-	*ashift = SPA_MINBLOCKSHIFT;
-
-
+	/*
+	 * If we own the whole disk, try to enable disk write caching.
+	 * We ignore errors because it's OK if we can't do it.
+	 */
 	if (vd->vdev_wholedisk == 1) {
-
-		int wce, rc;
+		int wce = 1;
+		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
+		    FKIOCTL, kcred, NULL);
+	}
 
-		/*
-		 * Enable disk write caching if we own the whole disk.
-		 * Ignore errors as this is a performance optimization,
-		 * we work just fine w/o it.
-		 */
-		error = 0;
-		wce = 1;
-		rc = ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
-			FKIOCTL, kcred, &error);
+	/*
+	 * Determine the device's minimum transfer size.
+	 * If the ioctl isn't supported, assume DEV_BSIZE.
+	 */
+	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
+	    FKIOCTL, kcred, NULL) != 0)
+		dkm.dki_lbsize = DEV_BSIZE;
 
-		if (rc || error)
-			dprintf("%s: DKIOCSETWCE failed %d,%d",
-				vdev_description(vd), rc, error);
-	}
+	*ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
 
 	return (0);
 }
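
Instead of hard-coding SPA_MINBLOCKSHIFT, the disk vdev now derives ashift
from the media's logical block size.  For example, a device reporting
dki_lbsize = 4096 yields highbit(MAX(4096, SPA_MINBLOCKSIZE)) - 1 =
highbit(4096) - 1 = 12, so the vdev advertises 4K alignment; a 512-byte-sector
device, or one whose driver rejects DKIOCGMEDIAINFO (where DEV_BSIZE = 512 is
assumed), yields highbit(512) - 1 = 9, matching the old behavior.
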
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Sun Apr 02 00:47:06 2006 -0800
@@ -152,6 +152,8 @@
 uint64_t
 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
 {
+	ASSERT(offset < sizeof (vdev_label_t));
+
 	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
 	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
 }
@@ -253,14 +255,12 @@
 		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
 
 	} else {
-		if (!vd->vdev_tmpoffline) {
-		    if (vd->vdev_offline)
+		if (vd->vdev_offline && !vd->vdev_tmpoffline)
 			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
-				B_TRUE) == 0);
-		    else
+			    B_TRUE) == 0);
+		else
 			(void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE,
-				DATA_TYPE_UINT64);
-		}
+			    DATA_TYPE_UINT64);
 	}
 
 	return (nv);
@@ -314,7 +314,7 @@
 	nvlist_t *label;
 	vdev_phys_t *vp;
 	vdev_boot_header_t *vb;
-	uberblock_phys_t *ubphys;
+	uberblock_t *ub;
 	zio_t *zio;
 	int l, c, n;
 	char *buf;
@@ -411,10 +411,10 @@
 	/*
 	 * Initialize uberblock template.
 	 */
-	ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
-	bzero(ubphys, sizeof (uberblock_phys_t));
-	ubphys->ubp_uberblock = spa->spa_uberblock;
-	ubphys->ubp_uberblock.ub_txg = 0;
+	ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
+	bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+	*ub = spa->spa_uberblock;
+	ub->ub_txg = 0;
 
 	/*
 	 * Write everything in parallel.
@@ -432,19 +432,17 @@
 		    offsetof(vdev_label_t, vl_boot_header),
 		    sizeof (vdev_boot_header_t), NULL, NULL);
 
-		for (n = 0; n < VDEV_UBERBLOCKS; n++) {
-
-			vdev_label_write(zio, vd, l, ubphys,
-			    offsetof(vdev_label_t, vl_uberblock[n]),
-			    sizeof (uberblock_phys_t), NULL, NULL);
-
+		for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+			vdev_label_write(zio, vd, l, ub,
+			    VDEV_UBERBLOCK_OFFSET(vd, n),
+			    VDEV_UBERBLOCK_SIZE(vd), NULL, NULL);
 		}
 	}
 
 	error = zio_wait(zio);
 
 	nvlist_free(label);
-	zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+	zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
 	zio_buf_free(vb, sizeof (vdev_boot_header_t));
 	zio_buf_free(vp, sizeof (vdev_phys_t));
 
@@ -486,12 +484,11 @@
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
-	uberblock_phys_t *ubphys = zio->io_data;
-	uberblock_t *ub = &ubphys->ubp_uberblock;
+	uberblock_t *ub = zio->io_data;
 	uberblock_t *ubbest = zio->io_private;
 	spa_t *spa = zio->io_spa;
 
-	ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t));
+	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
 
 	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 		mutex_enter(&spa->spa_uberblock_lock);
@@ -518,11 +515,11 @@
 		return;
 
 	for (l = 0; l < VDEV_LABELS; l++) {
-		for (n = 0; n < VDEV_UBERBLOCKS; n++) {
+		for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
 			vdev_label_read(zio, vd, l,
-			    zio_buf_alloc(sizeof (uberblock_phys_t)),
-			    offsetof(vdev_label_t, vl_uberblock[n]),
-			    sizeof (uberblock_phys_t),
+			    zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
+			    VDEV_UBERBLOCK_OFFSET(vd, n),
+			    VDEV_UBERBLOCK_SIZE(vd),
 			    vdev_uberblock_load_done, ubbest);
 		}
 	}
@@ -542,13 +539,12 @@
 }
 
 static void
-vdev_uberblock_sync(zio_t *zio, uberblock_phys_t *ubphys, vdev_t *vd,
-	uint64_t txg)
+vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg)
 {
 	int l, c, n;
 
 	for (c = 0; c < vd->vdev_children; c++)
-		vdev_uberblock_sync(zio, ubphys, vd->vdev_child[c], txg);
+		vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
@@ -556,36 +552,38 @@
 	if (vdev_is_dead(vd))
 		return;
 
-	n = txg & (VDEV_UBERBLOCKS - 1);
+	n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
 
-	ASSERT(ubphys->ubp_uberblock.ub_txg == txg);
+	ASSERT(ub->ub_txg == txg);
 
 	for (l = 0; l < VDEV_LABELS; l++)
-		vdev_label_write(zio, vd, l, ubphys,
-		    offsetof(vdev_label_t, vl_uberblock[n]),
-		    sizeof (uberblock_phys_t), vdev_uberblock_sync_done, NULL);
+		vdev_label_write(zio, vd, l, ub,
+		    VDEV_UBERBLOCK_OFFSET(vd, n),
+		    VDEV_UBERBLOCK_SIZE(vd),
+		    vdev_uberblock_sync_done, NULL);
 
 	dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
 }
 
 static int
-vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *uvd, uint64_t txg)
+vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg)
 {
-	uberblock_phys_t *ubphys;
+	uberblock_t *ubbuf;
+	size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE;
 	uint64_t *good_writes;
 	zio_t *zio;
 	int error;
 
-	ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
-	bzero(ubphys, sizeof (uberblock_phys_t));
-	ubphys->ubp_uberblock = *ub;
+	ubbuf = zio_buf_alloc(size);
+	bzero(ubbuf, size);
+	*ubbuf = *ub;
 
 	good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 
 	zio = zio_root(spa, NULL, good_writes,
 	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
 
-	vdev_uberblock_sync(zio, ubphys, uvd, txg);
+	vdev_uberblock_sync(zio, ubbuf, vd, txg);
 
 	error = zio_wait(zio);
 
@@ -602,7 +600,7 @@
 		error = EIO;
 
 	kmem_free(good_writes, sizeof (uint64_t));
-	zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+	zio_buf_free(ubbuf, size);
 
 	return (error);
 }
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Sun Apr 02 00:47:06 2006 -0800
@@ -80,7 +80,7 @@
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
-		*ashift = cvd->vdev_ashift;
+		*ashift = MAX(*ashift, cvd->vdev_ashift);
 	}
 
 	if (numerrors == vd->vdev_children) {
@@ -129,6 +129,13 @@
 	mm->mm_skipped = 0;
 }
 
+static void
+vdev_mirror_repair_done(zio_t *zio)
+{
+	ASSERT(zio->io_private == zio->io_parent);
+	vdev_mirror_map_free(zio->io_private);
+}
+
 /*
  * Try to find a child whose DTL doesn't contain the block we want to read.
  * If we can't, try the read on any vdev we haven't already tried.
@@ -341,9 +348,18 @@
 
 	if (good_copies && (spa_mode & FWRITE) &&
 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		zio_t *rio;
+
 		/*
 		 * Use the good data we have in hand to repair damaged children.
+		 *
+		 * We issue all repair I/Os as children of 'rio' to arrange
+		 * that vdev_mirror_map_free(zio) will be invoked after all
+		 * repairs complete, but before we advance to the next stage.
 		 */
+		rio = zio_null(zio, zio->io_spa,
+		    vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
+
 		for (c = 0; c < vd->vdev_children; c++) {
 			/*
 			 * Don't rewrite known good children.
@@ -368,12 +384,16 @@
 			    vdev_description(cvd),
 			    zio->io_offset, mm[c].mm_error);
 
-			zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd,
 			    zio->io_offset, zio->io_data, zio->io_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
 			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
 		}
+
+		zio_nowait(rio);
+		zio_wait_children_done(zio);
+		return;
 	}
 
 	vdev_mirror_map_free(zio);
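
The repair writes above are no longer direct children of the original read
zio; they are children of a null zio ('rio') whose done callback frees the
mirror map, and the read zio then re-enters zio_wait_children_done().  That
guarantees the map is freed only after every repair write completes, but
before the read advances to its next stage.  A hedged sketch of the ordering,
with hypothetical helper names (my_map_free, child_is_good,
repair_damaged_children); the raidz change below uses the same structure:

	#include <sys/zio.h>
	#include <sys/vdev_impl.h>

	static void
	repair_done(zio_t *rio)
	{
		my_map_free(rio->io_private);	/* io_private == original zio */
	}

	static void
	repair_damaged_children(zio_t *zio)
	{
		zio_t *rio = zio_null(zio, zio->io_spa, repair_done, zio,
		    ZIO_FLAG_CANFAIL);
		int c;

		for (c = 0; c < zio->io_vd->vdev_children; c++) {
			if (child_is_good(zio, c))	/* hypothetical test */
				continue;
			zio_nowait(zio_vdev_child_io(rio, zio->io_bp,
			    zio->io_vd->vdev_child[c], zio->io_offset,
			    zio->io_data, zio->io_size, ZIO_TYPE_WRITE,
			    zio->io_priority, ZIO_FLAG_IO_REPAIR |
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE,
			    NULL, NULL));
		}

		zio_nowait(rio);		/* repair_done fires last */
		zio_wait_children_done(zio);	/* parent resumes after rio */
	}
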
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Sun Apr 02 00:47:06 2006 -0800
@@ -206,7 +206,7 @@
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
-		*ashift = cvd->vdev_ashift;
+		*ashift = MAX(*ashift, cvd->vdev_ashift);
 	}
 
 	*asize *= vd->vdev_children;
@@ -232,11 +232,12 @@
 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize;
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t cols = vd->vdev_children;
 
-	asize = psize >> vd->vdev_ashift;
+	asize = ((psize - 1) >> ashift) + 1;
 	asize += (asize + cols - 2) / (cols - 1);
-	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;
+	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift;
 
 	return (asize);
 }
@@ -254,28 +255,28 @@
 static void
 vdev_raidz_repair_done(zio_t *zio)
 {
-	zio_buf_free(zio->io_data, zio->io_size);
+	ASSERT(zio->io_private == zio->io_parent);
+	vdev_raidz_map_free(zio->io_private);
 }
 
 static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
 	vdev_t *cvd;
 	blkptr_t *bp = zio->io_bp;
 	raidz_map_t *rm;
 	raidz_col_t *rc;
 	int c;
 
-	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children);
+	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children);
 
 	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
 		ASSERT3U(rm->rm_asize, ==,
 		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
-		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
 	} else {
 		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
-		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -549,34 +550,40 @@
 
 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		zio_t *rio;
+
 		/*
 		 * Use the good data we have in hand to repair damaged children.
+		 *
+		 * We issue all repair I/Os as children of 'rio' to arrange
+		 * that vdev_raidz_map_free(zio) will be invoked after all
+		 * repairs complete, but before we advance to the next stage.
 		 */
+		rio = zio_null(zio, zio->io_spa,
+		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
+
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_col];
 
-			if (rc->rc_error) {
-				/*
-				 * Make a copy of the data because we're
-				 * going to free the RAID-Z map below.
-				 */
-				void *data = zio_buf_alloc(rc->rc_size);
-				bcopy(rc->rc_data, data, rc->rc_size);
+			if (rc->rc_error == 0)
+				continue;
+
+			dprintf("%s resilvered %s @ 0x%llx error %d\n",
+			    vdev_description(vd),
+			    vdev_description(cvd),
+			    zio->io_offset, rc->rc_error);
 
-				dprintf("%s resilvered %s @ 0x%llx error %d\n",
-				    vdev_description(vd),
-				    vdev_description(cvd),
-				    zio->io_offset, rc->rc_error);
+			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+		}
 
-				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-				    rc->rc_offset, data, rc->rc_size,
-				    ZIO_TYPE_WRITE, zio->io_priority,
-				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
-				    ZIO_FLAG_DONT_PROPAGATE,
-				    vdev_raidz_repair_done, NULL));
-			}
-		}
+		zio_nowait(rio);
+		zio_wait_children_done(zio);
+		return;
 	}
 
 	vdev_raidz_map_free(zio);
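
vdev_raidz_asize() now computes in units of the top-level vdev's ashift and
rounds the payload up to whole sectors before adding parity, so a payload
smaller than one sector still consumes a full data sector (the old
psize >> ashift form would have computed zero data sectors in that case).
Worked through for a 3-way raidz:

	psize = 4096, ashift = 9  (512-byte sectors):
		data   = ((4096 - 1) >> 9) + 1   = 8 sectors
		parity = (8 + 3 - 2) / (3 - 1)   = 4 sectors (integer division)
		total  = 12 sectors

	psize = 4096, ashift = 12 (4K sectors):
		data   = ((4096 - 1) >> 12) + 1  = 1 sector
		parity = (1 + 3 - 2) / (3 - 1)   = 1 sector
		total  = 2 sectors

The total is then rounded up to VDEV_RAIDZ_ALIGN columns and shifted back to
bytes by << ashift.
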
--- a/usr/src/uts/common/fs/zfs/vdev_root.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c	Sun Apr 02 00:47:06 2006 -0800
@@ -54,14 +54,14 @@
 			lasterror = error;
 			continue;
 		}
-
-		*asize += cvd->vdev_asize;
-		*ashift = MAX(*ashift, cvd->vdev_ashift);
 	}
 
 	if (lasterror)
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 
+	*asize = 0;
+	*ashift = 0;
+
 	return (lasterror);
 }
 
--- a/usr/src/uts/common/fs/zfs/zio.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Sun Apr 02 00:47:06 2006 -0800
@@ -762,10 +762,9 @@
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
-		if (zio->io_error != ECKSUM && zio->io_vd &&
-		    !vdev_is_dead(zio->io_vd))
+		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
 			zfs_ereport_post(FM_EREPORT_ZFS_IO,
-			    zio->io_spa, zio->io_vd, zio, 0, 0);
+			    zio->io_spa, vd, zio, 0, 0);
 
 		if ((zio->io_error == EIO ||
 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
@@ -1238,7 +1237,7 @@
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	metaslab_free(zio->io_spa, dva, zio->io_txg);
+	metaslab_free(zio->io_spa, dva, zio->io_txg, B_FALSE);
 
 	BP_ZERO(bp);
 
@@ -1288,9 +1287,11 @@
 zio_vdev_io_setup(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+	uint64_t align = 1ULL << tvd->vdev_ashift;
 
 	/* XXPOLICY */
-	if (zio->io_retries == 0 && vd == vd->vdev_top)
+	if (zio->io_retries == 0 && vd == tvd)
 		zio->io_flags |= ZIO_FLAG_FAILFAST;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
@@ -1298,6 +1299,19 @@
 		zio->io_offset += VDEV_LABEL_START_SIZE;
 	}
 
+	if (P2PHASE(zio->io_size, align) != 0) {
+		uint64_t asize = P2ROUNDUP(zio->io_size, align);
+		char *abuf = zio_buf_alloc(asize);
+		ASSERT(vd == tvd);
+		if (zio->io_type == ZIO_TYPE_WRITE) {
+			bcopy(zio->io_data, abuf, zio->io_size);
+			bzero(abuf + zio->io_size, asize - zio->io_size);
+		}
+		zio_push_transform(zio, abuf, asize, asize);
+		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
+		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
+	}
+
 	zio_next_stage(zio);
 }
 
@@ -1305,10 +1319,12 @@
 zio_vdev_io_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
+	uint64_t align = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 
-	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
-	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
-	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
+	ASSERT(P2PHASE(zio->io_offset, align) == 0);
+	ASSERT(P2PHASE(zio->io_size, align) == 0);
+	ASSERT(bp == NULL ||
+	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
 
 	vdev_io_start(zio);
@@ -1350,6 +1366,17 @@
 
 	ASSERT(zio->io_vsd == NULL);
 
+	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
+		void *abuf;
+		uint64_t asize;
+		ASSERT(vd == tvd);
+		zio_pop_transform(zio, &abuf, &asize, &asize);
+		if (zio->io_type == ZIO_TYPE_READ)
+			bcopy(abuf, zio->io_data, zio->io_size);
+		zio_buf_free(abuf, asize);
+		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
+	}
+
 	if (zio_injection_enabled && !zio->io_error)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
@@ -1660,7 +1687,7 @@
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
-	metaslab_free(spa, BP_IDENTITY(bp), txg);
+	metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE);
 
 	spa_config_exit(spa, FTAG);
 }
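
The new ZIO_FLAG_SUBBLOCK path is what lets ZFS issue logically small I/Os to
large-sector devices: any I/O whose size is not a multiple of the top-level
vdev's alignment goes through a bounce buffer rounded up to that alignment.
On a 4K device (ashift = 12), a 1536-byte write has P2PHASE(1536, 4096) != 0,
so a 4096-byte buffer is allocated, the 1536 bytes of data are copied in and
the remaining 2560 bytes zeroed, and the device sees one aligned 4K write; on
completion a read pops the transform, copies only the first 1536 bytes back
into io_data, and frees the bounce buffer.  The relaxed assertions in the
I/O-start hunk reflect this: io_size may now be the rounded-up size rather
than the block pointer's I/O size.
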
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Sat Apr 01 21:50:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Sun Apr 02 00:47:06 2006 -0800
@@ -128,7 +128,7 @@
 	    BP_GET_CHECKSUM(bp);
 	int byteswap = BP_SHOULD_BYTESWAP(bp);
 	void *data = zio->io_data;
-	uint64_t size = zio->io_size;
+	uint64_t size = ZIO_GET_IOSIZE(zio);
 	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t actual_cksum, expected_cksum;