changeset 1807:35c8b566d7af

6410711 intent log blocks don't get invited to pool parties
author bonwick
date Thu, 13 Apr 2006 16:15:06 -0700
parents 7f3c457c93fd
children a9c9c8edb499
files usr/src/cmd/zdb/zdb.c usr/src/cmd/zdb/zdb_il.c usr/src/cmd/ztest/ztest.c usr/src/uts/common/fs/zfs/dmu_objset.c usr/src/uts/common/fs/zfs/dmu_traverse.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/zil.h usr/src/uts/common/fs/zfs/sys/zil_impl.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_mirror.c usr/src/uts/common/fs/zfs/vdev_queue.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zio.c
diffstat 18 files changed, 448 insertions(+), 308 deletions(-)
--- a/usr/src/cmd/zdb/zdb.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Thu Apr 13 16:15:06 2006 -0700
@@ -1468,8 +1468,6 @@
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	zdb_count_block(spa, zcb, bp, type);
-
 	if (dump_opt['b'] >= 4) {
 		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
 		(void) printf("objset %llu object %llu offset 0x%llx %s\n",
@@ -1480,6 +1478,8 @@
 		    blkbuf);
 	}
 
+	zdb_count_block(spa, zcb, bp, type);
+
 	return (0);
 }
 
--- a/usr/src/cmd/zdb/zdb_il.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/cmd/zdb/zdb_il.c	Thu Apr 13 16:15:06 2006 -0700
@@ -43,7 +43,7 @@
 extern uint8_t dump_opt[256];
 
 static void
-print_log_bp(blkptr_t *bp, const char *prefix)
+print_log_bp(const blkptr_t *bp, const char *prefix)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 
@@ -130,13 +130,13 @@
 		} else {
 			zbookmark_t zb;
 
-			ASSERT3U(bp->blk_cksum.zc_word[2], ==,
+			ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==,
 			    dmu_objset_id(zilog->zl_os));
 
-			zb.zb_objset = bp->blk_cksum.zc_word[2];
+			zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
 			zb.zb_object = 0;
 			zb.zb_level = -1;
-			zb.zb_blkid = bp->blk_cksum.zc_word[3];
+			zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
 
 			error = zio_wait(zio_read(NULL, zilog->zl_spa,
 			    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
@@ -300,7 +300,7 @@
 		claim = "won't claim";
 
 	(void) printf("\tBlock seqno %llu, %s%s\n",
-	    (u_longlong_t)bp->blk_cksum.zc_word[3], claim, blkbuf);
+	    (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
 }
 
 static void
@@ -329,7 +329,7 @@
 void
 dump_intent_log(zilog_t *zilog)
 {
-	zil_header_t *zh = zilog->zl_header;
+	const zil_header_t *zh = zilog->zl_header;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int i;
 
@@ -347,7 +347,7 @@
 
 	if (verbose >= 2) {
 		(void) printf("\n");
-		zil_parse(zilog, print_log_block, print_log_record, NULL,
+		(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
 		    zh->zh_claim_txg);
 		print_log_stats(verbose);
 	}
--- a/usr/src/cmd/ztest/ztest.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Thu Apr 13 16:15:06 2006 -0700
@@ -1216,7 +1216,7 @@
 	/*
 	 * Put a random number of objects in there.
 	 */
-	objects = ztest_random(50);
+	objects = ztest_random(20);
 	seq = 0;
 	while (objects-- != 0) {
 		uint64_t object;
@@ -1237,7 +1237,7 @@
 		if (ztest_random(5) == 0) {
 			zil_commit(zilog, seq, FSYNC);
 		}
-		if (ztest_random(5) == 0) {
+		if (ztest_random(100) == 0) {
 			error = zil_suspend(zilog);
 			if (error == 0) {
 				zil_resume(zilog);
@@ -2670,13 +2670,14 @@
 ztest_obliterate_one_disk(uint64_t vdev)
 {
 	int fd;
-	char dev_name[MAXPATHLEN];
+	char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
 	size_t fsize;
 
 	if (zopt_maxfaults < 2)
 		return;
 
 	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+	(void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
 
 	fd = open(dev_name, O_RDWR);
 
@@ -2687,12 +2688,13 @@
 	 * Determine the size.
 	 */
 	fsize = lseek(fd, 0, SEEK_END);
+
 	(void) close(fd);
 
 	/*
-	 * Remove it.
+	 * Rename the old device to dev_name.old (useful for debugging).
 	 */
-	VERIFY(remove(dev_name) == 0);
+	VERIFY(rename(dev_name, copy_name) == 0);
 
 	/*
 	 * Create a new one.
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Apr 13 16:15:06 2006 -0700
@@ -541,7 +541,7 @@
 	 */
 	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
 	if (error == 0) {
-		zil_destroy(dmu_objset_zil(os));
+		zil_destroy(dmu_objset_zil(os), B_FALSE);
 		dmu_objset_close(os);
 	}
 
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Thu Apr 13 16:15:06 2006 -0700
@@ -484,7 +484,7 @@
 
 	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
 		zb->zb_object = 0;
-		zb->zb_blkid = bp->blk_cksum.zc_word[3];
+		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
 		bc->bc_blkptr = *bp;
 		(void) traverse_callback(th, zseg, bc);
 	}
@@ -539,7 +539,7 @@
 
 	zilog = zil_alloc(dp->dp_meta_objset, zh);
 
-	zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
 	    claim_txg);
 
 	zil_free(zilog);
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Thu Apr 13 16:15:06 2006 -0700
@@ -593,52 +593,6 @@
 	mutex_exit(&msp->ms_lock);
 }
 
-/*
- * Intent log support: upon opening the pool after a crash, notify the SPA
- * of blocks that the intent log has allocated for immediate write, but
- * which are still considered free by the SPA because the last transaction
- * group didn't commit yet.
- */
-int
-metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
-{
-	uint64_t vdev = DVA_GET_VDEV(dva);
-	uint64_t offset = DVA_GET_OFFSET(dva);
-	uint64_t size = DVA_GET_ASIZE(dva);
-	vdev_t *vd;
-	metaslab_t *msp;
-	int error;
-
-	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
-		return (ENXIO);
-
-	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
-		return (ENXIO);
-
-	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
-	if (DVA_GET_GANG(dva))
-		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
-	mutex_enter(&msp->ms_lock);
-
-	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
-	if (error) {
-		mutex_exit(&msp->ms_lock);
-		return (error);
-	}
-
-	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
-		vdev_dirty(vd, VDD_METASLAB, msp, txg);
-
-	space_map_claim(&msp->ms_map, offset, size);
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
-
-	mutex_exit(&msp->ms_lock);
-
-	return (0);
-}
-
 static uint64_t
 metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
@@ -735,7 +689,7 @@
  * Allocate a block for the specified i/o.
  */
 static int
-metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
     dva_t *hintdva, uint64_t txg)
 {
 	metaslab_group_t *mg, *rotor;
@@ -747,6 +701,8 @@
 	uint64_t asize;
 	uint64_t distance;
 
+	ASSERT(!DVA_IS_VALID(&dva[d]));
+
 	mc = spa_metaslab_class_select(spa);
 
 	/*
@@ -854,41 +810,12 @@
 	return (ENOSPC);
 }
 
-int
-metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies,
-    uint64_t txg, blkptr_t *hintbp)
-{
-	int d, error;
-	dva_t *dva = bp->blk_dva;
-	dva_t *hintdva = hintbp->blk_dva;
-
-	ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa));
-	ASSERT(BP_GET_NDVAS(bp) == 0);
-	ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp));
-
-	for (d = 0; d < ncopies; d++) {
-		error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg);
-		if (error) {
-			for (d--; d >= 0; d--) {
-				ASSERT(DVA_IS_VALID(&dva[d]));
-				metaslab_free(spa, &dva[d], txg, B_TRUE);
-				bzero(&dva[d], sizeof (dva_t));
-			}
-			return (ENOSPC);
-		}
-	}
-	ASSERT(error == 0);
-	ASSERT(BP_GET_NDVAS(bp) == ncopies);
-
-	return (0);
-}
-
 /*
  * Free the block represented by DVA in the context of the specified
  * transaction group.
  */
-void
-metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now)
+static void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
@@ -896,19 +823,15 @@
 	vdev_t *vd;
 	metaslab_t *msp;
 
+	ASSERT(DVA_IS_VALID(dva));
+
 	if (txg > spa_freeze_txg(spa))
 		return;
 
-	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
-		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
-		    (u_longlong_t)vdev);
-		ASSERT(0);
-		return;
-	}
-
-	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
-		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
-		    (u_longlong_t)offset);
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
+		    (u_longlong_t)vdev, (u_longlong_t)offset);
 		ASSERT(0);
 		return;
 	}
@@ -932,3 +855,108 @@
 
 	mutex_exit(&msp->ms_lock);
 }
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	vdev_t *vd;
+	metaslab_t *msp;
+	int error;
+
+	ASSERT(DVA_IS_VALID(dva));
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+		return (ENXIO);
+
+	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+	if (DVA_GET_GANG(dva))
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	mutex_enter(&msp->ms_lock);
+
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+	if (error) {
+		mutex_exit(&msp->ms_lock);
+		return (error);
+	}
+
+	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		vdev_dirty(vd, VDD_METASLAB, msp, txg);
+
+	space_map_claim(&msp->ms_map, offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+	mutex_exit(&msp->ms_lock);
+
+	return (0);
+}
+
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
+    uint64_t txg, blkptr_t *hintbp)
+{
+	dva_t *dva = bp->blk_dva;
+	dva_t *hintdva = hintbp->blk_dva;
+	int d;
+	int error = 0;
+
+	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+	ASSERT(BP_GET_NDVAS(bp) == 0);
+	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+
+	for (d = 0; d < ndvas; d++) {
+		error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, txg);
+		if (error) {
+			for (d--; d >= 0; d--) {
+				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+				bzero(&dva[d], sizeof (dva_t));
+			}
+			return (error);
+		}
+	}
+	ASSERT(error == 0);
+	ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+	return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	int d;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	for (d = 0; d < ndvas; d++)
+		metaslab_free_dva(spa, &dva[d], txg, now);
+}
+
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	int d, error;
+	int last_error = 0;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	for (d = 0; d < ndvas; d++)
+		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+			last_error = error;
+
+	return (last_error);
+}
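
With this refactor the per-DVA routines (metaslab_alloc_dva, metaslab_free_dva, metaslab_claim_dva) are private to metaslab.c, and the public entry points operate on whole block pointers. A minimal usage sketch, assuming a hypothetical caller named example_claim_and_free (cf. zio_dva_claim() and zio_dva_free() in zio.c below):

	int
	example_claim_and_free(spa_t *spa, blkptr_t *bp, uint64_t txg)
	{
		int error;

		/* claim every DVA in bp; returns the last per-DVA error */
		if ((error = metaslab_claim(spa, bp, txg)) != 0)
			return (error);

		/* later, free every DVA in bp in the context of txg */
		metaslab_free(spa, bp, txg, B_FALSE);

		return (0);
	}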
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Thu Apr 13 16:15:06 2006 -0700
@@ -426,7 +426,7 @@
 	error = zap_lookup(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
 	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
-	if (error != 0 &&error != ENOENT) {
+	if (error != 0 && error != ENOENT) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		error = EIO;
@@ -1530,7 +1530,7 @@
 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
 
-	flags |= ZIO_FLAG_CANFAIL;
+	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
 
 	zio_nowait(zio_read(NULL, spa, bp, data, size,
 	    spa_scrub_io_done, NULL, priority, flags, zb));
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Apr 13 16:15:06 2006 -0700
@@ -616,7 +616,7 @@
 }
 
 void
-sprintf_blkptr(char *buf, int len, blkptr_t *bp)
+sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
 {
 	int d;
 
@@ -637,7 +637,7 @@
 	    (u_longlong_t)BP_GET_PSIZE(bp));
 
 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		dva_t *dva = &bp->blk_dva[d];
+		const dva_t *dva = &bp->blk_dva[d];
 		(void) snprintf(buf + strlen(buf), len - strlen(buf),
 		    "DVA[%d]=<%llu:%llx:%llx> ", d,
 		    (u_longlong_t)DVA_GET_VDEV(dva),
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Thu Apr 13 16:15:06 2006 -0700
@@ -49,8 +49,9 @@
 
 extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
     int ncopies, uint64_t txg, blkptr_t *hintbp);
-extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now);
-extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
+extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
+    boolean_t now);
+extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
 
 extern metaslab_class_t *metaslab_class_create(void);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Thu Apr 13 16:15:06 2006 -0700
@@ -407,7 +407,7 @@
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp);
+extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern void spa_upgrade(spa_t *spa);
 extern void spa_evict_all(void);
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Thu Apr 13 16:15:06 2006 -0700
@@ -57,7 +57,8 @@
 	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
 	uint64_t zh_replay_seq;	/* highest replayed sequence number */
 	blkptr_t zh_log;	/* log chain */
-	uint64_t zit_pad[6];
+	uint64_t zh_claim_seq;	/* highest claimed sequence number */
+	uint64_t zh_pad[5];
 } zil_header_t;
 
 /*
@@ -80,6 +81,14 @@
 #define	ZIL_BLK_DATA_SZ(lwb)	((lwb)->lwb_sz - sizeof (zil_trailer_t))
 
 /*
+ * The words of a log block checksum.
+ */
+#define	ZIL_ZC_GUID_0	0
+#define	ZIL_ZC_GUID_1	1
+#define	ZIL_ZC_OBJSET	2
+#define	ZIL_ZC_SEQ	3
+
+/*
  * Intent log transaction types and record structures
  */
 #define	TX_CREATE	1		/* Create file */
@@ -208,7 +217,7 @@
 typedef int zil_replay_func_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf);
 
-extern void	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
 
 extern void	zil_init(void);
@@ -222,7 +231,7 @@
 
 extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,
     zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *));
-extern void	zil_destroy(zilog_t *zilog);
+extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 
 extern itx_t	*zil_itx_create(int txtype, size_t lrsize);
 extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
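
The ZIL_ZC_* constants above name the four 64-bit words of a log block's embedded checksum: a two-word chain GUID, the owning objset id, and the block's sequence number. A minimal sketch of deriving the checksum verifier expected in the next block of the chain (example_next_blk_cksum is hypothetical; the same two steps appear in zil_read_log_block() and zil_lwb_write_start() in zil.c below):

	static void
	example_next_blk_cksum(const blkptr_t *bp, zio_cksum_t *zc)
	{
		*zc = bp->blk_cksum;		/* GUID and objset id carry over */
		zc->zc_word[ZIL_ZC_SEQ]++;	/* sequence numbers are sequential */
	}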
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Thu Apr 13 16:15:06 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -75,7 +74,7 @@
 	kmutex_t	zl_lock;	/* protects most zilog_t fields */
 	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */
 	spa_t		*zl_spa;	/* handle for read/write log */
-	zil_header_t	*zl_header;	/* log header buffer */
+	const zil_header_t *zl_header;	/* log header buffer */
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	uint64_t	zl_itx_seq;	/* itx sequence number */
@@ -85,6 +84,9 @@
 	uint32_t	zl_suspend;	/* log suspend count */
 	kcondvar_t	zl_cv_write;	/* for waiting to write to log */
 	kcondvar_t	zl_cv_seq;	/* for committing a sequence */
+	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
+	uint8_t		zl_suspending;	/* log is currently suspending */
+	uint8_t		zl_keep_first;	/* keep first log block in destroy */
 	uint8_t		zl_stop_replay;	/* don't replay any further */
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
@@ -97,7 +99,6 @@
 	list_t		zl_vdev_list;	/* list of [vdev, seq] pairs */
 	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
 	avl_tree_t	zl_dva_tree;	/* track DVAs during log parse */
-	kmutex_t	zl_destroy_lock; /* serializes zil_destroy() calls */
 };
 
 typedef struct zil_dva_node {
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Thu Apr 13 16:15:06 2006 -0700
@@ -125,7 +125,8 @@
 
 #define	ZIO_FLAG_RESILVER		0x01000
 #define	ZIO_FLAG_SCRUB			0x02000
-#define	ZIO_FLAG_SUBBLOCK		0x04000
+#define	ZIO_FLAG_SCRUB_THREAD		0x04000
+#define	ZIO_FLAG_SUBBLOCK		0x08000
 
 #define	ZIO_FLAG_NOBOOKMARK		0x10000
 
@@ -137,7 +138,8 @@
 	ZIO_FLAG_IO_REPAIR |		\
 	ZIO_FLAG_SPECULATIVE |		\
 	ZIO_FLAG_RESILVER |		\
-	ZIO_FLAG_SCRUB)
+	ZIO_FLAG_SCRUB |		\
+	ZIO_FLAG_SCRUB_THREAD)
 
 #define	ZIO_FLAG_VDEV_INHERIT		\
 	(ZIO_FLAG_GANG_INHERIT |	\
@@ -282,8 +284,7 @@
     uint64_t size, void *data, int checksum,
     zio_done_func_t *done, void *private, int priority, int flags);
 
-extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size,
-    blkptr_t *bp, uint64_t txg);
+extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg);
 extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
 
 extern int zio_wait(zio_t *zio);
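
zio_alloc_blk() loses its checksum argument because intent log blocks are its only consumers; the function now stamps ZIO_CHECKSUM_ZILOG and DMU_OT_INTENT_LOG on the block itself. A sketch of the new calling convention, assuming the zil_create() context shown in zil.c below:

	blkptr_t blk;

	if (zio_alloc_blk(spa, ZIL_MIN_BLKSZ, &blk, txg) == 0)
		zil_init_log_chain(zilog, &blk);	/* GUID, objset id, seq = 1 */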
--- a/usr/src/uts/common/fs/zfs/vdev.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Thu Apr 13 16:15:06 2006 -0700
@@ -1502,7 +1502,7 @@
 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
 		    zio->io_delegate_list == NULL) {
 			mutex_enter(&vd->vdev_stat_lock);
-			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+			if (flags & ZIO_FLAG_SCRUB_THREAD)
 				vs->vs_scrub_repaired += zio->io_size;
 			else
 				vs->vs_self_healed += zio->io_size;
@@ -1530,7 +1530,7 @@
 	if (type == ZIO_TYPE_WRITE) {
 		if (txg == 0 || vd->vdev_children != 0)
 			return;
-		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+		if (flags & ZIO_FLAG_SCRUB_THREAD) {
 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Thu Apr 13 16:15:06 2006 -0700
@@ -389,7 +389,9 @@
 		ASSERT(zio->io_error != 0);
 
 	if (good_copies && (spa_mode & FWRITE) &&
-	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+	    (unexpected_errors ||
+	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
+	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
 		zio_t *rio;
 
 		/*
@@ -415,7 +417,8 @@
 			if (mc->mc_error == 0) {
 				if (mc->mc_tried)
 					continue;
-				if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+				    !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = ESTALE;
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Thu Apr 13 16:15:06 2006 -0700
@@ -118,7 +118,7 @@
 	avl_add(&vq->vq_deadline_tree, zio);
 	avl_add(zio->io_vdev_tree, zio);
 
-	if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
 	    ++vq->vq_scrub_count >= vq->vq_scrub_limit)
 		spa_scrub_throttle(zio->io_spa, 1);
 }
@@ -126,7 +126,7 @@
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
 	    vq->vq_scrub_count-- >= vq->vq_scrub_limit)
 		spa_scrub_throttle(zio->io_spa, -1);
 
--- a/usr/src/uts/common/fs/zfs/zil.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zil.c	Thu Apr 13 16:15:06 2006 -0700
@@ -127,76 +127,94 @@
 	return (0);
 }
 
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+	return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+	zio_cksum_t *zc = &bp->blk_cksum;
+
+	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
 /*
  * Read a log block, make sure it's valid, and byteswap it if necessary.
  */
 static int
-zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
 {
-	uint64_t blksz = BP_GET_LSIZE(bp);
-	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
-	zio_cksum_t cksum;
+	blkptr_t blk = *bp;
 	zbookmark_t zb;
 	int error;
 
-	zb.zb_objset = bp->blk_cksum.zc_word[2];
+	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	zb.zb_object = 0;
 	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[3];
+	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+	*abufpp = NULL;
+
+	error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb);
+
+	if (error == 0) {
+		char *data = (*abufpp)->b_data;
+		uint64_t blksz = BP_GET_LSIZE(bp);
+		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+		zio_cksum_t cksum = bp->blk_cksum;
 
-	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
-	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
-	if (error) {
-		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
-		    zilog, bp, error);
-		return (error);
+		/*
+		 * Sequence numbers should be... sequential.  The checksum
+		 * verifier for the next block should be bp's checksum plus 1.
+		 */
+		cksum.zc_word[ZIL_ZC_SEQ]++;
+
+		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
+			error = ESTALE;
+		else if (BP_IS_HOLE(&ztp->zit_next_blk))
+			error = ENOENT;
+		else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
+			error = EOVERFLOW;
+
+		if (error) {
+			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
+			*abufpp = NULL;
+		}
 	}
 
-	if (BP_SHOULD_BYTESWAP(bp))
-		byteswap_uint64_array(buf, blksz);
-
-	/*
-	 * Sequence numbers should be... sequential.  The checksum verifier for
-	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
-	 */
-	cksum = bp->blk_cksum;
-	cksum.zc_word[3]++;
-	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
-		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
-		return (ESTALE);
-	}
+	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
 
-	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
-		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
-		return (ENOENT);
-	}
-
-	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
-		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
-		return (EOVERFLOW);
-	}
-
-	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);
-
-	return (0);
+	return (error);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
+ * Return the highest sequence number.
  */
-void
+uint64_t
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
-	blkptr_t blk;
+	const zil_header_t *zh = zilog->zl_header;
+	uint64_t claim_seq = zh->zh_claim_seq;
+	uint64_t seq = 0;
+	uint64_t max_seq = 0;
+	blkptr_t blk = zh->zh_log;
+	arc_buf_t *abuf;
 	char *lrbuf, *lrp;
 	zil_trailer_t *ztp;
 	int reclen, error;
 
-	blk = zilog->zl_header->zh_log;
 	if (BP_IS_HOLE(&blk))
-		return;
+		return (max_seq);
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
@@ -204,11 +222,20 @@
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
+	 * If the log has been claimed, stop if we encounter a sequence
+	 * number greater than the highest claimed sequence number.
 	 */
 	zil_dva_tree_init(&zilog->zl_dva_tree);
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
 	for (;;) {
-		error = zil_read_log_block(zilog, &blk, lrbuf);
+		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+		if (claim_seq != 0 && seq > claim_seq)
+			break;
+
+		ASSERT(max_seq < seq);
+		max_seq = seq;
+
+		error = zil_read_log_block(zilog, &blk, &abuf);
 
 		if (parse_blk_func != NULL)
 			parse_blk_func(zilog, &blk, arg, txg);
@@ -216,11 +243,14 @@
 		if (error)
 			break;
 
+		lrbuf = abuf->b_data;
 		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
 		blk = ztp->zit_next_blk;
 
-		if (parse_lr_func == NULL)
+		if (parse_lr_func == NULL) {
+			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 			continue;
+		}
 
 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
@@ -228,9 +258,11 @@
 			ASSERT3U(reclen, >=, sizeof (lr_t));
 			parse_lr_func(zilog, lr, arg, txg);
 		}
+		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 	}
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
 	zil_dva_tree_fini(&zilog->zl_dva_tree);
+
+	return (max_seq);
 }
 
 /* ARGSUSED */
@@ -240,8 +272,6 @@
 	spa_t *spa = zilog->zl_spa;
 	int err;
 
-	dprintf_bp(bp, "first_txg %llu: ", first_txg);
-
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 */
@@ -291,44 +321,42 @@
 static void
 zil_create(zilog_t *zilog)
 {
+	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
-	uint64_t txg;
-	dmu_tx_t *tx;
+	uint64_t txg = 0;
+	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
-	int error;
-	int no_blk;
-
-	ASSERT(zilog->zl_header->zh_claim_txg == 0);
-	ASSERT(zilog->zl_header->zh_replay_seq == 0);
+	int error = 0;
 
 	/*
-	 * Initialize the log header block.
+	 * Wait for any previous destroy to complete.
 	 */
-	tx = dmu_tx_create(zilog->zl_os);
-	(void) dmu_tx_assign(tx, TXG_WAIT);
-	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-	txg = dmu_tx_get_txg(tx);
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+	ASSERT(zh->zh_claim_txg == 0);
+	ASSERT(zh->zh_replay_seq == 0);
+
+	blk = zh->zh_log;
 
 	/*
-	 * If we don't have a log block already then
-	 * allocate the first log block and assign its checksum verifier.
+	 * If we don't already have an initial log block, allocate one now.
 	 */
-	no_blk = BP_IS_HOLE(&zilog->zl_header->zh_log);
-	if (no_blk) {
-		error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
-		    ZIL_MIN_BLKSZ, &blk, txg);
-	} else {
-		blk = zilog->zl_header->zh_log;
-		error = 0;
+	if (BP_IS_HOLE(&blk)) {
+		tx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		txg = dmu_tx_get_txg(tx);
+
+		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg);
+
+		if (error == 0)
+			zil_init_log_chain(zilog, &blk);
 	}
+
+	/*
+	 * Allocate a log write buffer (lwb) for the first log block.
+	 */
 	if (error == 0) {
-		ZIO_SET_CHECKSUM(&blk.blk_cksum,
-		    spa_get_random(-1ULL), spa_get_random(-1ULL),
-		    dmu_objset_id(zilog->zl_os), 1ULL);
-
-		/*
-		 * Allocate a log write buffer (lwb) for the first log block.
-		 */
 		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 		lwb->lwb_zilog = zilog;
 		lwb->lwb_blk = blk;
@@ -343,43 +371,81 @@
 		mutex_exit(&zilog->zl_lock);
 	}
 
-	dmu_tx_commit(tx);
-	if (no_blk)
+	/*
+	 * If we just allocated the first log block, commit our transaction
+	 * and wait for zil_sync() to stuff the block pointer into zh_log.
+	 * (zh is part of the MOS, so we cannot modify it in open context.)
+	 */
+	if (tx != NULL) {
+		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	}
+
+	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header.
+ * If keep_first is set, then we're replaying a log with no content.
+ * We want to keep the first block, however, so that the first
+ * synchronous transaction doesn't require a txg_wait_synced()
+ * in zil_create().  We don't need to txg_wait_synced() here either
+ * when keep_first is set, because both zil_create() and zil_destroy()
+ * will wait for any in-progress destroys to complete.
  */
 void
-zil_destroy(zilog_t *zilog)
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
+	const zil_header_t *zh = zilog->zl_header;
+	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
-	mutex_enter(&zilog->zl_destroy_lock);
+	/*
+	 * Wait for any previous destroy to complete.
+	 */
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
-	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
-		mutex_exit(&zilog->zl_destroy_lock);
+	if (BP_IS_HOLE(&zh->zh_log))
 		return;
-	}
 
 	tx = dmu_tx_create(zilog->zl_os);
 	(void) dmu_tx_assign(tx, TXG_WAIT);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
-	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
-	    zilog->zl_header->zh_claim_txg);
-	/*
-	 * zil_sync clears the zil header as soon as the zl_destroy_txg commits
-	 */
+	mutex_enter(&zilog->zl_lock);
+
+	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
+	zilog->zl_keep_first = keep_first;
+
+	if (!list_is_empty(&zilog->zl_lwb_list)) {
+		ASSERT(zh->zh_claim_txg == 0);
+		ASSERT(!keep_first);
+		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+			list_remove(&zilog->zl_lwb_list, lwb);
+			if (lwb->lwb_buf != NULL)
+				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+			kmem_cache_free(zil_lwb_cache, lwb);
+		}
+		mutex_exit(&zilog->zl_lock);
+	} else {
+		mutex_exit(&zilog->zl_lock);
+		if (!keep_first) {
+			(void) zil_parse(zilog, zil_free_log_block,
+			    zil_free_log_record, tx, zh->zh_claim_txg);
+		}
+	}
 
 	dmu_tx_commit(tx);
-	txg_wait_synced(zilog->zl_dmu_pool, txg);
 
-	mutex_exit(&zilog->zl_destroy_lock);
+	if (keep_first)			/* no need to wait in this case */
+		return;
+
+	txg_wait_synced(zilog->zl_dmu_pool, txg);
+	ASSERT(BP_IS_HOLE(&zh->zh_log));
 }
 
 void
@@ -399,18 +465,23 @@
 	}
 
 	zilog = dmu_objset_zil(os);
-	zh = zilog->zl_header;
+	zh = zil_header_in_syncing_context(zilog);
 
 	/*
-	 * Claim all log blocks if we haven't already done so.
+	 * Claim all log blocks if we haven't already done so, and remember
+	 * the highest claimed sequence number.  This ensures that if we can
+	 * read only part of the log now (e.g. due to a missing device),
+	 * but we can read the entire log later, we will not try to replay
+	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		zh->zh_claim_txg = first_txg;
-		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
-		    tx, first_txg);
+		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+		    zil_claim_log_record, tx, first_txg);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
+
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_close(os);
 }
@@ -555,6 +626,8 @@
 {
 	lwb_t *nlwb;
 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+	spa_t *spa = zilog->zl_spa;
+	blkptr_t *bp = &ztp->zit_next_blk;
 	uint64_t txg;
 	uint64_t zil_blksz;
 	zbookmark_t zb;
@@ -583,8 +656,7 @@
 	if (zil_blksz > ZIL_MAX_BLKSZ)
 		zil_blksz = ZIL_MAX_BLKSZ;
 
-	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
-	    zil_blksz, &ztp->zit_next_blk, txg);
+	error = zio_alloc_blk(spa, zil_blksz, bp, txg);
 	if (error) {
 		/*
 		 * Reinitialise the lwb.
@@ -599,12 +671,12 @@
 		return (NULL);
 	}
 
-	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
+	ASSERT3U(bp->blk_birth, ==, txg);
 	ztp->zit_pad = 0;
 	ztp->zit_nused = lwb->lwb_nused;
 	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
-	ztp->zit_next_blk.blk_cksum.zc_word[3]++;
+	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
 	/*
 	 * Allocate a new log write buffer (lwb).
@@ -612,7 +684,7 @@
 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 
 	nlwb->lwb_zilog = zilog;
-	nlwb->lwb_blk = ztp->zit_next_blk;
+	nlwb->lwb_blk = *bp;
 	nlwb->lwb_nused = 0;
 	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
@@ -633,14 +705,12 @@
 	/*
 	 * write the old log block
 	 */
-	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
-
-	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2];
+	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	zb.zb_object = 0;
 	zb.zb_level = -1;
-	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3];
+	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 
-	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
+	zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0,
 	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
 	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));
 
@@ -949,21 +1019,40 @@
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
+	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	lwb_t *lwb;
 
+	mutex_enter(&zilog->zl_lock);
+
 	ASSERT(zilog->zl_stop_sync == 0);
 
-	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
 
 	if (zilog->zl_destroy_txg == txg) {
-		bzero(zilog->zl_header, sizeof (zil_header_t));
+		blkptr_t blk = zh->zh_log;
+
+		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+		ASSERT(spa_sync_pass(spa) == 1);
+
+		bzero(zh, sizeof (zil_header_t));
 		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
-		zilog->zl_destroy_txg = 0;
+
+		if (zilog->zl_keep_first) {
+			/*
+			 * If this block was part of log chain that couldn't
+			 * be claimed because a device was missing during
+			 * zil_claim(), but that device later returns,
+			 * then this block could erroneously appear valid.
+			 * To guard against this, assign a new GUID to the new
+			 * log chain so it doesn't matter what blk points to.
+			 */
+			zil_init_log_chain(zilog, &blk);
+			zh->zh_log = blk;
+		}
 	}
 
-	mutex_enter(&zilog->zl_lock);
 	for (;;) {
 		lwb = list_head(&zilog->zl_lwb_list);
 		if (lwb == NULL) {
@@ -976,7 +1065,7 @@
 		zio_free_blk(spa, &lwb->lwb_blk, txg);
 		kmem_cache_free(zil_lwb_cache, lwb);
 	}
-	zilog->zl_header->zh_log = lwb->lwb_blk;
+	zh->zh_log = lwb->lwb_blk;
 	mutex_exit(&zilog->zl_lock);
 }
 
@@ -1004,6 +1093,7 @@
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
+	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 
 	list_create(&zilog->zl_itx_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
@@ -1051,18 +1141,17 @@
 static int
 zil_empty(zilog_t *zilog)
 {
-	blkptr_t blk;
-	char *lrbuf;
-	int error;
+	const zil_header_t *zh = zilog->zl_header;
+	arc_buf_t *abuf = NULL;
 
-	blk = zilog->zl_header->zh_log;
-	if (BP_IS_HOLE(&blk))
+	if (BP_IS_HOLE(&zh->zh_log))
 		return (1);
 
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
-	error = zil_read_log_block(zilog, &blk, lrbuf);
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
-	return (error ? 1 : 0);
+	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+		return (1);
+
+	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+	return (0);
 }
 
 /*
@@ -1086,8 +1175,20 @@
 void
 zil_close(zilog_t *zilog)
 {
-	if (!zil_is_committed(zilog))
-		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	/*
+	 * If the log isn't already committed, mark the objset dirty
+	 * (so zil_sync() will be called) and wait for that txg to sync.
+	 */
+	if (!zil_is_committed(zilog)) {
+		uint64_t txg;
+		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		txg = dmu_tx_get_txg(tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	}
+
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
@@ -1105,38 +1206,55 @@
 int
 zil_suspend(zilog_t *zilog)
 {
+	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 
 	mutex_enter(&zilog->zl_lock);
-	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
+	if (zh->zh_claim_txg != 0) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		return (EBUSY);
 	}
-	zilog->zl_suspend++;
+	if (zilog->zl_suspend++ != 0) {
+		/*
+		 * Someone else already began a suspend.
+		 * Just wait for them to finish.
+		 */
+		while (zilog->zl_suspending)
+			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+		ASSERT(BP_IS_HOLE(&zh->zh_log));
+		mutex_exit(&zilog->zl_lock);
+		return (0);
+	}
+	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	zil_commit(zilog, UINT64_MAX, FSYNC);
 
 	mutex_enter(&zilog->zl_lock);
-	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
-		if (lwb->lwb_buf != NULL) {
-			/*
-			 * Wait for the buffer if it's in the process of
-			 * being written.
-			 */
-			if ((lwb->lwb_seq != 0) &&
-			    (lwb->lwb_state != SEQ_COMPLETE)) {
-				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
-				continue;
-			}
-			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-		}
-		list_remove(&zilog->zl_lwb_list, lwb);
-		kmem_cache_free(zil_lwb_cache, lwb);
+	for (;;) {
+		/*
+		 * Wait for any in-flight log writes to complete.
+		 */
+		for (lwb = list_head(&zilog->zl_lwb_list); lwb != NULL;
+		    lwb = list_next(&zilog->zl_lwb_list, lwb))
+			if (lwb->lwb_seq != 0 && lwb->lwb_state != SEQ_COMPLETE)
+				break;
+
+		if (lwb == NULL)
+			break;
+
+		cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
 	}
+
 	mutex_exit(&zilog->zl_lock);
 
-	zil_destroy(zilog);
+	zil_destroy(zilog, B_FALSE);
+
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(BP_IS_HOLE(&zh->zh_log));
+	zilog->zl_suspending = B_FALSE;
+	cv_broadcast(&zilog->zl_cv_suspend);
+	mutex_exit(&zilog->zl_lock);
 
 	return (0);
 }
@@ -1164,7 +1282,7 @@
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
-	zil_header_t *zh = zilog->zl_header;
+	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int pass, error;
@@ -1310,15 +1428,11 @@
 	zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
 {
 	zilog_t *zilog = dmu_objset_zil(os);
-		zil_replay_arg_t zr;
+	const zil_header_t *zh = zilog->zl_header;
+	zil_replay_arg_t zr;
 
 	if (zil_empty(zilog)) {
-		/*
-		 * Initialise the log header but don't free the log block
-		 * which will get reused.
-		 */
-		zilog->zl_header->zh_claim_txg = 0;
-		zilog->zl_header->zh_replay_seq = 0;
+		zil_destroy(zilog, B_TRUE);
 		return;
 	}
 
@@ -1327,7 +1441,7 @@
 	zr.zr_arg = arg;
 	zr.zr_rm_sync = rm_sync;
 	zr.zr_txgp = txgp;
-	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
+	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
@@ -1338,11 +1452,11 @@
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_stop_replay = 0;
-	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
-	    zilog->zl_header->zh_claim_txg);
+	(void) zil_parse(zilog, NULL, zil_replay_log_record, &zr,
+	    zh->zh_claim_txg);
 	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
 
-	zil_destroy(zilog);
+	zil_destroy(zilog, B_FALSE);
 }
 
 /*
@@ -1353,7 +1467,7 @@
 {
 	lwb_t *lwb;
 
-	if (zilog == NULL || list_head(&zilog->zl_itx_list))
+	if (!list_is_empty(&zilog->zl_itx_list))
 		return (B_FALSE);
 
 	/*
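
The new zl_suspending flag and zl_cv_suspend condition variable let a second zil_suspend() caller wait for the first to finish rather than racing it to destroy the log. A minimal usage sketch, matching the pattern ztest exercises above:

	if (zil_suspend(zilog) == 0) {
		/* log chain is now empty and on stable storage */
		zil_resume(zilog);
	}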
--- a/usr/src/uts/common/fs/zfs/zio.c	Thu Apr 13 15:37:22 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Thu Apr 13 16:15:06 2006 -0700
@@ -1263,13 +1263,8 @@
 zio_dva_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = bp->blk_dva;
-	int d;
 
-	ASSERT(!BP_IS_HOLE(bp));
-
-	for (d = 0; d < BP_GET_NDVAS(bp); d++)
-		metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE);
+	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
 
 	BP_ZERO(bp);
 
@@ -1279,18 +1274,7 @@
 static void
 zio_dva_claim(zio_t *zio)
 {
-	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = bp->blk_dva;
-	int error = 0;
-	int d;
-
-	ASSERT(!BP_IS_HOLE(bp));
-
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg);
-		if (error)
-			zio->io_error = error;
-	}
+	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
 	zio_next_stage(zio);
 }
@@ -1669,8 +1653,7 @@
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
-zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
-    uint64_t txg)
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg)
 {
 	int error;
 
@@ -1681,10 +1664,10 @@
 	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);
 
 	if (error == 0) {
-		BP_SET_CHECKSUM(bp, checksum);
 		BP_SET_LSIZE(bp, size);
 		BP_SET_PSIZE(bp, size);
 		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+		BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(bp, 0);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
@@ -1705,11 +1688,9 @@
 {
 	ASSERT(!BP_IS_GANG(bp));
 
-	dprintf_bp(bp, "txg %llu: ", txg);
-
 	spa_config_enter(spa, RW_READER, FTAG);
 
-	metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE);
+	metaslab_free(spa, bp, txg, B_FALSE);
 
 	spa_config_exit(spa, FTAG);
 }