changeset 10685:931790026ac6

6846163 ZFS continues to use faulted logzilla, bringing system to a crawl 6872547 ztest LUN expansion test fails 6873635 zdb should be able to open a pool with a failed slog 6873654 system panics when a slog device is offlined 6875236 zdb should be able to dump the spa history
author George Wilson <George.Wilson@Sun.COM>
date Tue, 29 Sep 2009 07:29:35 -0700
parents 5bf5dbdbb746
children c2381d7785a7
files usr/src/cmd/zdb/zdb.c usr/src/cmd/zinject/zinject.c usr/src/cmd/zpool/zpool_main.c usr/src/cmd/ztest/ztest.c usr/src/lib/libzfs/common/libzfs.h usr/src/lib/libzfs/common/libzfs_pool.c usr/src/lib/libzfs/common/mapfile-vers usr/src/uts/common/fs/zfs/dsl_scrub.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zfs_vfsops.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/fs/zfs/zio_inject.c usr/src/uts/common/sys/fs/zfs.h
diffstat 21 files changed, 380 insertions(+), 122 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/cmd/zdb/zdb.c	Tue Sep 29 07:29:35 2009 -0700
@@ -100,6 +100,7 @@
 	(void) fprintf(stderr, "	-u uberblock\n");
 	(void) fprintf(stderr, "	-d datasets\n");
 	(void) fprintf(stderr, "        -C cached pool configuration\n");
+	(void) fprintf(stderr, "        -h pool history\n");
 	(void) fprintf(stderr, "	-i intent logs\n");
 	(void) fprintf(stderr, "	-b block statistics\n");
 	(void) fprintf(stderr, "	-m metaslabs\n");
@@ -504,7 +505,7 @@
 	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
 	char prefix[256];
 
-	spa_vdev_state_enter(spa);
+	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
@@ -534,6 +535,67 @@
 		dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
+/*
+ * Print the pool's history log: user commands and internal events.
+ */
+static void
+dump_history(spa_t *spa)
+{
+	nvlist_t **events = NULL;
+	char buf[SPA_MAXBLOCKSIZE];
+	char tbuf[30], internalstr[MAXPATHLEN];
+	uint64_t resid, len, off = 0;
+	uint_t num = 0;
+	int error;
+	time_t tsec;
+	struct tm t;
+
+	do {
+		len = sizeof (buf);	/* in/out: reset for every read */
+		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+			(void) fprintf(stderr, "Unable to read history: "
+			    "error %d\n", error);
+			return;
+		}
+
+		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+			break;
+
+		off -= resid;
+	} while (len != 0);
+
+	(void) printf("\nHistory:\n");
+	for (uint_t i = 0; i < num; i++) {
+		uint64_t time, txg, ievent;
+		char *cmd, *intstr;
+
+		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+		    &time) != 0)
+			continue;
+		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+		    &cmd) != 0) {
+			if (nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+				continue;
+			verify(nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_TXG, &txg) == 0);
+			verify(nvlist_lookup_string(events[i],
+			    ZPOOL_HIST_INT_STR, &intstr) == 0);
+			if (ievent >= LOG_END)
+				continue;
+			(void) snprintf(internalstr, sizeof (internalstr),
+			    "[internal %s txg:%llu] %s",
+			    hist_event_table[ievent], (u_longlong_t)txg,
+			    intstr);
+			cmd = internalstr;
+		}
+		tsec = time;
+		(void) localtime_r(&tsec, &t);
+		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+		(void) printf("%s %s\n", tbuf, cmd);
+	}
+}
+
 /*ARGSUSED*/
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
@@ -1791,6 +1853,9 @@
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
+	if (dump_opt['h'])
+		dump_history(spa);
+
 	if (rc != 0)
 		exit(rc);
 }
@@ -2256,11 +2321,12 @@
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
+	while ((c = getopt(argc, argv, "udhibcmsvCLS:U:lRep:t:")) != -1) {
 		switch (c) {
 		case 'u':
 		case 'd':
 		case 'i':
+		case 'h':
 		case 'b':
 		case 'c':
 		case 'm':
@@ -2415,6 +2481,23 @@
 			    B_TRUE, FTAG, &os);
 		} else {
 			error = spa_open(argv[0], &spa, FTAG);
+			if (error) {
+				/*
+				 * If we're missing the log device then
+				 * try opening the pool after clearing the
+				 * log state.
+				 */
+				mutex_enter(&spa_namespace_lock);
+				if ((spa = spa_lookup(argv[0])) != NULL &&
+				    spa->spa_log_state == SPA_LOG_MISSING) {
+					spa->spa_log_state = SPA_LOG_CLEAR;
+					error = 0;
+				}
+				mutex_exit(&spa_namespace_lock);
+
+				if (!error)
+					error = spa_open(argv[0], &spa, FTAG);
+			}
 		}
 	}
 
--- a/usr/src/cmd/zinject/zinject.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/cmd/zinject/zinject.c	Tue Sep 29 07:29:35 2009 -0700
@@ -227,11 +227,15 @@
 	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
 	    "\t\tspa_vdev_exit() will trigger a panic.\n"
 	    "\n"
-	    "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
+	    "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F]\n"
+	    "\t    [-T <read|write|free|claim|all> pool\n"
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist' or 'uber'.\n"
 	    "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
 	    "\n"
+	    "\tzinject -d device -A <degrade|fault> pool\n"
+	    "\t\tPerform a specific action on a particular device\n"
+	    "\n"
 	    "\tzinject -b objset:object:level:blkid pool\n"
 	    "\n"
 	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
@@ -497,6 +501,22 @@
 }
 
 int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+	/* Zero zc so no uninitialized stack bytes reach the ioctl. */
+	zfs_cmd_t zc = { 0 };
+
+	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+	zc.zc_guid = record->zi_guid;
+	zc.zc_cookie = cmd;
+
+	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+	return (1);
+}
+
+int
 main(int argc, char **argv)
 {
 	int c;
@@ -509,6 +529,8 @@
 	int quiet = 0;
 	int error = 0;
 	int domount = 0;
+	int io_type = ZIO_TYPES;
+	int action = VDEV_STATE_UNKNOWN;
 	err_type_t type = TYPE_INVAL;
 	err_type_t label = TYPE_INVAL;
 	zinject_record_t record = { 0 };
@@ -546,11 +568,24 @@
 		return (0);
 	}
 
-	while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:p:")) != -1) {
+	while ((c = getopt(argc, argv,
+	    ":aA:b:d:f:Fqhc:t:T:l:mr:e:uL:p:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
 			break;
+		case 'A':
+			if (strcasecmp(optarg, "degrade") == 0) {
+				action = VDEV_STATE_DEGRADED;
+			} else if (strcasecmp(optarg, "fault") == 0) {
+				action = VDEV_STATE_FAULTED;
+			} else {
+				(void) fprintf(stderr, "invalid action '%s': "
+				    "must be 'degrade' or 'fault'\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 'b':
 			raw = optarg;
 			break;
@@ -611,6 +646,25 @@
 		case 'r':
 			range = optarg;
 			break;
+		case 'T':
+			if (strcasecmp(optarg, "read") == 0) {
+				io_type = ZIO_TYPE_READ;
+			} else if (strcasecmp(optarg, "write") == 0) {
+				io_type = ZIO_TYPE_WRITE;
+			} else if (strcasecmp(optarg, "free") == 0) {
+				io_type = ZIO_TYPE_FREE;
+			} else if (strcasecmp(optarg, "claim") == 0) {
+				io_type = ZIO_TYPE_CLAIM;
+			} else if (strcasecmp(optarg, "all") == 0) {
+				io_type = ZIO_TYPES;
+			} else {
+				(void) fprintf(stderr, "invalid I/O type "
+				    "'%s': must be 'read', 'write', 'free', "
+				    "'claim' or 'all'\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 't':
 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
 			    !MOS_TYPE(type)) {
@@ -708,10 +762,15 @@
 			return (1);
 		}
 
+		record.zi_iotype = io_type;
 		if (translate_device(pool, device, label, &record) != 0)
 			return (1);
 		if (!error)
 			error = ENXIO;
+
+		if (action != VDEV_STATE_UNKNOWN)
+			return (perform_action(pool, &record, action));
+
 	} else if (raw != NULL) {
 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
 		    record.zi_func[0] != '\0') {
--- a/usr/src/cmd/zpool/zpool_main.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/cmd/zpool/zpool_main.c	Tue Sep 29 07:29:35 2009 -0700
@@ -3624,49 +3624,6 @@
 	int internal;
 } hist_cbdata_t;
 
-char *hist_event_table[LOG_END] = {
-	"invalid event",
-	"pool create",
-	"vdev add",
-	"pool remove",
-	"pool destroy",
-	"pool export",
-	"pool import",
-	"vdev attach",
-	"vdev replace",
-	"vdev detach",
-	"vdev online",
-	"vdev offline",
-	"vdev upgrade",
-	"pool clear",
-	"pool scrub",
-	"pool property set",
-	"create",
-	"clone",
-	"destroy",
-	"destroy_begin_sync",
-	"inherit",
-	"property set",
-	"quota set",
-	"permission update",
-	"permission remove",
-	"permission who remove",
-	"promote",
-	"receive",
-	"rename",
-	"reservation set",
-	"replay_inc_sync",
-	"replay_full_sync",
-	"rollback",
-	"snapshot",
-	"filesystem version upgrade",
-	"refquota set",
-	"refreservation set",
-	"pool scrub done",
-	"user hold",
-	"user release",
-};
-
 /*
  * Print out the command history for a specific pool.
  */
--- a/usr/src/cmd/ztest/ztest.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/cmd/ztest/ztest.c	Tue Sep 29 07:29:35 2009 -0700
@@ -203,13 +203,13 @@
 
 ztest_info_t ztest_info[] = {
 	{ ztest_dmu_read_write,			1,	&zopt_always	},
-	{ ztest_dmu_read_write_zcopy,		1,	&zopt_always	},
 	{ ztest_dmu_write_parallel,		30,	&zopt_always	},
 	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
 	{ ztest_dmu_commit_callbacks,		10,	&zopt_always	},
 	{ ztest_zap,				30,	&zopt_always	},
 	{ ztest_fzap,				30,	&zopt_always	},
 	{ ztest_zap_parallel,			100,	&zopt_always	},
+	{ ztest_dmu_read_write_zcopy,		1,	&zopt_sometimes	},
 	{ ztest_dsl_prop_get_set,		1,	&zopt_sometimes	},
 	{ ztest_dmu_objset_create_destroy,	1,	&zopt_sometimes },
 	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes },
@@ -1245,8 +1245,8 @@
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *tvd = vd->vdev_top;
-	vdev_t *pvd = vd->vdev_parent;
 	uint64_t guid = vd->vdev_guid;
+	uint64_t generation = spa->spa_config_generation + 1;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -1262,10 +1262,14 @@
 	 * vdev may have been detached/replaced while we were
 	 * trying to online it.
 	 */
-	if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) {
-		if (zopt_verbose >= 6) {
-			(void) printf("vdev %p has disappeared, was "
-			    "guid %llu\n", (void *)vd, (u_longlong_t)guid);
+	if (generation != spa->spa_config_generation) {
+		if (zopt_verbose >= 5) {
+			(void) printf("vdev configuration has changed, "
+			    "guid %llu, state %llu, expected gen %llu, "
+			    "got gen %llu\n", (u_longlong_t)guid,
+			    (u_longlong_t)tvd->vdev_state,
+			    (u_longlong_t)generation,
+			    (u_longlong_t)spa->spa_config_generation);
 		}
 		return (vd);
 	}
@@ -1309,7 +1313,6 @@
 	uint64_t spa_newsize, spa_cursize, ms_count;
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
-	mutex_enter(&spa_namespace_lock);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	while (tvd == NULL || tvd->vdev_islog) {
@@ -1330,12 +1333,12 @@
 	psize = vd->vdev_psize;
 
 	/*
-	 * We only try to expand the vdev if it's less than 4x its
-	 * original size and it has a valid psize.
+	 * We only try to expand the vdev if it's healthy, less than 4x its
+	 * original size, and it has a valid psize.
 	 */
-	if (psize == 0 || psize >= 4 * zopt_vdev_size) {
+	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+	    psize == 0 || psize >= 4 * zopt_vdev_size) {
 		spa_config_exit(spa, SCL_STATE, spa);
-		mutex_exit(&spa_namespace_lock);
 		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 		return;
 	}
@@ -1361,16 +1364,14 @@
 	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
 		if (zopt_verbose >= 5) {
 			(void) printf("Could not expand LUN because "
-			    "some vdevs were not healthy\n");
+			    "the vdev configuration changed.\n");
 		}
 		(void) spa_config_exit(spa, SCL_STATE, spa);
-		mutex_exit(&spa_namespace_lock);
 		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 		return;
 	}
 
 	(void) spa_config_exit(spa, SCL_STATE, spa);
-	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * Expanding the LUN will update the config asynchronously,
@@ -3486,6 +3487,7 @@
 	int maxfaults = zopt_maxfaults;
 	vdev_t *vd0 = NULL;
 	uint64_t guid0 = 0;
+	boolean_t islog = B_FALSE;
 
 	ASSERT(leaves >= 1);
 
@@ -3513,6 +3515,9 @@
 		    zopt_dir, zopt_pool, top * leaves + leaf);
 
 		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+			islog = B_TRUE;
+
 		if (vd0 != NULL && maxfaults != 1) {
 			/*
 			 * Make vd0 explicitly claim to be unreadable,
@@ -3558,22 +3563,38 @@
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	if (maxfaults == 0)
-		return;
-
 	/*
-	 * If we can tolerate two or more faults, randomly online/offline vd0.
+	 * If we can tolerate two or more faults, or we're dealing
+	 * with a slog, randomly online/offline vd0.
 	 */
-	if (maxfaults >= 2 && guid0 != 0) {
+	if ((maxfaults >= 2 || islog) && guid0 != 0) {
 		if (ztest_random(10) < 6) {
 			int flags = (ztest_random(2) == 0 ?
 			    ZFS_OFFLINE_TEMPORARY : 0);
+
+			/*
+			 * We have to grab the zs_name_lock as writer to
+			 * prevent a race between offlining a slog and
+			 * destroying a dataset. Offlining the slog will
+			 * grab a reference on the dataset which may cause
+			 * dmu_objset_destroy() to fail with EBUSY thus
+			 * leaving the dataset in an inconsistent state.
+			 */
+			if (islog)
+				(void) rw_wrlock(&ztest_shared->zs_name_lock);
+
 			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+			if (islog)
+				(void) rw_unlock(&ztest_shared->zs_name_lock);
 		} else {
 			(void) vdev_online(spa, guid0, 0, NULL);
 		}
 	}
 
+	if (maxfaults == 0)
+		return;
+
 	/*
 	 * We have at least single-fault tolerance, so inject data corruption.
 	 */
@@ -3921,7 +3942,7 @@
 ztest_resume(spa_t *spa)
 {
 	if (spa_suspended(spa)) {
-		spa_vdev_state_enter(spa);
+		spa_vdev_state_enter(spa, SCL_NONE);
 		vdev_clear(spa, NULL);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 		(void) zio_resume(spa);
--- a/usr/src/lib/libzfs/common/libzfs.h	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs.h	Tue Sep 29 07:29:35 2009 -0700
@@ -332,10 +332,14 @@
  */
 struct zfs_cmd;
 
+extern const char *hist_event_table[LOG_END];
+
 extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
     boolean_t verbose);
 extern int zpool_upgrade(zpool_handle_t *, uint64_t);
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
+extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
+    nvlist_t ***, uint_t *);
 extern void zpool_set_history_str(const char *subcommand, int argc,
     char **argv, char *history_str);
 extern int zpool_stage_history(libzfs_handle_t *, const char *);
--- a/usr/src/lib/libzfs/common/libzfs_pool.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c	Tue Sep 29 07:29:35 2009 -0700
@@ -42,6 +42,49 @@
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 
+const char *hist_event_table[LOG_END] = {
+	"invalid event",
+	"pool create",
+	"vdev add",
+	"pool remove",
+	"pool destroy",
+	"pool export",
+	"pool import",
+	"vdev attach",
+	"vdev replace",
+	"vdev detach",
+	"vdev online",
+	"vdev offline",
+	"vdev upgrade",
+	"pool clear",
+	"pool scrub",
+	"pool property set",
+	"create",
+	"clone",
+	"destroy",
+	"destroy_begin_sync",
+	"inherit",
+	"property set",
+	"quota set",
+	"permission update",
+	"permission remove",
+	"permission who remove",
+	"promote",
+	"receive",
+	"rename",
+	"reservation set",
+	"replay_inc_sync",
+	"replay_full_sync",
+	"rollback",
+	"snapshot",
+	"filesystem version upgrade",
+	"refquota set",
+	"refreservation set",
+	"pool scrub done",
+	"user hold",
+	"user release",
+};
+
 static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
 
 #if defined(__i386) || defined(__amd64)
@@ -2804,7 +2847,7 @@
  * into 'records'.  'leftover' is set to the number of bytes that weren't
  * processed as there wasn't a complete record.
  */
-static int
+int
 zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
     nvlist_t ***records, uint_t *numrecords)
 {
--- a/usr/src/lib/libzfs/common/mapfile-vers	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/lib/libzfs/common/mapfile-vers	Tue Sep 29 07:29:35 2009 -0700
@@ -45,6 +45,7 @@
 	fletcher_4_byteswap;
 	fletcher_4_incremental_native;
 	fletcher_4_incremental_byteswap;
+	hist_event_table;
 	libzfs_errno;
 	libzfs_error_action;
 	libzfs_error_description;
@@ -170,6 +171,7 @@
 	zpool_get_prop_int;
 	zpool_get_state;
 	zpool_get_status;
+	zpool_history_unpack;
 	zpool_import;
 	zpool_import_props;
 	zpool_import_status;
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c	Tue Sep 29 07:29:35 2009 -0700
@@ -1033,7 +1033,7 @@
 	 * spa_scrub_reopen flag indicates that vdev_open() should not
 	 * attempt to start another scrub.
 	 */
-	spa_vdev_state_enter(spa);
+	spa_vdev_state_enter(spa, SCL_NONE);
 	spa->spa_scrub_reopen = B_TRUE;
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
--- a/usr/src/uts/common/fs/zfs/spa.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/spa.c	Tue Sep 29 07:29:35 2009 -0700
@@ -3958,7 +3958,7 @@
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
-		spa_vdev_state_enter(spa);
+		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
@@ -3977,7 +3977,7 @@
 	 * See if any devices need to be probed.
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
-		spa_vdev_state_enter(spa);
+		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Tue Sep 29 07:29:35 2009 -0700
@@ -880,6 +880,7 @@
 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
 		dsl_pool_scrub_restart(spa->spa_dsl_pool);
 		config_changed = B_TRUE;
+		spa->spa_config_generation++;
 	}
 
 	/*
@@ -939,18 +940,24 @@
  * Lock the given spa_t for the purpose of changing vdev state.
  */
 void
-spa_vdev_state_enter(spa_t *spa)
+spa_vdev_state_enter(spa_t *spa, int oplocks)
 {
-	spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+	int locks = SCL_STATE_ALL | oplocks;
+
+	spa_config_enter(spa, locks, spa, RW_WRITER);
+	spa->spa_vdev_locks = locks;
 }
 
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
-	if (vd != NULL)
+	if (vd != NULL) {
 		vdev_state_dirty(vd->vdev_top);
+		spa->spa_config_generation++;
+	}
 
-	spa_config_exit(spa, SCL_STATE_ALL, spa);
+	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+	spa_config_exit(spa, spa->spa_vdev_locks, spa);
 
 	/*
 	 * If anything changed, wait for it to sync.  This ensures that,
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Tue Sep 29 07:29:35 2009 -0700
@@ -411,6 +411,7 @@
 extern void spa_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
+#define	SCL_NONE	0x00
 #define	SCL_CONFIG	0x01
 #define	SCL_STATE	0x02
 #define	SCL_L2ARC	0x04		/* hack until L2ARC 2.0 */
@@ -436,7 +437,7 @@
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa);
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Accessor functions */
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Sep 29 07:29:35 2009 -0700
@@ -122,6 +122,7 @@
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
 	uint64_t	spa_config_object;	/* MOS object for pool config */
+	uint64_t	spa_config_generation;	/* config generation number */
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
 	uint64_t	spa_sync_bplist_obj;	/* object for deferred frees */
 	bplist_t	spa_sync_bplist;	/* deferred-free bplist */
@@ -172,6 +173,7 @@
 	spa_log_state_t spa_log_state;		/* log state */
 	uint64_t	spa_autoexpand;		/* lun expansion on/off */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
+	int		spa_vdev_locks;		/* locks grabbed */
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Tue Sep 29 07:29:35 2009 -0700
@@ -80,7 +80,6 @@
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
 extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
     boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Sep 29 07:29:35 2009 -0700
@@ -118,6 +118,8 @@
 	uint32_t	zi_freq;
 	uint32_t	zi_failfast;
 	char		zi_func[MAXNAMELEN];
+	uint32_t	zi_iotype;
+	uint32_t	zi_pad;		/* 64-bit alignment */
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
--- a/usr/src/uts/common/fs/zfs/vdev.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Tue Sep 29 07:29:35 2009 -0700
@@ -1935,7 +1935,7 @@
 {
 	vdev_t *vd;
 
-	spa_vdev_state_enter(spa);
+	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1955,7 +1955,8 @@
 	 * unavailable, then back off and simply mark the vdev as degraded
 	 * instead.
 	 */
-	if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+	if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog &&
+	    vd->vdev_aux == NULL) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
@@ -1984,7 +1985,7 @@
 {
 	vdev_t *vd;
 
-	spa_vdev_state_enter(spa);
+	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -2017,7 +2018,7 @@
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 
-	spa_vdev_state_enter(spa);
+	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -2064,12 +2065,33 @@
 }
 
 int
+vdev_offline_log(spa_t *spa)
+{
+	int err = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+	    NULL, DS_FIND_CHILDREN);
+
+	if (err != 0)
+		return (err);
+
+	/*
+	 * The log device was offlined successfully.  Wait for the
+	 * current txg to sync so that zil_sync() can remove the
+	 * "stubby" block.
+	 */
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	return (0);
+}
+
+int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
-	int error;
-
-	spa_vdev_state_enter(spa);
+	int error = 0;
+	uint64_t generation;
+	metaslab_group_t *mg;
+
+top:
+	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -2078,6 +2100,8 @@
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
+	mg = tvd->vdev_mg;
+	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
@@ -2093,6 +2117,38 @@
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
+		 * If the top-level is a slog and it's had allocations
+		 * then proceed. We check that the vdev's metaslab
+		 * group is not NULL since it's possible that we may
+		 * have just added this vdev and have not yet initialized
+		 * its metaslabs.
+		 */
+		if (tvd->vdev_islog && mg != NULL) {
+			/*
+			 * Prevent any future allocations.
+			 */
+			metaslab_class_remove(spa->spa_log_class, mg);
+			(void) spa_vdev_state_exit(spa, vd, 0);
+
+			error = vdev_offline_log(spa);
+
+			spa_vdev_state_enter(spa, SCL_ALLOC);
+
+			/*
+			 * Check to see if the config has changed.
+			 */
+			if (error || generation != spa->spa_config_generation) {
+				metaslab_class_add(spa->spa_log_class, mg);
+				if (error)
+					return (spa_vdev_state_exit(spa,
+					    vd, error));
+				(void) spa_vdev_state_exit(spa, vd, 0);
+				goto top;
+			}
+			ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+		}
+
+		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
@@ -2107,28 +2163,18 @@
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 		}
+
+		/*
+		 * Add the device back into the metaslab rotor so that
+		 * once we online the device it's open for business.
+		 */
+		if (tvd->vdev_islog && mg != NULL)
+			metaslab_class_add(spa->spa_log_class, mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
-	if (!tvd->vdev_islog || !vdev_is_dead(tvd))
-		return (spa_vdev_state_exit(spa, vd, 0));
-
-	(void) spa_vdev_state_exit(spa, vd, 0);
-
-	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-	    NULL, DS_FIND_CHILDREN);
-	if (error) {
-		(void) vdev_online(spa, guid, 0, NULL);
-		return (error);
-	}
-	/*
-	 * If we successfully offlined the log device then we need to
-	 * sync out the current txg so that the "stubby" block can be
-	 * removed by zil_sync().
-	 */
-	txg_wait_synced(spa->spa_dsl_pool, 0);
-	return (0);
+	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
@@ -2356,6 +2402,14 @@
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
+	/*
+	 * Intent log writes won't propagate their error to the root
+	 * I/O so don't mark these types of failures as pool-level
+	 * errors.
+	 */
+	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+		return;
+
 	mutex_enter(&vd->vdev_stat_lock);
 	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
 		if (zio->io_error == ECKSUM)
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Tue Sep 29 07:29:35 2009 -0700
@@ -2983,7 +2983,7 @@
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	spa_vdev_state_enter(spa);
+	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if (zc->zc_guid == 0) {
 		vd = NULL;
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Tue Sep 29 07:29:35 2009 -0700
@@ -951,7 +951,7 @@
 
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 	if (zil_disable) {
-		zil_destroy(zfsvfs->z_log, 0);
+		zil_destroy(zfsvfs->z_log, B_FALSE);
 		zfsvfs->z_log = NULL;
 	}
 
--- a/usr/src/uts/common/fs/zfs/zil.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zil.c	Tue Sep 29 07:29:35 2009 -0700
@@ -77,6 +77,8 @@
 
 static kmem_cache_t *zil_lwb_cache;
 
+static boolean_t zil_empty(zilog_t *zilog);
+
 static int
 zil_dva_compare(const void *x1, const void *x2)
 {
@@ -436,23 +438,12 @@
 
 	mutex_enter(&zilog->zl_lock);
 
-	/*
-	 * It is possible for the ZIL to get the previously mounted zilog
-	 * structure of the same dataset if quickly remounted and the dbuf
-	 * eviction has not completed. In this case we can see a non
-	 * empty lwb list and keep_first will be set. We fix this by
-	 * clearing the keep_first. This will be slower but it's very rare.
-	 */
-	if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
-		keep_first = B_FALSE;
-
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
-	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
-		ASSERT(!keep_first);
+		zilog->zl_keep_first = B_FALSE;
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
@@ -461,9 +452,23 @@
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
 	} else {
-		if (!keep_first) {
+		zilog->zl_keep_first = keep_first;
+		if (zh->zh_flags & ZIL_REPLAY_NEEDED) {
+			ASSERT(!keep_first);
 			(void) zil_parse(zilog, zil_free_log_block,
 			    zil_free_log_record, tx, zh->zh_claim_txg);
+		} else {
+			/*
+			 * Would like to assert zil_empty() but that
+			 * would force us to read the log chain which
+			 * requires us to do I/O to the log. This is
+			 * overkill since we really just want to destroy
+			 * the chain anyway.
+			 */
+			if (!keep_first) {
+				blkptr_t bp = zh->zh_log;
+				zio_free_blk(zilog->zl_spa, &bp, txg);
+			}
 		}
 	}
 	mutex_exit(&zilog->zl_lock);
@@ -746,7 +751,7 @@
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
 		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
 }
 
--- a/usr/src/uts/common/fs/zfs/zio.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zio.c	Tue Sep 29 07:29:35 2009 -0700
@@ -2196,8 +2196,9 @@
 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
 
-		if ((zio->io_error == EIO ||
-		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
+		if ((zio->io_error == EIO || !(zio->io_flags &
+		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+		    zio == lio) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
--- a/usr/src/uts/common/fs/zfs/zio_inject.c	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c	Tue Sep 29 07:29:35 2009 -0700
@@ -184,7 +184,7 @@
 	int label;
 	int ret = 0;
 
-	if (offset + zio->io_size > VDEV_LABEL_START_SIZE &&
+	if (offset >= VDEV_LABEL_START_SIZE &&
 	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
 		return (0);
 
@@ -226,6 +226,18 @@
 	inject_handler_t *handler;
 	int ret = 0;
 
+	/*
+	 * We skip over faults in the labels unless it's during
+	 * device open (i.e. zio == NULL).
+	 */
+	if (zio != NULL) {
+		uint64_t offset = zio->io_offset;
+
+		if (offset < VDEV_LABEL_START_SIZE ||
+		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+		return (0);
+	}
+
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
@@ -243,6 +255,12 @@
 				continue;
 			}
 
+			/* Handle type specific I/O failures */
+			if (zio != NULL &&
+			    handler->zi_record.zi_iotype != ZIO_TYPES &&
+			    handler->zi_record.zi_iotype != zio->io_type)
+				continue;
+
 			if (handler->zi_record.zi_error == error) {
 				/*
 				 * For a failed open, pretend like the device
--- a/usr/src/uts/common/sys/fs/zfs.h	Tue Sep 29 10:20:35 2009 +0200
+++ b/usr/src/uts/common/sys/fs/zfs.h	Tue Sep 29 07:29:35 2009 -0700
@@ -692,7 +692,7 @@
 /*
  * Note: This is encoded on-disk, so new events must be added to the
  * end, and unused events can not be removed.  Be sure to edit
- * zpool_main.c: hist_event_table[].
+ * libzfs_pool.c: hist_event_table[].
  */
 typedef enum history_internal_events {
 	LOG_NO_EVENT = 0,