changeset 10921:8aac17999e4d

PSARC 2009/479 zpool recovery support
6667683 need a way to rollback to an uberblock from a previous txg
6885998 bad ASSERT() in traverse_zil_block()
author Tim Haley <Tim.Haley@Sun.COM>
date Fri, 30 Oct 2009 18:47:17 -0600
parents 5610c58a888f
children e2081f502306
files usr/src/cmd/fm/dicts/ZFS.po usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c usr/src/cmd/zdb/zdb.c usr/src/cmd/zinject/zinject.c usr/src/cmd/zpool/zpool_main.c usr/src/common/zfs/zfs_comutil.c usr/src/common/zfs/zfs_comutil.h usr/src/lib/libzfs/common/libzfs.h usr/src/lib/libzfs/common/libzfs_pool.c usr/src/lib/libzfs/common/mapfile-vers usr/src/uts/common/fs/zfs/dmu_traverse.c usr/src/uts/common/fs/zfs/dsl_dir.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_config.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/space_map.c usr/src/uts/common/fs/zfs/sys/dmu_traverse.h usr/src/uts/common/fs/zfs/sys/dsl_pool.h usr/src/uts/common/fs/zfs/sys/metaslab_impl.h usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/sys/txg.h usr/src/uts/common/fs/zfs/sys/uberblock_impl.h usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/txg.c usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_label.c usr/src/uts/common/fs/zfs/zfs_fm.c usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/fs/zfs/zio_inject.c usr/src/uts/common/sys/fs/zfs.h
diffstat 36 files changed, 1019 insertions(+), 184 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/fm/dicts/ZFS.po	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/cmd/fm/dicts/ZFS.po	Fri Oct 30 18:47:17 2009 -0600
@@ -1,5 +1,5 @@
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # CDDL HEADER START
@@ -21,7 +21,6 @@
 #
 # CDDL HEADER END
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
 #
 # DO NOT EDIT -- this file is generated by the Event Registry.
 #
@@ -136,7 +135,7 @@
 msgid "ZFS-8000-72.impact"
 msgstr "The pool is no longer available"
 msgid "ZFS-8000-72.action"
-msgstr "\nIf this error is encountered during 'zpool import', see the section\nbelow.  Otherwise, run 'zpool status -x' to determine which pool is\nfaulted:\n\n\n# zpool status -x\n# zpool import\n  pool: test\n    id: 13783646421373024673\n state: FAULTED\nstatus: The pool metadata is corrupted and cannot be opened.\naction: Destroy the pool and restore from backup.\n   see: http://www.sun.com/msg/ZFS-8000-72\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  FAULTED      0     0     2  corrupted data\n          mirror              DEGRADED     0     0     2\n            c0t0d0            ONLINE       0     0     2\n            c0t0d1            ONLINE       0     0     2\n\nerrors: No known errors\n\n\nEven though all the devices are available, the on-disk data has been\ncorrupted such that the pool cannot be opened.  All data within the pool is\nlost, and the pool must be destroyed and restored from an appropriate backup\nsource.  ZFS includes built-in metadata replication to prevent this from\nhappening even for unreplicated pools, but running in a replicated configuration\nwill decrease the chances of this happening in the future.\n\nIf this error is encountered during 'zpool import', the pool is\nunrecoverable and cannot be imported.  The pool must be restored from an\nappropriate backup source.\n	"
+msgstr "\nEven though all the devices are available, the on-disk data\nhas been corrupted such that the pool cannot be opened.  If a recovery\naction is presented, the pool can be returned to a usable state.\nOtherwise, all data within the pool is lost, and the pool must be\ndestroyed and restored from an appropriate backup source.  ZFS\nincludes built-in metadata replication to prevent this from happening\neven for unreplicated pools, but running in a replicated configuration\nwill decrease the chances of this happening in the future.\n\nIf this error is encountered during 'zpool import', see the\nsection below.  Otherwise, run 'zpool status -x' to determine which\npool is faulted and if a recovery option is available:\n\n\n# zpool status -x\n  pool: test\n    id: 13783646421373024673\n state: FAULTED\nstatus: The pool metadata is corrupted and cannot be opened.\naction: Recovery is possible, but will result in some data loss.\n        Returning the pool to its state as of Mon Sep 28 10:24:39 2009\n        should correct the problem.  Approximately 59 seconds of data\n        will have to be discarded, irreversibly.  Recovery can be\n        attempted by executing 'zpool clear -F test'.  A scrub of the pool\n        is strongly recommended following a successful recovery.\n   see: http://www.sun.com/msg/ZFS-8000-72\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  FAULTED      0     0     2  corrupted data\n            c0t0d0            ONLINE       0     0     2\n            c0t0d1            ONLINE       0     0     2\n\n\nIf recovery is unavailable, the recommended action will be:\n\n\naction: Destroy the pool and restore from backup.\n\n\nIf this error is encountered during 'zpool import', and if no\nrecovery option is mentioned, the pool is unrecoverable and cannot be\nimported.  The pool must be restored from an appropriate backup\nsource.  If a recovery option is available, the output from 'zpool\nimport' will look something like the following:\n\n\n# zpool import share\ncannot import 'share': I/O error\n        Recovery is possible, but will result in some data loss.\n        Returning the pool to its state as of Sun Sep 27 12:31:07 2009\n        should correct the problem.  Approximately 53 seconds of data\n        will have to be discarded, irreversibly.  Recovery can be\n        attempted by executing 'zpool import -F share'.  A scrub of the pool\n        is strongly recommended following a successful recovery.\n\n\nRecovery actions are requested with the -F option to either\n'zpool clear' or 'zpool import'.  Recovery will result in some data\nloss, because it reverts the pool to an earlier state.  A dry-run\nrecovery check can be performed by adding the -n option, affirming if\nrecovery is possible without actually reverting the pool to its\nearlier state.\n"
 #
 # code: ZFS-8000-8A
 # keys: ereport.fs.zfs.object.corrupt_data
@@ -200,7 +199,7 @@
 msgid "ZFS-8000-CS.impact"
 msgstr "The pool data is unavailable"
 msgid "ZFS-8000-CS.action"
-msgstr "Run 'zpool status -x' and either attach the missing device or\n	    restore from backup."
+msgstr "Run 'zpool status -x' and attach any missing devices, follow\n     any provided recovery instructions or restore from backup."
 #
 # code: ZFS-8000-D3
 # keys: fault.fs.zfs.device
--- a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c	Fri Oct 30 18:47:17 2009 -0600
@@ -516,7 +516,7 @@
 			/*
 			 * For pool-level repair events, clear the entire pool.
 			 */
-			(void) zpool_clear(zhp, NULL);
+			(void) zpool_clear(zhp, NULL, NULL);
 			zpool_close(zhp);
 			continue;
 		}
--- a/usr/src/cmd/zdb/zdb.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Fri Oct 30 18:47:17 2009 -0600
@@ -1619,7 +1619,7 @@
 	zdb_cb_t *zcb = arg;
 	char blkbuf[BP_SPRINTF_LEN];
 	dmu_object_type_t type;
-	boolean_t is_l0_metadata;
+	boolean_t is_metadata;
 
 	if (bp == NULL)
 		return (0);
@@ -1628,23 +1628,15 @@
 
 	zdb_count_block(spa, zcb, bp, type);
 
-	/*
-	 * if we do metadata-only checksumming there's no need to checksum
-	 * indirect blocks here because it is done during traverse
-	 */
-	is_l0_metadata = (BP_GET_LEVEL(bp) == 0 && type < DMU_OT_NUMTYPES &&
-	    dmu_ot[type].ot_metadata);
+	is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
 
 	if (dump_opt['c'] > 1 || dump_opt['S'] ||
-	    (dump_opt['c'] && is_l0_metadata)) {
-		int ioerr, size;
-		void *data;
-
-		size = BP_GET_LSIZE(bp);
-		data = malloc(size);
-		ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
+	    (dump_opt['c'] && is_metadata)) {
+		size_t size = BP_GET_PSIZE(bp);
+		void *data = malloc(size);
+		int ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
 		    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb));
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 		free(data);
 
 		/* We expect io errors on intent log */
@@ -1739,7 +1731,7 @@
 		bplist_close(bpl);
 	}
 
-	zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
+	zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb, 0);
 
 	if (zcb.zcb_haderrors && !dump_opt['S']) {
 		(void) printf("\nError counts:\n\n");
@@ -2327,6 +2319,8 @@
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target;
+	nvlist_t *policy = NULL;
+	uint64_t max_txg = UINT64_MAX;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
@@ -2393,8 +2387,8 @@
 				usage();
 			break;
 		case 't':
-			ub_max_txg = strtoull(optarg, NULL, 0);
-			if (ub_max_txg < TXG_INITIAL) {
+			max_txg = strtoull(optarg, NULL, 0);
+			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
@@ -2453,8 +2447,17 @@
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
+			if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+			    nvlist_add_uint64(policy,
+			    ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
+			    nvlist_add_nvlist(cfg,
+			    ZPOOL_REWIND_POLICY, policy) != 0) {
+				fatal("can't open '%s': %s",
+				    target, strerror(ENOMEM));
+			}
 			if ((error = spa_import(name, cfg, NULL)) != 0)
 				error = spa_import_verbatim(name, cfg, NULL);
+			nvlist_free(policy);
 		}
 	}
 
--- a/usr/src/cmd/zinject/zinject.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/cmd/zinject/zinject.c	Fri Oct 30 18:47:17 2009 -0600
@@ -236,6 +236,13 @@
 	    "\tzinject -d device -A <degrade|fault> pool\n"
 	    "\t\tPerform a specific action on a particular device\n"
 	    "\n"
+	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+	    "\t\tCause the pool to stop writing blocks yet not\n"
+	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
+	    "\t\tthat fails to honor cache flush requests.\n"
+	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
+	    "\t\tat the end of the duration.\n"
+	    "\n"
 	    "\tzinject -b objset:object:level:blkid pool\n"
 	    "\n"
 	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
@@ -479,6 +486,12 @@
 		} else if (record->zi_func[0] != '\0') {
 			(void) printf("  panic function: %s\n",
 			    record->zi_func);
+		} else if (record->zi_duration > 0) {
+			(void) printf(" time: %lld seconds\n",
+			    (u_longlong_t)record->zi_duration);
+		} else if (record->zi_duration < 0) {
+			(void) printf(" txgs: %lld \n",
+			    (u_longlong_t)-record->zi_duration);
 		} else {
 			(void) printf("objset: %llu\n",
 			    (u_longlong_t)record->zi_objset);
@@ -537,6 +550,9 @@
 	char pool[MAXNAMELEN];
 	char dataset[MAXNAMELEN];
 	zfs_handle_t *zhp;
+	int nowrites = 0;
+	int dur_txg = 0;
+	int dur_secs = 0;
 	int ret;
 	int flags = 0;
 
@@ -569,7 +585,7 @@
 	}
 
 	while ((c = getopt(argc, argv,
-	    ":aA:b:d:f:Fqhc:t:T:l:mr:e:uL:p:")) != -1) {
+	    ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
@@ -621,9 +637,27 @@
 		case 'F':
 			record.zi_failfast = B_TRUE;
 			break;
+		case 'g':
+			dur_txg = 1;
+			record.zi_duration = (int)strtol(optarg, &end, 10);
+			if (record.zi_duration <= 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid duration '%s': "
+				    "must be a positive integer\n", optarg);
+				usage();
+				return (1);
+			}
+			/* store duration of txgs as its negative */
+			record.zi_duration *= -1;
+			break;
 		case 'h':
 			usage();
 			return (0);
+		case 'I':
+			/* default duration, if one hasn't yet been defined */
+			nowrites = 1;
+			if (dur_secs == 0 && dur_txg == 0)
+				record.zi_duration = 30;
+			break;
 		case 'l':
 			level = (int)strtol(optarg, &end, 10);
 			if (*end != '\0') {
@@ -646,6 +680,16 @@
 		case 'r':
 			range = optarg;
 			break;
+		case 's':
+			dur_secs = 1;
+			record.zi_duration = (int)strtol(optarg, &end, 10);
+			if (record.zi_duration <= 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid duration '%s': "
+				    "must be a positive integer\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 'T':
 			if (strcasecmp(optarg, "read") == 0) {
 				io_type = ZIO_TYPE_READ;
@@ -707,7 +751,8 @@
 		 * '-c' is invalid with any other options.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || record.zi_func[0] != '\0') {
+		    level != 0 || record.zi_func[0] != '\0' ||
+		    record.zi_duration != 0) {
 			(void) fprintf(stderr, "cancel (-c) incompatible with "
 			    "any other options\n");
 			usage();
@@ -739,7 +784,8 @@
 		 * for doing injection, so handle it separately here.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || record.zi_func[0] != '\0') {
+		    level != 0 || record.zi_func[0] != '\0' ||
+		    record.zi_duration != 0) {
 			(void) fprintf(stderr, "device (-d) incompatible with "
 			    "data error injection\n");
 			usage();
@@ -773,7 +819,7 @@
 
 	} else if (raw != NULL) {
 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
-		    record.zi_func[0] != '\0') {
+		    record.zi_func[0] != '\0' || record.zi_duration != 0) {
 			(void) fprintf(stderr, "raw (-b) format with "
 			    "any other options\n");
 			usage();
@@ -802,7 +848,7 @@
 			error = EIO;
 	} else if (record.zi_func[0] != '\0') {
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || device != NULL) {
+		    level != 0 || device != NULL || record.zi_duration != 0) {
 			(void) fprintf(stderr, "panic (-p) incompatible with "
 			    "other options\n");
 			usage();
@@ -818,10 +864,32 @@
 
 		(void) strcpy(pool, argv[0]);
 		dataset[0] = '\0';
+	} else if (record.zi_duration != 0) {
+		if (nowrites == 0) {
+			(void) fprintf(stderr, "-s or -g meaningless "
+			    "without -I (ignore writes)\n");
+			usage();
+			return (2);
+		} else if (dur_secs && dur_txg) {
+			(void) fprintf(stderr, "choose a duration either "
+			    "in seconds (-s) or a number of txgs (-g) "
+			    "but not both\n");
+			usage();
+			return (2);
+		} else if (argc != 1) {
+			(void) fprintf(stderr, "ignore writes (-I) "
+			    "injection requires a single pool name\n");
+			usage();
+			return (2);
+		}
+
+		(void) strcpy(pool, argv[0]);
+		dataset[0] = '\0';
 	} else if (type == TYPE_INVAL) {
 		if (flags == 0) {
 			(void) fprintf(stderr, "at least one of '-b', '-d', "
-			    "'-t', '-a', '-p', or '-u' must be specified\n");
+			    "'-t', '-a', '-p', '-I' or '-u' "
+			    "must be specified\n");
 			usage();
 			return (2);
 		}
--- a/usr/src/cmd/zpool/zpool_main.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Fri Oct 30 18:47:17 2009 -0600
@@ -186,7 +186,7 @@
 		return (gettext("\tattach [-f] <pool> <device> "
 		    "<new-device>\n"));
 	case HELP_CLEAR:
-		return (gettext("\tclear <pool> [device]\n"));
+		return (gettext("\tclear [-nF] <pool> [device]\n"));
 	case HELP_CREATE:
 		return (gettext("\tcreate [-fn] [-o property=value] ... \n"
 		    "\t    [-O file-system-property=value] ... \n"
@@ -201,6 +201,7 @@
 		return (gettext("\thistory [-il] [<pool>] ...\n"));
 	case HELP_IMPORT:
 		return (gettext("\timport [-d dir] [-D]\n"
+		    "\timport [-d dir | -c cachefile] [-n] -F <pool | id>\n"
 		    "\timport [-o mntopts] [-o property=value] ... \n"
 		    "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n"
 		    "\timport [-o mntopts] [-o property=value] ... \n"
@@ -1294,6 +1295,7 @@
 		free(name);
 	}
 }
+
 /*
  * Display the status for the given pool.
  */
@@ -1486,7 +1488,6 @@
 	char *name;
 	uint64_t state;
 	uint64_t version;
-	int error = 0;
 
 	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 	    &name) == 0);
@@ -1549,7 +1550,7 @@
 	}
 
 	zpool_close(zhp);
-	return (error);
+	return (0);
 }
 
 /*
@@ -1557,7 +1558,7 @@
  *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
  *              [-d dir | -c cachefile] [-f] -a
  *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
- *              [-d dir | -c cachefile] [-f] <pool | id> [newpool]
+ *              [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
  *
  *	 -c	Read pool information from a cachefile instead of searching
  *		devices.
@@ -1572,14 +1573,18 @@
  *		the given root.  The pool will remain exported when the machine
  *		is rebooted.
  *
- *       -f	Force import, even if it appears that the pool is active.
- *
- *       -F	Import even in the presence of faulted vdevs.  This is an
+ *       -V	Import even in the presence of faulted vdevs.  This is an
  *       	intentionally undocumented option for testing purposes, and
  *       	treats the pool configuration as complete, leaving any bad
  *		vdevs in the FAULTED state. In other words, it does verbatim
  *		import.
  *
+ *       -f	Force import, even if it appears that the pool is active.
+ *
+ *       -F     Attempt rewind if necessary.
+ *
+ *       -n     See if rewind would work, but don't actually rewind.
+ *
  *       -a	Import all pools found.
  *
  *       -o	Set property=value and/or temporary mount options (without '=').
@@ -1605,14 +1610,19 @@
 	char *searchname = NULL;
 	char *propval;
 	nvlist_t *found_config;
+	nvlist_t *policy = NULL;
 	nvlist_t *props = NULL;
 	boolean_t first;
 	boolean_t do_verbatim = B_FALSE;
+	uint32_t rewind_policy = ZPOOL_NO_REWIND;
+	boolean_t dryrun = B_FALSE;
+	boolean_t do_rewind = B_FALSE;
+	boolean_t xtreme_rewind = B_FALSE;
 	uint64_t pool_state;
 	char *cachefile = NULL;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":ac:d:DfFo:p:R:")) != -1) {
+	while ((c = getopt(argc, argv, ":aCc:d:DEfFno:p:rR:VX")) != -1) {
 		switch (c) {
 		case 'a':
 			do_all = B_TRUE;
@@ -1640,7 +1650,10 @@
 			do_force = B_TRUE;
 			break;
 		case 'F':
-			do_verbatim = B_TRUE;
+			do_rewind = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
 			break;
 		case 'o':
 			if ((propval = strchr(optarg, '=')) != NULL) {
@@ -1665,6 +1678,12 @@
 			    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
 				goto error;
 			break;
+		case 'V':
+			do_verbatim = B_TRUE;
+			break;
+		case 'X':
+			xtreme_rewind = B_TRUE;
+			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
@@ -1685,6 +1704,23 @@
 		usage(B_FALSE);
 	}
 
+	if ((dryrun || xtreme_rewind) && !do_rewind) {
+		(void) fprintf(stderr,
+		    gettext("-n or -X only meaningful with -F\n"));
+		usage(B_FALSE);
+	}
+	if (dryrun)
+		rewind_policy = ZPOOL_TRY_REWIND;
+	else if (do_rewind)
+		rewind_policy = ZPOOL_DO_REWIND;
+	if (xtreme_rewind)
+		rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+	/* In the future, we can capture further policy and include it here */
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
+		goto error;
+
 	if (searchdirs == NULL) {
 		searchdirs = safe_malloc(sizeof (char *));
 		searchdirs[0] = "/dev/dsk";
@@ -1712,6 +1748,7 @@
 			(void) fprintf(stderr, gettext("cannot "
 			    "discover pools: permission denied\n"));
 			free(searchdirs);
+			nvlist_free(policy);
 			return (1);
 		}
 	}
@@ -1759,6 +1796,7 @@
 			    "no such pool available\n"), argv[0]);
 		}
 		free(searchdirs);
+		nvlist_free(policy);
 		return (1);
 	}
 
@@ -1782,17 +1820,21 @@
 		if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
 			continue;
 
+		verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
+		    policy) == 0);
+
 		if (argc == 0) {
 			if (first)
 				first = B_FALSE;
 			else if (!do_all)
 				(void) printf("\n");
 
-			if (do_all)
+			if (do_all) {
 				err |= do_import(config, NULL, mntopts,
 				    do_force, props, do_verbatim);
-			else
+			} else {
 				show_import(config);
+			}
 		} else if (searchname != NULL) {
 			char *name;
 
@@ -1853,6 +1895,7 @@
 error:
 	nvlist_free(props);
 	nvlist_free(pools);
+	nvlist_free(policy);
 	free(searchdirs);
 
 	return (err ? 1 : 0);
@@ -2793,31 +2836,80 @@
 int
 zpool_do_clear(int argc, char **argv)
 {
+	int c;
 	int ret = 0;
+	boolean_t dryrun = B_FALSE;
+	boolean_t do_rewind = B_FALSE;
+	boolean_t xtreme_rewind = B_FALSE;
+	uint32_t rewind_policy = ZPOOL_NO_REWIND;
+	nvlist_t *policy = NULL;
 	zpool_handle_t *zhp;
 	char *pool, *device;
 
-	if (argc < 2) {
+	/* check options */
+	while ((c = getopt(argc, argv, "FnX")) != -1) {
+		switch (c) {
+		case 'F':
+			do_rewind = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'X':
+			xtreme_rewind = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing pool name\n"));
 		usage(B_FALSE);
 	}
 
-	if (argc > 3) {
+	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
-	pool = argv[1];
-	device = argc == 3 ? argv[2] : NULL;
-
-	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
+	if ((dryrun || xtreme_rewind) && !do_rewind) {
+		(void) fprintf(stderr,
+		    gettext("-n or -X only meaningful with -F\n"));
+		usage(B_FALSE);
+	}
+	if (dryrun)
+		rewind_policy = ZPOOL_TRY_REWIND;
+	else if (do_rewind)
+		rewind_policy = ZPOOL_DO_REWIND;
+	if (xtreme_rewind)
+		rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+	/* In future, further rewind policy choices can be passed along here */
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
 		return (1);
 
-	if (zpool_clear(zhp, device) != 0)
+	pool = argv[0];
+	device = argc == 2 ? argv[1] : NULL;
+
+	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+		nvlist_free(policy);
+		return (1);
+	}
+
+	if (zpool_clear(zhp, device, policy) != 0)
 		ret = 1;
 
 	zpool_close(zhp);
 
+	nvlist_free(policy);
+
 	return (ret);
 }
 
@@ -3121,8 +3213,8 @@
 		    "be used because the label is missing \n\tor invalid.  "
 		    "There are insufficient replicas for the pool to "
 		    "continue\n\tfunctioning.\n"));
-		(void) printf(gettext("action: Destroy and re-create the pool "
-		    "from a backup source.\n"));
+		zpool_explain_recover(zpool_get_handle(zhp),
+		    zpool_get_name(zhp), reason, config);
 		break;
 
 	case ZPOOL_STATUS_FAILING_DEV:
@@ -3177,8 +3269,8 @@
 	case ZPOOL_STATUS_CORRUPT_POOL:
 		(void) printf(gettext("status: The pool metadata is corrupted "
 		    "and the pool cannot be opened.\n"));
-		(void) printf(gettext("action: Destroy and re-create the pool "
-		    "from a backup source.\n"));
+		zpool_explain_recover(zpool_get_handle(zhp),
+		    zpool_get_name(zhp), reason, config);
 		break;
 
 	case ZPOOL_STATUS_VERSION_OLDER:
--- a/usr/src/common/zfs/zfs_comutil.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/common/zfs/zfs_comutil.c	Fri Oct 30 18:47:17 2009 -0600
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * This file is intended for functions that ought to be common between user
  * land (libzfs) and the kernel. When many common routines need to be shared
@@ -33,10 +31,13 @@
 
 #if defined(_KERNEL)
 #include <sys/systm.h>
+#else
+#include <string.h>
 #endif
 
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
+#include <sys/int_limits.h>
 #include <sys/nvpair.h>
 
 /*
@@ -63,3 +64,41 @@
 	}
 	return (B_FALSE);
 }
+
+void
+zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp)
+{
+	nvlist_t *policy;
+	nvpair_t *elem;
+	char *nm;
+
+	/* Defaults */
+	zrpp->zrp_request = ZPOOL_NO_REWIND;
+	zrpp->zrp_maxmeta = 0;
+	zrpp->zrp_maxdata = UINT32_MAX;
+	zrpp->zrp_txg = UINT64_MAX;
+
+	if (nvl == NULL)
+		return;
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+		nm = nvpair_name(elem);
+		if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) {
+			if (nvpair_value_nvlist(elem, &policy) == 0)
+				zpool_get_rewind_policy(policy, zrpp);
+			return;
+		} else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) {
+			if (nvpair_value_uint32(elem,
+			    &zrpp->zrp_request) == 0)
+				if (zrpp->zrp_request & ~ZPOOL_REWIND_MASK)
+					zrpp->zrp_request = ZPOOL_NO_REWIND;
+		} else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) {
+			(void) nvpair_value_uint64(elem, &zrpp->zrp_txg);
+		} else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) {
+			(void) nvpair_value_uint32(elem, &zrpp->zrp_maxmeta);
+		} else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) {
+			(void) nvpair_value_uint32(elem, &zrpp->zrp_maxdata);
+		}
+	}
+}
--- a/usr/src/common/zfs/zfs_comutil.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/common/zfs/zfs_comutil.h	Fri Oct 30 18:47:17 2009 -0600
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_ZFS_COMUTIL_H
 #define	_ZFS_COMUTIL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 
@@ -35,7 +33,8 @@
 extern "C" {
 #endif
 
-extern boolean_t zfs_allocatable_devs(nvlist_t *nv);
+extern boolean_t zfs_allocatable_devs(nvlist_t *);
+extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/lib/libzfs/common/libzfs.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/lib/libzfs/common/libzfs.h	Fri Oct 30 18:47:17 2009 -0600
@@ -216,7 +216,7 @@
  * Functions to manipulate pool and vdev state
  */
 extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
-extern int zpool_clear(zpool_handle_t *, const char *);
+extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 
 extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
@@ -347,6 +347,8 @@
     size_t len);
 extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
 extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
+extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
+    nvlist_t *);
 
 /*
  * Basic handle manipulations.  These functions do not create or destroy the
--- a/usr/src/lib/libzfs/common/libzfs_pool.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c	Fri Oct 30 18:47:17 2009 -0600
@@ -41,6 +41,7 @@
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
+#include "zfs_comutil.h"
 
 const char *hist_event_table[LOG_END] = {
 	"invalid event",
@@ -1240,6 +1241,127 @@
 	return (zpool_export_common(zhp, B_TRUE, B_TRUE));
 }
 
+static void
+zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
+    nvlist_t *rbi)
+{
+	uint64_t rewindto;
+	int64_t loss = -1;
+	struct tm t;
+	char timestr[128];
+
+	if (!hdl->libzfs_printerr || rbi == NULL)
+		return;
+
+	if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+		return;
+	(void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss);
+
+	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+	    strftime(timestr, 128, 0, &t) != 0) {
+		if (dryrun) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "Would be able to return %s "
+			    "to its state as of %s.\n"),
+			    name, timestr);
+		} else {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "Pool %s returned to its state as of %s.\n"),
+			    name, timestr);
+		}
+		if (loss > 120) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "%s approximately %lld "),
+			    dryrun ? "Would discard" : "Discarded",
+			    (loss + 30) / 60);
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "minutes of transactions.\n"));
+		} else if (loss > 0) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "%s approximately %lld "),
+			    dryrun ? "Would discard" : "Discarded", loss);
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "seconds of transactions.\n"));
+		}
+	}
+}
+
+void
+zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
+    nvlist_t *config)
+{
+	int64_t loss = -1;
+	uint64_t edata = UINT64_MAX;
+	uint64_t rewindto;
+	struct tm t;
+	char timestr[128];
+
+	if (!hdl->libzfs_printerr)
+		return;
+
+	if (reason >= 0)
+		(void) printf(dgettext(TEXT_DOMAIN, "action: "));
+	else
+		(void) printf(dgettext(TEXT_DOMAIN, "\t"));
+
+	/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
+	if (nvlist_lookup_uint64(config,
+	    ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+		goto no_info;
+
+	(void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss);
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+	    &edata);
+
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "Recovery is possible, but will result in some data loss.\n"));
+
+	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+	    strftime(timestr, 128, 0, &t) != 0) {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "\tReturning the pool to its state as of %s\n"
+		    "\tshould correct the problem.  "),
+		    timestr);
+	} else {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "\tReverting the pool to an earlier state "
+		    "should correct the problem.\n\t"));
+	}
+
+	if (loss > 120) {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "Approximately %lld minutes of data\n"
+		    "\tmust be discarded, irreversibly.  "), (loss + 30) / 60);
+	} else if (loss > 0) {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "Approximately %lld seconds of data\n"
+		    "\tmust be discarded, irreversibly.  "), loss);
+	}
+	if (edata != 0 && edata != UINT64_MAX) {
+		if (edata == 1) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "After rewind, at least\n"
+			    "\tone persistent user-data error will remain.  "));
+		} else {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "After rewind, several\n"
+			    "\tpersistent user-data errors will remain.  "));
+		}
+	}
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "Recovery can be\n\tattempted by executing "
+	    "'zpool %s -F %s'.  "), reason >= 0 ? "clear" : "import", name);
+
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "A scrub of the pool\n"
+	    "\tis strongly recommended after recovery.\n"));
+	return;
+
+no_info:
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "Destroy and re-create the pool from\n\ta backup source.\n"));
+}
+
 /*
  * zpool_import() is a contracted interface. Should be kept the same
  * if possible.
@@ -1289,8 +1411,11 @@
     nvlist_t *props, boolean_t importfaulted)
 {
 	zfs_cmd_t zc = { 0 };
+	zpool_rewind_policy_t policy;
+	nvlist_t *nvi = NULL;
 	char *thename;
 	char *origname;
+	uint64_t returned_size;
 	int ret;
 	char errbuf[1024];
 
@@ -1334,11 +1459,30 @@
 		nvlist_free(props);
 		return (-1);
 	}
+	returned_size =  zc.zc_nvlist_conf_size + 512;
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) {
+		nvlist_free(props);
+		return (-1);
+	}
 
 	zc.zc_cookie = (uint64_t)importfaulted;
 	ret = 0;
 	if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
 		char desc[1024];
+
+		(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+		zpool_get_rewind_policy(config, &policy);
+		/*
+		 * Dry-run failed, but we print out what success
+		 * looks like if we found a best txg
+		 */
+		if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) {
+			zpool_rewind_exclaim(hdl, newname ? origname : thename,
+			    B_TRUE, nvi);
+			nvlist_free(nvi);
+			return (-1);
+		}
+
 		if (newname == NULL)
 			(void) snprintf(desc, sizeof (desc),
 			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
@@ -1361,7 +1505,12 @@
 			break;
 
 		default:
+			(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
 			(void) zpool_standard_error(hdl, errno, desc);
+			zpool_explain_recover(hdl,
+			    newname ? origname : thename, -errno, nvi);
+			nvlist_free(nvi);
+			break;
 		}
 
 		ret = -1;
@@ -1375,6 +1524,16 @@
 			ret = -1;
 		else if (zhp != NULL)
 			zpool_close(zhp);
+		(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+		zpool_get_rewind_policy(config, &policy);
+		if (policy.zrp_request &
+		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+			zpool_rewind_exclaim(hdl, newname ? origname : thename,
+			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+			    nvi);
+		}
+		nvlist_free(nvi);
+		return (0);
 	}
 
 	zcmd_free_nvlists(&zc);
@@ -2352,13 +2511,15 @@
  * Clear the errors for the pool, or the particular device if specified.
  */
 int
-zpool_clear(zpool_handle_t *zhp, const char *path)
+zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
 {
 	zfs_cmd_t zc = { 0 };
 	char msg[1024];
 	nvlist_t *tgt;
+	zpool_rewind_policy_t policy;
 	boolean_t avail_spare, l2cache;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	nvlist_t *nvi = NULL;
 
 	if (path)
 		(void) snprintf(msg, sizeof (msg),
@@ -2386,9 +2547,31 @@
 		    &zc.zc_guid) == 0);
 	}
 
-	if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0)
+	zpool_get_rewind_policy(rewindnvl, &policy);
+	zc.zc_cookie = policy.zrp_request;
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0)
+		return (-1);
+
+	if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0)
+		return (-1);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 ||
+	    ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+	    errno != EPERM && errno != EACCES)) {
+		if (policy.zrp_request &
+		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+			(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+			zpool_rewind_exclaim(hdl, zc.zc_name,
+			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+			    nvi);
+			nvlist_free(nvi);
+		}
+		zcmd_free_nvlists(&zc);
 		return (0);
-
+	}
+
+	zcmd_free_nvlists(&zc);
 	return (zpool_standard_error(hdl, errno, msg));
 }
 
--- a/usr/src/lib/libzfs/common/mapfile-vers	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/lib/libzfs/common/mapfile-vers	Fri Oct 30 18:47:17 2009 -0600
@@ -158,6 +158,7 @@
 	zpool_disable_datasets;
 	zpool_enable_datasets;
 	zpool_expand_proplist;
+	zpool_explain_recover;
 	zpool_export;
 	zpool_export_force;
 	zpool_find_import;
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Fri Oct 30 18:47:17 2009 -0600
@@ -84,7 +84,7 @@
 	zb.zb_object = 0;
 	zb.zb_level = -1;
 	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-	VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+	(void) td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg);
 }
 
 /* ARGSUSED */
@@ -108,7 +108,7 @@
 		zb.zb_object = lr->lr_foid;
 		zb.zb_level = BP_GET_LEVEL(bp);
 		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
-		VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+		(void) td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg);
 	}
 }
 
@@ -378,7 +378,7 @@
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
+traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start)
 {
 	int err;
 	uint64_t obj;
@@ -387,12 +387,13 @@
 
 	/* visit the MOS */
 	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
-	    0, TRAVERSE_PRE, func, arg);
+	    txg_start, TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg);
 	if (err)
 		return (err);
 
 	/* visit each dataset */
-	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE,
+	    txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
@@ -401,14 +402,17 @@
 
 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
+			uint64_t txg = txg_start;
+
 			rw_enter(&dp->dp_config_rwlock, RW_READER);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			rw_exit(&dp->dp_config_rwlock);
 			if (err)
 				return (err);
-			err = traverse_dataset(ds,
-			    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
-			    func, arg);
+			if (ds->ds_phys->ds_prev_snap_txg > txg)
+				txg = ds->ds_phys->ds_prev_snap_txg;
+			err = traverse_dataset(ds, txg,
+			    TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg);
 			dsl_dataset_rele(ds, FTAG);
 			if (err)
 				return (err);
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri Oct 30 18:47:17 2009 -0600
@@ -678,8 +678,9 @@
 {
 	uint64_t txg = tx->tx_txg;
 	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+	uint64_t deferred = 0;
 	struct tempreserve *tr;
-	int enospc = EDQUOT;
+	int retval = EDQUOT;
 	int txgidx = txg & TXG_MASK;
 	int i;
 	uint64_t ref_rsrv = 0;
@@ -725,7 +726,8 @@
 		quota = dd->dd_phys->dd_quota;
 
 	/*
-	 * Adjust the quota against the actual pool size at the root.
+	 * Adjust the quota against the actual pool size at the root
+	 * minus any outstanding deferred frees.
 	 * To ensure that it's possible to remove files from a full
 	 * pool without inducing transient overcommits, we throttle
 	 * netfree transactions against a quota that is slightly larger,
@@ -735,9 +737,10 @@
 	 */
 	if (dd->dd_parent == NULL) {
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
-		if (poolsize < quota) {
-			quota = poolsize;
-			enospc = ENOSPC;
+		deferred = spa_get_defers(dd->dd_pool->dp_spa);
+		if (poolsize - deferred < quota) {
+			quota = poolsize - deferred;
+			retval = ENOSPC;
 		}
 	}
 
@@ -747,15 +750,16 @@
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
-	if (used_on_disk + est_inflight > quota) {
-		if (est_inflight > 0 || used_on_disk < quota)
-			enospc = ERESTART;
+	if (used_on_disk + est_inflight >= quota) {
+		if (est_inflight > 0 || used_on_disk < quota ||
+		    (retval == ENOSPC && used_on_disk < quota + deferred))
+			retval = ERESTART;
 		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 		    "quota=%lluK tr=%lluK err=%d\n",
 		    used_on_disk>>10, est_inflight>>10,
-		    quota>>10, asize>>10, enospc);
+		    quota>>10, asize>>10, retval);
 		mutex_exit(&dd->dd_lock);
-		return (enospc);
+		return (retval);
 	}
 
 	/* We need to up our estimated delta before dropping dd_lock */
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Fri Oct 30 18:47:17 2009 -0600
@@ -503,16 +503,8 @@
 		metaslab_sync_done(msp, 0);
 
 	if (txg != 0) {
-		/*
-		 * The vdev is dirty, but the metaslab isn't -- it just needs
-		 * to have metaslab_sync_done() invoked from vdev_sync_done().
-		 * [We could just dirty the metaslab, but that would cause us
-		 * to allocate a space map object for it, which is wasteful
-		 * and would mess up the locality logic in metaslab_weight().]
-		 */
-		ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
 		vdev_dirty(vd, 0, NULL, txg);
-		vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+		vdev_dirty(vd, VDD_METASLAB, msp, txg);
 	}
 
 	return (msp);
@@ -522,10 +514,9 @@
 metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	int t;
 
 	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
-	    -msp->ms_smo.smo_alloc, B_TRUE);
+	    -msp->ms_smo.smo_alloc, 0, B_TRUE);
 
 	metaslab_group_remove(mg, msp);
 
@@ -534,11 +525,16 @@
 	space_map_unload(&msp->ms_map);
 	space_map_destroy(&msp->ms_map);
 
-	for (t = 0; t < TXG_SIZE; t++) {
+	for (int t = 0; t < TXG_SIZE; t++) {
 		space_map_destroy(&msp->ms_allocmap[t]);
 		space_map_destroy(&msp->ms_freemap[t]);
 	}
 
+	for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		space_map_destroy(&msp->ms_defermap[t]);
+
+	ASSERT3S(msp->ms_deferspace, ==, 0);
+
 	mutex_exit(&msp->ms_lock);
 	mutex_destroy(&msp->ms_lock);
 
@@ -607,11 +603,18 @@
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
-		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
-		if (error) {
-			metaslab_group_sort(msp->ms_group, msp, 0);
-			return (error);
+		space_map_load_wait(sm);
+		if (!sm->sm_loaded) {
+			int error = space_map_load(sm, sm_ops, SM_FREE,
+			    &msp->ms_smo,
+			    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
+			if (error) {
+				metaslab_group_sort(msp->ms_group, msp, 0);
+				return (error);
+			}
+			for (int t = 0; t < TXG_DEFER_SIZE; t++)
+				space_map_walk(&msp->ms_defermap[t],
+				    space_map_claim, sm);
 		}
 
 		/*
@@ -659,11 +662,11 @@
 	space_map_obj_t *smo = &msp->ms_smo_syncing;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
-	int t;
 
 	ASSERT(!vd->vdev_ishole);
 
-	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+		return;
 
 	/*
 	 * The only state that can actually be changing concurrently with
@@ -673,12 +676,12 @@
 	 * We drop it whenever we call into the DMU, because the DMU
 	 * can call down to us (e.g. via zio_free()) at any time.
 	 */
-	mutex_enter(&msp->ms_lock);
+
+	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	if (smo->smo_object == 0) {
 		ASSERT(smo->smo_objsize == 0);
 		ASSERT(smo->smo_alloc == 0);
-		mutex_exit(&msp->ms_lock);
 		smo->smo_object = dmu_object_alloc(mos,
 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
@@ -686,9 +689,10 @@
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    (sm->sm_start >> vd->vdev_ms_shift),
 		    sizeof (uint64_t), &smo->smo_object, tx);
-		mutex_enter(&msp->ms_lock);
 	}
 
+	mutex_enter(&msp->ms_lock);
+
 	space_map_walk(freemap, space_map_add, freed_map);
 
 	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
@@ -701,6 +705,7 @@
 		 * This metaslab is 100% allocated,
 		 * minus the content of the in-core map (sm),
 		 * minus what's been freed this txg (freed_map),
+		 * minus deferred frees (ms_defermap[]),
 		 * minus allocations from txgs in the future
 		 * (because they haven't been committed yet).
 		 */
@@ -712,7 +717,11 @@
 		space_map_walk(sm, space_map_remove, allocmap);
 		space_map_walk(freed_map, space_map_remove, allocmap);
 
-		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+			space_map_walk(&msp->ms_defermap[t],
+			    space_map_remove, allocmap);
+
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
 			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
 			    space_map_remove, allocmap);
 
@@ -746,9 +755,10 @@
 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
 	space_map_t *sm = &msp->ms_map;
 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
-	int t;
+	int64_t alloc_delta, defer_delta;
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -759,16 +769,25 @@
 	 * allocmaps and freemaps and add its capacity to the vdev.
 	 */
 	if (freed_map->sm_size == 0) {
-		for (t = 0; t < TXG_SIZE; t++) {
+		for (int t = 0; t < TXG_SIZE; t++) {
 			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 			space_map_create(&msp->ms_freemap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 		}
-		vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+
+		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+			space_map_create(&msp->ms_defermap[t], sm->sm_start,
+			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+
+		vdev_space_update(vd, sm->sm_size, 0, 0, B_TRUE);
 	}
 
-	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+	defer_delta = freed_map->sm_space - defer_map->sm_space;
+
+	vdev_space_update(vd, 0, alloc_delta + defer_delta,
+	    defer_delta, B_TRUE);
 
 	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
 	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -776,13 +795,26 @@
 	/*
 	 * If there's a space_map_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
-	 * Then, add everything we freed in this txg to the map.
+	 * Then, add defer_map (oldest deferred frees) to this map and
+	 * transfer freed_map (this txg's frees) to defer_map.
 	 */
 	space_map_load_wait(sm);
-	space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
+	space_map_vacate(freed_map, space_map_add, defer_map);
 
 	*smo = *smosync;
 
+	msp->ms_deferspace += defer_delta;
+	ASSERT3S(msp->ms_deferspace, >=, 0);
+	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+	if (msp->ms_deferspace != 0) {
+		/*
+		 * Keep syncing this metaslab until all deferred frees
+		 * are back in circulation.
+		 */
+		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+	}
+
 	/*
 	 * If the map is loaded but no longer active, evict it as soon as all
 	 * future allocations have synced.  (If we unloaded it now and then
@@ -791,7 +823,7 @@
 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int evictable = 1;
 
-		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
 				evictable = 0;
 
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Fri Oct 30 18:47:17 2009 -0600
@@ -1151,12 +1151,91 @@
 		spa_check_removed(sav->sav_vdevs[i]);
 }
 
+typedef struct spa_load_error {
+	uint64_t	sle_metadata_count;
+	uint64_t	sle_data_count;
+} spa_load_error_t;
+
+static void
+spa_load_verify_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	spa_load_error_t *sle = zio->io_private;
+	dmu_object_type_t type = BP_GET_TYPE(bp);
+	int error = zio->io_error;
+
+	if (error) {
+		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+		    type != DMU_OT_INTENT_LOG)
+			atomic_add_64(&sle->sle_metadata_count, 1);
+		else
+			atomic_add_64(&sle->sle_data_count, 1);
+	}
+	zio_data_buf_free(zio->io_data, zio->io_size);
+}
+
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp, void *arg)
+{
+	if (bp != NULL) {
+		zio_t *rio = arg;
+		size_t size = BP_GET_PSIZE(bp);
+		void *data = zio_data_buf_alloc(size);
+
+		zio_nowait(zio_read(rio, spa, bp, data, size,
+		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+	}
+	return (0);
+}
+
+static int
+spa_load_verify(spa_t *spa)
+{
+	zio_t *rio;
+	spa_load_error_t sle = { 0 };
+	zpool_rewind_policy_t policy;
+	boolean_t verify_ok = B_FALSE;
+	int error;
+
+	rio = zio_root(spa, NULL, &sle,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+	error = traverse_pool(spa, spa_load_verify_cb, rio,
+	    spa->spa_verify_min_txg);
+
+	(void) zio_wait(rio);
+
+	zpool_get_rewind_policy(spa->spa_config, &policy);
+
+	spa->spa_load_meta_errors = sle.sle_metadata_count;
+	spa->spa_load_data_errors = sle.sle_data_count;
+
+	if (!error && sle.sle_metadata_count <= policy.zrp_maxmeta &&
+	    sle.sle_data_count <= policy.zrp_maxdata) {
+		verify_ok = B_TRUE;
+		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+	}
+
+	if (error) {
+		if (error != ENXIO && error != EIO)
+			error = EIO;
+		return (error);
+	}
+
+	return (verify_ok ? 0 : EIO);
+}
+
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
 static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+spa_load(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
 	int error = 0;
 	nvlist_t *nvconfig, *nvroot = NULL;
@@ -1168,6 +1247,7 @@
 	uint64_t autoreplace = 0;
 	int orig_mode = spa->spa_mode;
 	char *ereport = FM_EREPORT_ZFS_POOL;
+	nvlist_t *config = spa->spa_config;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
@@ -1296,11 +1376,15 @@
 	 */
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
-	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+	    TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE;
+	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
 		goto out;
 	}
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
@@ -1359,7 +1443,7 @@
 		spa_deactivate(spa);
 		spa_activate(spa, orig_mode);
 
-		return (spa_load(spa, nvconfig, state, B_TRUE));
+		return (spa_load(spa, state, B_TRUE));
 	}
 
 	if (zap_lookup(spa->spa_meta_objset,
@@ -1569,7 +1653,17 @@
 		goto out;
 	}
 
-	if (spa_writeable(spa)) {
+	if (state != SPA_LOAD_TRYIMPORT) {
+		error = spa_load_verify(spa);
+		if (error) {
+			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			goto out;
+		}
+	}
+
+	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
 
@@ -1578,6 +1672,7 @@
 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
+		 * Price of rollback is that we abandon the log.
 		 */
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
@@ -1602,7 +1697,8 @@
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
-		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
+		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
+		    state == SPA_LOAD_RECOVER)
 			need_update = B_TRUE;
 
 		for (int c = 0; c < rvd->vdev_children; c++)
@@ -1636,6 +1732,7 @@
 
 	error = 0;
 out:
+
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 	if (error && error != EBADF)
 		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
@@ -1645,6 +1742,76 @@
 	return (error);
 }
 
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+{
+	spa_unload(spa);
+	spa_deactivate(spa);
+
+	spa->spa_load_max_txg--;
+
+	spa_activate(spa, spa_mode_global);
+	spa_async_suspend(spa);
+
+	return (spa_load(spa, state, mosconfig));
+}
+
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+    uint64_t max_request, boolean_t extreme)
+{
+	nvlist_t *config = NULL;
+	int load_error, rewind_error;
+	uint64_t safe_rollback_txg;
+	uint64_t min_txg;
+
+	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER)
+		spa->spa_load_max_txg = spa->spa_load_txg;
+	else
+		spa->spa_load_max_txg = max_request;
+
+	load_error = rewind_error = spa_load(spa, state, mosconfig);
+	if (load_error == 0)
+		return (0);
+
+	if (spa->spa_root_vdev != NULL)
+		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+	/* specific txg requested */
+	if (spa->spa_load_max_txg != UINT64_MAX && !extreme) {
+		nvlist_free(config);
+		return (load_error);
+	}
+
+	/* Price of rolling back is discarding txgs, including log */
+	if (state == SPA_LOAD_RECOVER)
+		spa->spa_log_state = SPA_LOG_CLEAR;
+
+	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+	safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE;
+
+	min_txg = extreme ? TXG_INITIAL : safe_rollback_txg;
+	while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) {
+		if (spa->spa_load_max_txg < safe_rollback_txg)
+			spa->spa_extreme_rewind = B_TRUE;
+		rewind_error = spa_load_retry(spa, state, mosconfig);
+	}
+
+	if (config)
+		spa_rewind_data_to_nvlist(spa, config);
+
+	spa->spa_extreme_rewind = B_FALSE;
+	spa->spa_load_max_txg = UINT64_MAX;
+
+	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+		spa_config_set(spa, config);
+
+	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
+}
+
 /*
  * Pool Open/Import
  *
@@ -1658,14 +1825,25 @@
  * ambiguous state.
  */
 static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+    nvlist_t **config)
 {
 	spa_t *spa;
+	boolean_t norewind;
+	boolean_t extreme;
+	zpool_rewind_policy_t policy;
+	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 
 	*spapp = NULL;
 
+	zpool_get_rewind_policy(nvpolicy, &policy);
+	if (policy.zrp_request & ZPOOL_DO_REWIND)
+		state = SPA_LOAD_RECOVER;
+	norewind = (policy.zrp_request == ZPOOL_NO_REWIND);
+	extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0);
+
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
@@ -1682,11 +1860,26 @@
 			mutex_exit(&spa_namespace_lock);
 		return (ENOENT);
 	}
+
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 
 		spa_activate(spa, spa_mode_global);
 
-		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+		if (spa->spa_last_open_failed && norewind) {
+			if (config != NULL && spa->spa_config)
+				VERIFY(nvlist_dup(spa->spa_config,
+				    config, KM_SLEEP) == 0);
+			spa_deactivate(spa);
+			if (locked)
+				mutex_exit(&spa_namespace_lock);
+			return (spa->spa_last_open_failed);
+		}
+
+		if (state != SPA_LOAD_RECOVER)
+			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+		    extreme);
 
 		if (error == EBADF) {
 			/*
@@ -1711,38 +1904,49 @@
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
-			if (config != NULL && spa->spa_root_vdev != NULL)
-				*config = spa_config_generate(spa, NULL, -1ULL,
-				    B_TRUE);
+			if (config != NULL && spa->spa_config)
+				VERIFY(nvlist_dup(spa->spa_config, config,
+				    KM_SLEEP) == 0);
 			spa_unload(spa);
 			spa_deactivate(spa);
-			spa->spa_last_open_failed = B_TRUE;
+			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
-		} else {
-			spa->spa_last_open_failed = B_FALSE;
 		}
+
 	}
 
 	spa_open_ref(spa, tag);
 
+	spa->spa_last_open_failed = 0;
+
+	if (config != NULL)
+		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+	spa->spa_last_ubsync_txg = 0;
+	spa->spa_load_txg = 0;
+
 	if (locked)
 		mutex_exit(&spa_namespace_lock);
 
 	*spapp = spa;
 
-	if (config != NULL)
-		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-
 	return (0);
 }
 
 int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+    nvlist_t **config)
+{
+	return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
-	return (spa_open_common(name, spapp, tag, NULL));
+	return (spa_open_common(name, spapp, tag, NULL, NULL));
 }
 
 /*
@@ -1883,7 +2087,7 @@
 	spa_t *spa;
 
 	*config = NULL;
-	error = spa_open_common(name, &spa, FTAG, config);
+	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
@@ -2143,7 +2347,7 @@
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
+	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	spa->spa_uberblock.ub_txg = txg - 1;
@@ -2450,7 +2654,7 @@
 		spa_remove(spa);
 	}
 
-	spa = spa_add(pname, NULL);
+	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
 	spa->spa_load_verbatim = B_TRUE;
 
@@ -2529,6 +2733,7 @@
 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
 {
 	spa_t *spa;
+	zpool_rewind_policy_t policy;
 	char *altroot = NULL;
 
 	mutex_enter(&spa_namespace_lock);
@@ -2539,12 +2744,13 @@
 
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
+	spa = spa_add(pool, config, altroot);
+
+	zpool_get_rewind_policy(config, &policy);
+	spa->spa_load_max_txg = policy.zrp_txg;
 
 	spa->spa_load_verbatim = B_TRUE;
 
-	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
-
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
@@ -2564,6 +2770,8 @@
 {
 	spa_t *spa;
 	char *altroot = NULL;
+	spa_load_state_t state = SPA_LOAD_IMPORT;
+	zpool_rewind_policy_t policy;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
@@ -2578,12 +2786,16 @@
 		return (EEXIST);
 	}
 
+	zpool_get_rewind_policy(config, &policy);
+	if (policy.zrp_request & ZPOOL_DO_REWIND)
+		state = SPA_LOAD_RECOVER;
+
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
+	spa = spa_add(pool, config, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	/*
@@ -2596,7 +2808,16 @@
 	 * because the user-supplied config is actually the one to trust when
 	 * doing an import.
 	 */
-	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+	if (state != SPA_LOAD_RECOVER)
+		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
+	    ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0));
+
+	/*
+	 * Propagate anything learned about failing or best txgs
+	 * back to the caller.
+	 */
+	spa_rewind_data_to_nvlist(spa, config);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
@@ -2726,7 +2947,7 @@
 	 * Create and initialize the spa structure.
 	 */
 	mutex_enter(&spa_namespace_lock);
-	spa = spa_add(TRYIMPORT_NAME, NULL);
+	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, FREAD);
 
 	/*
@@ -2734,7 +2955,7 @@
 	 * Pass TRUE for mosconfig because the user-supplied config
 	 * is actually the one to trust when doing an import.
 	 */
-	error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+	error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
@@ -4531,6 +4752,8 @@
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+	spa_handle_ignored_writes(spa);
+
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
--- a/usr/src/uts/common/fs/zfs/spa_config.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_config.c	Fri Oct 30 18:47:17 2009 -0600
@@ -75,7 +75,6 @@
 	void *buf = NULL;
 	nvlist_t *nvlist, *child;
 	nvpair_t *nvpair;
-	spa_t *spa;
 	char *pathname;
 	struct _buf *file;
 	uint64_t fsize;
@@ -119,7 +118,6 @@
 	mutex_enter(&spa_namespace_lock);
 	nvpair = NULL;
 	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-
 		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
 			continue;
 
@@ -127,13 +125,7 @@
 
 		if (spa_lookup(nvpair_name(nvpair)) != NULL)
 			continue;
-		spa = spa_add(nvpair_name(nvpair), NULL);
-
-		/*
-		 * We blindly duplicate the configuration here.  If it's
-		 * invalid, we will catch it when the pool is first opened.
-		 */
-		VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+		(void) spa_add(nvpair_name(nvpair), child, NULL);
 	}
 	mutex_exit(&spa_namespace_lock);
 
@@ -313,6 +305,24 @@
 	mutex_exit(&spa->spa_props_lock);
 }
 
+/* Add discovered rewind info, if any, to the provided nvlist */
+void
+spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl)
+{
+	int64_t loss = 0;
+
+	if (tonvl == NULL || spa->spa_load_txg == 0)
+		return;
+
+	VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME,
+	    spa->spa_load_txg_ts) == 0);
+	if (spa->spa_last_ubsync_txg)
+		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+	VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+	VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+	    spa->spa_load_data_errors) == 0);
+}
+
 /*
  * Generate the pool's configuration based on the current in-core state.
  * We infer whether to generate a complete config or just one top-level config
@@ -394,6 +404,8 @@
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 	nvlist_free(nvroot);
 
+	spa_rewind_data_to_nvlist(spa, config);
+
 	if (locked)
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Oct 30 18:47:17 2009 -0600
@@ -420,7 +420,7 @@
  * exist by calling spa_lookup() first.
  */
 spa_t *
-spa_add(const char *name, const char *altroot)
+spa_add(const char *name, nvlist_t *config, const char *altroot)
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
@@ -445,6 +445,7 @@
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 	spa->spa_freeze_txg = UINT64_MAX;
 	spa->spa_final_txg = UINT64_MAX;
+	spa->spa_load_max_txg = UINT64_MAX;
 
 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
@@ -471,6 +472,9 @@
 	dp->scd_path = spa_strdup(spa_config_path);
 	list_insert_head(&spa->spa_config_list, dp);
 
+	if (config != NULL)
+		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+
 	return (spa);
 }
 
@@ -818,7 +822,7 @@
 void
 spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
 {
-	vdev_space_update(vd, space, alloc, B_FALSE);
+	vdev_space_update(vd, space, alloc, 0, B_FALSE);
 }
 
 /*
@@ -1292,6 +1296,15 @@
 		return (spa->spa_root_vdev->vdev_stat.vs_space);
 }
 
+/*
+ * Return the amount of space deferred from freeing (in in-core maps only)
+ */
+uint64_t
+spa_get_defers(spa_t *spa)
+{
+	return (spa->spa_root_vdev->vdev_stat.vs_defer);
+}
+
 /* ARGSUSED */
 uint64_t
 spa_get_asize(spa_t *spa, uint64_t lsize)
--- a/usr/src/uts/common/fs/zfs/space_map.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/space_map.c	Fri Oct 30 18:47:17 2009 -0600
@@ -276,11 +276,8 @@
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	space_map_load_wait(sm);
-
-	if (sm->sm_loaded)
-		return (0);
+	ASSERT(!sm->sm_loaded);
+	ASSERT(!sm->sm_loading);
 
 	sm->sm_loading = B_TRUE;
 	end = smo->smo_objsize;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Fri Oct 30 18:47:17 2009 -0600
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -48,7 +48,7 @@
 
 int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
     int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Fri Oct 30 18:47:17 2009 -0600
@@ -127,6 +127,7 @@
 void dsl_pool_zil_clean(dsl_pool_t *dp);
 int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
 int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
 void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_memory_pressure(dsl_pool_t *dp);
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Fri Oct 30 18:47:17 2009 -0600
@@ -67,7 +67,9 @@
 	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
 	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
 	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
+	space_map_t	ms_defermap[TXG_DEFER_SIZE]; /* deferred frees	*/
 	space_map_t	ms_map;		/* in-core free space map	*/
+	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
 	metaslab_group_t *ms_group;	/* metaslab group		*/
 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Oct 30 18:47:17 2009 -0600
@@ -320,6 +320,8 @@
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+    nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config,
     char *altroot, size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
@@ -402,7 +404,7 @@
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
@@ -457,6 +459,7 @@
 extern uint64_t spa_get_alloc(spa_t *spa);
 extern uint64_t spa_get_space(spa_t *spa);
 extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_defers(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_version(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
@@ -481,6 +484,8 @@
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
+extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to);
+
 extern int spa_mode(spa_t *spa);
 extern uint64_t strtonum(const char *str, char **nptr);
 
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Fri Oct 30 18:47:17 2009 -0600
@@ -113,6 +113,7 @@
 	uint64_t	spa_first_txg;		/* first txg after spa_open() */
 	uint64_t	spa_final_txg;		/* txg of export/destroy */
 	uint64_t	spa_freeze_txg;		/* freeze pool at this txg */
+	uint64_t	spa_load_max_txg;	/* best initial ub_txg */
 	objset_t	*spa_meta_objset;	/* copy of dp->dp_meta_objset */
 	txg_list_t	spa_vdev_txg_list;	/* per-txg dirty vdev list */
 	vdev_t		*spa_root_vdev;		/* top-level vdev container */
@@ -128,6 +129,7 @@
 	bplist_t	spa_sync_bplist;	/* deferred-free bplist */
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
+	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
 	uint64_t	spa_scrub_maxinflight;	/* max in-flight scrub I/Os */
@@ -145,7 +147,15 @@
 	uint16_t	spa_async_tasks;	/* async task mask */
 	char		*spa_root;		/* alternate root directory */
 	uint64_t	spa_ena;		/* spa-wide ereport ENA */
-	boolean_t	spa_last_open_failed;	/* true if last open faled */
+	int		spa_last_open_failed;	/* error if last open failed */
+	nvlist_t	*spa_failed_open_cfg;	/* cached config nvlist */
+	uint64_t	spa_last_ubsync_txg;	/* "best" uberblock txg */
+	uint64_t	spa_last_ubsync_txg_ts;	/* timestamp from that ub */
+	uint64_t	spa_load_txg;		/* ub txg that loaded */
+	uint64_t	spa_load_txg_ts;	/* timestamp from that ub */
+	uint64_t	spa_load_meta_errors;	/* verify metadata err count */
+	uint64_t	spa_load_data_errors;	/* verify data err count */
+	uint64_t	spa_verify_min_txg;	/* start txg of verify scrub */
 	kmutex_t	spa_errlog_lock;	/* error log lock */
 	uint64_t	spa_errlog_last;	/* last error log object */
 	uint64_t	spa_errlog_scrub;	/* scrub error log object */
--- a/usr/src/uts/common/fs/zfs/sys/txg.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h	Fri Oct 30 18:47:17 2009 -0600
@@ -39,6 +39,9 @@
 #define	TXG_INITIAL		TXG_SIZE	/* initial txg 		*/
 #define	TXG_IDX			(txg & TXG_MASK)
 
+/* Number of txgs worth of frees we defer adding to in-core spacemaps */
+#define	TXG_DEFER_SIZE		2
+
 #define	TXG_WAIT		1ULL
 #define	TXG_NOWAIT		2ULL
 
--- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h	Fri Oct 30 18:47:17 2009 -0600
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -33,11 +33,6 @@
 #endif
 
 /*
- * For zdb use and debugging purposes only
- */
-extern uint64_t ub_max_txg;
-
-/*
  * The uberblock version is incremented whenever an incompatible on-disk
  * format change is made to the SPA, DMU, or ZAP.
  *
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Fri Oct 30 18:47:17 2009 -0600
@@ -85,7 +85,7 @@
     vdev_aux_t aux);
 
 extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
-    int64_t alloc_delta, boolean_t update_root);
+    int64_t alloc_delta, int64_t defer_delta, boolean_t update_root);
 
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Fri Oct 30 18:47:17 2009 -0600
@@ -119,7 +119,8 @@
 	uint32_t	zi_failfast;
 	char		zi_func[MAXNAMELEN];
 	uint32_t	zi_iotype;
-	uint32_t	zi_pad;		/* 64-bit alignment */
+	int32_t		zi_duration;
+	uint64_t	zi_timer;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Oct 30 18:47:17 2009 -0600
@@ -481,6 +481,7 @@
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
 
 /*
  * Checksum ereport functions
@@ -498,6 +499,9 @@
     struct zio *zio, uint64_t offset, uint64_t length,
     const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
 
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/txg.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/txg.c	Fri Oct 30 18:47:17 2009 -0600
@@ -188,7 +188,11 @@
 	 * Finish off any work in progress.
 	 */
 	ASSERT(tx->tx_threads == 2);
-	txg_wait_synced(dp, 0);
+
+	/*
+	 * We need to ensure that we've vacated the deferred space_maps.
+	 */
+	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
 
 	/*
 	 * Wake all sync threads and wait for them to die.
--- a/usr/src/uts/common/fs/zfs/vdev.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Fri Oct 30 18:47:17 2009 -0600
@@ -2564,7 +2564,7 @@
  */
 void
 vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
-    boolean_t update_root)
+    int64_t defer_delta, boolean_t update_root)
 {
 	int64_t dspace_delta = space_delta;
 	spa_t *spa = vd->vdev_spa;
@@ -2587,6 +2587,7 @@
 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_dspace += dspace_delta;
+	vd->vdev_stat.vs_defer += defer_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (update_root) {
@@ -2604,6 +2605,7 @@
 		rvd->vdev_stat.vs_space += space_delta;
 		rvd->vdev_stat.vs_alloc += alloc_delta;
 		rvd->vdev_stat.vs_dspace += dspace_delta;
+		rvd->vdev_stat.vs_defer += defer_delta;
 		mutex_exit(&rvd->vdev_stat_lock);
 	}
 }
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Fri Oct 30 18:47:17 2009 -0600
@@ -779,11 +779,6 @@
  */
 
 /*
- * For use by zdb and debugging purposes only
- */
-uint64_t ub_max_txg = UINT64_MAX;
-
-/*
  * Consider the following situation: txg is safely synced to disk.  We've
  * written the first uberblock for txg + 1, and then we lose power.  When we
  * come back up, we fail to see the uberblock for txg + 1 because, say,
@@ -812,6 +807,7 @@
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
+	spa_t *spa = zio->io_spa;
 	zio_t *rio = zio->io_private;
 	uberblock_t *ub = zio->io_data;
 	uberblock_t *ubbest = rio->io_private;
@@ -820,7 +816,7 @@
 
 	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 		mutex_enter(&rio->io_lock);
-		if (ub->ub_txg <= ub_max_txg &&
+		if (ub->ub_txg <= spa->spa_load_max_txg &&
 		    vdev_uberblock_compare(ub, ubbest) > 0)
 			*ubbest = *ub;
 		mutex_exit(&rio->io_lock);
--- a/usr/src/uts/common/fs/zfs/zfs_fm.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c	Fri Oct 30 18:47:17 2009 -0600
@@ -109,9 +109,11 @@
 	char class[64];
 
 	/*
-	 * If we are doing a spa_tryimport(), ignore errors.
+	 * If we are doing a spa_tryimport() or in recovery mode,
+	 * ignore errors.
 	 */
-	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT ||
+	    spa->spa_load_state == SPA_LOAD_RECOVER)
 		return;
 
 	/*
@@ -340,6 +342,7 @@
 		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
 		    DATA_TYPE_UINT64, stateoroffset, NULL);
 	}
+
 	mutex_exit(&spa->spa_errlist_lock);
 
 	*ereport_out = ereport;
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Fri Oct 30 18:47:17 2009 -0600
@@ -950,9 +950,9 @@
 static int
 zfs_ioc_pool_import(zfs_cmd_t *zc)
 {
-	int error;
 	nvlist_t *config, *props = NULL;
 	uint64_t guid;
+	int error;
 
 	if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config)) != 0)
@@ -969,11 +969,13 @@
 	    guid != zc->zc_guid)
 		error = EINVAL;
 	else if (zc->zc_cookie)
-		error = spa_import_verbatim(zc->zc_name, config,
-		    props);
+		error = spa_import_verbatim(zc->zc_name, config, props);
 	else
 		error = spa_import(zc->zc_name, config, props);
 
+	if (zc->zc_nvlist_dst != 0)
+		(void) put_nvlist(zc, config);
+
 	nvlist_free(config);
 
 	if (props)
@@ -2980,9 +2982,31 @@
 		/* we need to let spa_open/spa_load clear the chains */
 		spa->spa_log_state = SPA_LOG_CLEAR;
 	}
+	spa->spa_last_open_failed = 0;
 	mutex_exit(&spa_namespace_lock);
 
-	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+	if (zc->zc_cookie == ZPOOL_NO_REWIND) {
+		error = spa_open(zc->zc_name, &spa, FTAG);
+	} else {
+		nvlist_t *policy;
+		nvlist_t *config = NULL;
+
+		if (zc->zc_nvlist_src == NULL)
+			return (EINVAL);
+
+		if ((error = get_nvlist(zc->zc_nvlist_src,
+		    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+			error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+			    policy, &config);
+			if (config != NULL) {
+				(void) put_nvlist(zc, config);
+				nvlist_free(config);
+			}
+			nvlist_free(policy);
+		}
+	}
+
+	if (error)
 		return (error);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
--- a/usr/src/uts/common/fs/zfs/zil.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zil.c	Fri Oct 30 18:47:17 2009 -0600
@@ -519,6 +519,8 @@
 			zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
 		BP_ZERO(&zh->zh_log);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
+		dmu_objset_rele(os, FTAG);
+		return (0);
 	}
 
 	/*
--- a/usr/src/uts/common/fs/zfs/zio.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Fri Oct 30 18:47:17 2009 -0600
@@ -927,6 +927,10 @@
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 	}
 
+	if (zio_injection_enabled &&
+	    zio->io_spa->spa_syncing_txg == zio->io_txg)
+		zio_handle_ignored_writes(zio);
+
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
--- a/usr/src/uts/common/fs/zfs/zio_inject.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c	Fri Oct 30 18:47:17 2009 -0600
@@ -152,7 +152,8 @@
 
 		/* Ignore device errors and panic injection */
 		if (handler->zi_record.zi_guid != 0 ||
-		    handler->zi_record.zi_func[0] != '\0')
+		    handler->zi_record.zi_func[0] != '\0' ||
+		    handler->zi_record.zi_duration != 0)
 			continue;
 
 		/* If this handler matches, return EIO */
@@ -197,7 +198,8 @@
 
 		/* Ignore device only faults or panic injection */
 		if (handler->zi_record.zi_start == 0 ||
-		    handler->zi_record.zi_func[0] != '\0')
+		    handler->zi_record.zi_func[0] != '\0' ||
+		    handler->zi_record.zi_duration != 0)
 			continue;
 
 		/*
@@ -243,9 +245,13 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore label specific faults or panic injection */
+		/*
+		 * Ignore label specific faults, panic injection
+		 * or fake writes
+		 */
 		if (handler->zi_record.zi_start != 0 ||
-		    handler->zi_record.zi_func[0] != '\0')
+		    handler->zi_record.zi_func[0] != '\0' ||
+		    handler->zi_record.zi_duration != 0)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -285,6 +291,80 @@
 }
 
 /*
+ * Simulate hardware that ignores cache flushes.  For requested number
+ * of seconds nix the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+	inject_handler_t *handler;
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+
+		/* Ignore errors not destined for this pool */
+		if (zio->io_spa != handler->zi_spa)
+			continue;
+
+		if (handler->zi_record.zi_duration == 0)
+			continue;
+
+		/*
+		 * Positive duration implies # of seconds, negative
+		 * a number of txgs
+		 */
+		if (handler->zi_record.zi_timer == 0) {
+			if (handler->zi_record.zi_duration > 0)
+				handler->zi_record.zi_timer = lbolt64;
+			else
+				handler->zi_record.zi_timer = zio->io_txg;
+		}
+		zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+		break;
+	}
+
+	rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+	inject_handler_t *handler;
+
+	if (zio_injection_enabled == 0)
+		return;
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+
+		/* Ignore errors not destined for this pool */
+		if (spa != handler->zi_spa)
+			continue;
+
+		if (handler->zi_record.zi_duration == 0)
+			continue;
+
+		if (handler->zi_record.zi_duration > 0) {
+			VERIFY(handler->zi_record.zi_timer == 0 ||
+			    handler->zi_record.zi_timer +
+			    handler->zi_record.zi_duration * hz > lbolt64);
+		} else {
+			/* duration is negative so the subtraction here adds */
+			VERIFY(handler->zi_record.zi_timer == 0 ||
+			    handler->zi_record.zi_timer -
+			    handler->zi_record.zi_duration >=
+			    spa->spa_syncing_txg);
+		}
+	}
+
+	rw_exit(&inject_lock);
+}
+
+/*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
  * which is the switch to trigger all fault injection.
--- a/usr/src/uts/common/sys/fs/zfs.h	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Fri Oct 30 18:47:17 2009 -0600
@@ -367,6 +367,20 @@
 #define	ZPL_VERSION_SYSATTR		ZPL_VERSION_3
 #define	ZPL_VERSION_USERSPACE		ZPL_VERSION_4
 
+/* Rewind request information */
+#define	ZPOOL_NO_REWIND		0
+#define	ZPOOL_TRY_REWIND	1 /* Search for best txg, but do not rewind */
+#define	ZPOOL_DO_REWIND		2 /* Rewind to best txg w/in deferred frees */
+#define	ZPOOL_EXTREME_REWIND	4 /* Allow extreme measures to find best txg */
+#define	ZPOOL_REWIND_MASK	7 /* All the possible policy bits */
+
+typedef struct zpool_rewind_policy {
+	uint32_t	zrp_request;	/* rewind behavior requested */
+	uint32_t	zrp_maxmeta;	/* max acceptable meta-data errors */
+	uint32_t	zrp_maxdata;	/* max acceptable data errors */
+	uint64_t	zrp_txg;	/* specific txg to load */
+} zpool_rewind_policy_t;
+
 /*
  * The following are configuration names used in the nvlist describing a pool's
  * configuration.
@@ -421,6 +435,18 @@
 #define	ZPOOL_CONFIG_FRU		"fru"
 #define	ZPOOL_CONFIG_AUX_STATE		"aux_state"
 
+/* Rewind policy parameters */
+#define	ZPOOL_REWIND_POLICY		"rewind-policy"
+#define	ZPOOL_REWIND_REQUEST		"rewind-request"
+#define	ZPOOL_REWIND_REQUEST_TXG	"rewind-request-txg"
+#define	ZPOOL_REWIND_META_THRESH	"rewind-meta-thresh"
+#define	ZPOOL_REWIND_DATA_THRESH	"rewind-data-thresh"
+
+/* Rewind data discovered */
+#define	ZPOOL_CONFIG_LOAD_TIME		"rewind_txg_ts"
+#define	ZPOOL_CONFIG_LOAD_DATA_ERRORS	"verify_data_errors"
+#define	ZPOOL_CONFIG_REWIND_TIME	"seconds_of_rewind"
+
 #define	VDEV_TYPE_ROOT			"root"
 #define	VDEV_TYPE_MIRROR		"mirror"
 #define	VDEV_TYPE_REPLACING		"replacing"
@@ -533,6 +559,7 @@
 	uint64_t	vs_alloc;		/* space allocated	*/
 	uint64_t	vs_space;		/* total capacity	*/
 	uint64_t	vs_dspace;		/* deflated capacity	*/
+	uint64_t	vs_defer;		/* in-core deferred	*/
 	uint64_t	vs_rsize;		/* replaceable dev size */
 	uint64_t	vs_ops[ZIO_TYPES];	/* operation count	*/
 	uint64_t	vs_bytes[ZIO_TYPES];	/* bytes read/written	*/
@@ -631,7 +658,8 @@
 	SPA_LOAD_NONE,		/* no load in progress */
 	SPA_LOAD_OPEN,		/* normal open */
 	SPA_LOAD_IMPORT,	/* import in progress */
-	SPA_LOAD_TRYIMPORT	/* tryimport in progress */
+	SPA_LOAD_TRYIMPORT,	/* tryimport in progress */
+	SPA_LOAD_RECOVER	/* recovery requested */
 } spa_load_state_t;
 
 /*