diff usr/src/uts/common/fs/zfs/spa.c @ 1544:938876158511

PSARC 2006/077 zpool clear
PSARC 2006/139 FMA for ZFS
6284889 arc should replace the znode cache
6333006 DMU & DSL should not panic upon I/O error
6333092 concurrent reads to a file not scaling with number of readers
6338081 ZFS/FMA phase 1
6338386 need persistent error log
6341326 i/o error causes arc buf hash table corruption
6341639 zfs backup/restore should compute/verify checksum of backup stream
6348002 out of space due to changing properties
6354724 inaccurate error message from zfs restore
6354872 dmu_sync() blows predictive accounting
6355416 zpool scrubbing consumes all memory, system hung
6363995 df should only load libzfs when it encounters a ZFS filesystem
6366320 zfs backup/restore doesn't like signals
6368892 mount -m support needed for legacy mounts
6368902 boot archive fstat support needed for ZFS Mountroot
6369424 BFU complains when bfu'ing a ZFS root filesystem
6374062 mountroot support needed for ZFS
6376356 dirtying dbuf obj=43 lvl=0 blkid=0 but not tx_held
6378391 unused members of dmu_objset_stats_t
6378392 clean up zfs_cmd_t structure
6378685 buf_init should allocate its hash table more carefully
6378976 ziltest should be a first class citizen
6381086 zdb segfaults if there is a spa deferred-free bplist
6381203 deadlock due to i/o while assigning (tc_lock held)
6381209 freed space is not immediately available
6381344 'zpool clear'
6381345 FAULTED devices should really be UNAVAIL
6381346 import should mark devices as persistently unavailable
6383272 recursive mutex_enter() during log replay with zfs root
6386326 origin property is not displayed
6386354 libzfs does too much in its _init section, calls exit(1)
6386624 zpool should not complain about non-existent devices from libdiskmgt
6386910 spa needs to be i/o error hardened
6387735 need a mechanism to inject faults into ZFS
6387736 internal ZFS utilities should be placed in an ON-private package
6389928 libzfs should ship a lint library
6390609 malformed vdev config panics on zpool_create()
6390677 version number checking makes upgrades challenging
6390713 ztest hangs in zil_suspend()
6391873 metadata compression should be turned back on
6392113 ztest sometimes reports leaked blocks because ZIL isn't resilvered
6393004 minor memory leak in unique_insert()
author eschrock
date Fri, 03 Mar 2006 20:08:16 -0800
parents 81359ee1ee63
children 4ad213e858a9
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Mar 03 17:59:43 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Fri Mar 03 20:08:16 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -33,6 +32,7 @@
  */
 
 #include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
@@ -62,6 +62,44 @@
  * ==========================================================================
  */
 
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+	spa_error_entry_t *sa = (spa_error_entry_t *)a;
+	spa_error_entry_t *sb = (spa_error_entry_t *)b;
+	int ret;
+
+	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
+	    sizeof (zbookmark_t));
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+	avl_create(&spa->spa_errlist_scrub,
+	    spa_error_entry_compare, sizeof (spa_error_entry_t),
+	    offsetof(spa_error_entry_t, se_avl));
+	avl_create(&spa->spa_errlist_last,
+	    spa_error_entry_compare, sizeof (spa_error_entry_t),
+	    offsetof(spa_error_entry_t, se_avl));
+}
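
The swap-and-reinitialize pattern above hands the accumulated error entries to the caller while new errors keep flowing into freshly created trees. A minimal consumer sketch, using a hypothetical example_drain_errlists() for illustration (the real consumers, spa_errlog_sync() and spa_errlog_drain(), live in spa_errlog.c and are only referenced from this file):

static void
example_drain_errlists(spa_t *spa)	/* hypothetical, illustration only */
{
	avl_tree_t last, scrub;
	spa_error_entry_t *se;
	void *cookie;

	/* Take ownership of both trees; new errors go into fresh trees. */
	mutex_enter(&spa->spa_errlist_lock);
	spa_get_errlists(spa, &last, &scrub);
	mutex_exit(&spa->spa_errlist_lock);

	/* Free every entry, then the (now empty) trees themselves. */
	cookie = NULL;
	while ((se = avl_destroy_nodes(&last, &cookie)) != NULL)
		kmem_free(se, sizeof (spa_error_entry_t));
	avl_destroy(&last);

	cookie = NULL;
	while ((se = avl_destroy_nodes(&scrub, &cookie)) != NULL)
		kmem_free(se, sizeof (spa_error_entry_t));
	avl_destroy(&scrub);
}
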
+
 /*
  * Activate an uninitialized pool.
  */
@@ -76,9 +114,6 @@
 
 	spa->spa_normal_class = metaslab_class_create();
 
-	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
-	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
-
 	for (t = 0; t < ZIO_TYPES; t++) {
 		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
 		    8, maxclsyspri, 50, INT_MAX,
@@ -95,6 +130,13 @@
 
 	txg_list_create(&spa->spa_vdev_txg_list,
 	    offsetof(struct vdev, vdev_txg_node));
+
+	avl_create(&spa->spa_errlist_scrub,
+	    spa_error_entry_compare, sizeof (spa_error_entry_t),
+	    offsetof(spa_error_entry_t, se_avl));
+	avl_create(&spa->spa_errlist_last,
+	    spa_error_entry_compare, sizeof (spa_error_entry_t),
+	    offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
@@ -124,12 +166,18 @@
 		spa->spa_zio_intr_taskq[t] = NULL;
 	}
 
-	taskq_destroy(spa->spa_vdev_retry_taskq);
-	spa->spa_vdev_retry_taskq = NULL;
-
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
+	/*
+	 * If this was part of an import or the open otherwise failed, we may
+	 * still have errors left in the queues.  Empty them just in case.
+	 */
+	spa_errlog_drain(spa);
+
+	avl_destroy(&spa->spa_errlist_scrub);
+	avl_destroy(&spa->spa_errlist_last);
+
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 }
 
@@ -175,6 +223,11 @@
 spa_unload(spa_t *spa)
 {
 	/*
+	 * Stop async tasks.
+	 */
+	spa_async_suspend(spa);
+
+	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
@@ -185,8 +238,8 @@
 	/*
 	 * Wait for any outstanding prefetch I/O to complete.
 	 */
-	spa_config_enter(spa, RW_WRITER);
-	spa_config_exit(spa);
+	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Close the dsl pool.
@@ -203,16 +256,16 @@
 		vdev_free(spa->spa_root_vdev);
 		spa->spa_root_vdev = NULL;
 	}
+
+	spa->spa_async_suspended = 0;
 }
 
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.  The 'readonly' flag will prevent us
- * from writing any updated state to disk, and can be use when testing a pool
- * for import.
+ * source of configuration information.
  */
 static int
-spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
+spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 {
 	int error = 0;
 	nvlist_t *nvroot = NULL;
@@ -221,25 +274,34 @@
 	uint64_t pool_guid;
 	zio_t *zio;
 
+	spa->spa_load_state = state;
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
-	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
-		return (EINVAL);
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+		error = EINVAL;
+		goto out;
+	}
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
-	if (import && spa_guid_exists(pool_guid, 0))
-		return (EEXIST);
+	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+	    spa_guid_exists(pool_guid, 0)) {
+		error = EEXIST;
+		goto out;
+	}
 
 	/*
 	 * Parse the configuration into a vdev tree.
 	 */
-	spa_config_enter(spa, RW_WRITER);
+	spa_config_enter(spa, RW_WRITER, FTAG);
 	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
-	spa_config_exit(spa);
+	spa_config_exit(spa, FTAG);
 
-	if (rvd == NULL)
-		return (EINVAL);
+	if (rvd == NULL) {
+		error = EINVAL;
+		goto out;
+	}
 
 	spa->spa_root_vdev = rvd;
 	ASSERT(spa_guid(spa) == pool_guid);
@@ -247,8 +309,10 @@
 	/*
 	 * Try to open all vdevs, loading each label in the process.
 	 */
-	if (vdev_open(rvd) != 0)
-		return (ENXIO);
+	if (vdev_open(rvd) != 0) {
+		error = ENXIO;
+		goto out;
+	}
 
 	/*
 	 * Find the best uberblock.
@@ -264,8 +328,16 @@
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
-		dprintf("ub_txg is zero\n");
-		return (ENXIO);
+		error = ENXIO;
+		goto out;
+	}
+
+	/*
+	 * If the pool is newer than the code, we can't open it.
+	 */
+	if (ub->ub_version > UBERBLOCK_VERSION) {
+		error = ENOTSUP;
+		goto out;
 	}
 
 	/*
@@ -273,11 +345,10 @@
 	 * incomplete configuration.
 	 */
 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
-		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
-		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
-		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
-		    rvd->vdev_guid_sum, ub->ub_guid_sum);
-		return (ENXIO);
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_BAD_GUID_SUM);
+		error = ENXIO;
+		goto out;
 	}
 
 	/*
@@ -286,12 +357,22 @@
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
-	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
+	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+	if (error) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		goto out;
+	}
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 
-	VERIFY(zap_lookup(spa->spa_meta_objset,
+	if (zap_lookup(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
-	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);
+	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
 
 	if (!mosconfig) {
 		dmu_buf_t *db;
@@ -299,21 +380,24 @@
 		size_t nvsize = 0;
 		nvlist_t *newconfig = NULL;
 
-		db = dmu_bonus_hold(spa->spa_meta_objset,
-		    spa->spa_config_object);
-		dmu_buf_read(db);
+		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
+		    spa->spa_config_object, FTAG, &db));
 		nvsize = *(uint64_t *)db->db_data;
-		dmu_buf_rele(db);
+		dmu_buf_rele(db, FTAG);
 
 		packed = kmem_alloc(nvsize, KM_SLEEP);
-		error = dmu_read_canfail(spa->spa_meta_objset,
+		error = dmu_read(spa->spa_meta_objset,
 		    spa->spa_config_object, 0, nvsize, packed);
 		if (error == 0)
 			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
 		kmem_free(packed, nvsize);
 
-		if (error)
-			return (ENXIO);
+		if (error) {
+			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			error = EIO;
+			goto out;
+		}
 
 		spa_config_set(spa, newconfig);
 
@@ -321,39 +405,76 @@
 		spa_deactivate(spa);
 		spa_activate(spa);
 
-		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
+		return (spa_load(spa, newconfig, state, B_TRUE));
+	}
+
+	if (zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
 	}
 
-	VERIFY(zap_lookup(spa->spa_meta_objset,
-	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);
+	/*
+	 * Load the persistent error log.  If we have an older pool, this will
+	 * not be present.
+	 */
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
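
Both lookups above follow the same hardening pattern: ENOENT means the pool simply predates the persistent error log and is not an error, while any other failure is treated as MOS corruption. Condensed into a hypothetical helper for illustration (not part of the changeset):

static int
example_lookup_optional(objset_t *mos, const char *name, uint64_t *objp)
{
	int error;

	error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, objp);

	if (error == ENOENT)
		return (0);	/* older pool: object not present, *objp unset */

	return (error);		/* 0 on success; anything else is EIO-worthy */
}
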
 
 	/*
-	 * Load the vdev state for all top level vdevs.
+	 * Load the vdev state for all top level vdevs.  We need to grab the
+	 * config lock because all label I/O is done with the
+	 * ZIO_FLAG_CONFIG_HELD flag.
 	 */
-	if ((error = vdev_load(rvd, import)) != 0)
-		return (error);
+	spa_config_enter(spa, RW_READER, FTAG);
+	if ((error = vdev_load(rvd)) != 0) {
+		spa_config_exit(spa, FTAG);
+		goto out;
+	}
+	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
-	spa_config_enter(spa, RW_WRITER);
+	spa_config_enter(spa, RW_WRITER, FTAG);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
-	spa_config_exit(spa);
+	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Check the state of the root vdev.  If it can't be opened, it
 	 * indicates one or more toplevel vdevs are faulted.
 	 */
-	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-		return (ENXIO);
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+		error = ENXIO;
+		goto out;
+	}
 
 	/*
 	 * Claim log blocks that haven't been committed yet, and update all
 	 * top-level vdevs to sync any config changes found in vdev_load().
 	 * This must all happen in a single txg.
 	 */
-	if ((spa_mode & FWRITE) && !readonly) {
+	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -369,7 +490,14 @@
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 	}
 
-	return (0);
+	error = 0;
+out:
+	if (error)
+		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+	spa->spa_load_state = SPA_LOAD_NONE;
+	spa->spa_ena = 0;
+
+	return (error);
 }
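
The old boolean pair (readonly, import) is folded into a single spa_load_state_t; the call-site updates later in this diff map onto it as follows (a summary of this changeset's own call sites):

/*
 *	old spa_load(..., readonly, import, ...)	new state argument
 *	----------------------------------------	------------------
 *	B_FALSE, B_FALSE   (normal open)		SPA_LOAD_OPEN
 *	B_FALSE, B_TRUE    (import)			SPA_LOAD_IMPORT
 *	B_TRUE,  B_TRUE    (test an import)		SPA_LOAD_TRYIMPORT
 *
 * SPA_LOAD_TRYIMPORT keeps the old "readonly" behavior: the log-claim
 * and config-sync work at the bottom of spa_load() is skipped.
 */
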
 
 /*
@@ -415,7 +543,7 @@
 		spa_activate(spa);
 
 		error = spa_load(spa, spa->spa_config,
-		    B_FALSE, B_FALSE, B_FALSE);
+		    SPA_LOAD_OPEN, B_FALSE);
 
 		if (error == EBADF) {
 			/*
@@ -432,7 +560,9 @@
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (ENOENT);
-		} if (error) {
+		}
+
+		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
@@ -443,10 +573,14 @@
 				    B_TRUE);
 			spa_unload(spa);
 			spa_deactivate(spa);
+			spa->spa_last_open_failed = B_TRUE;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
+		} else {
+			zfs_post_ok(spa, NULL);
+			spa->spa_last_open_failed = B_FALSE;
 		}
 
 		loaded = B_TRUE;
@@ -459,9 +593,9 @@
 	*spapp = spa;
 
 	if (config != NULL) {
-		spa_config_enter(spa, RW_READER);
+		spa_config_enter(spa, RW_READER, FTAG);
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-		spa_config_exit(spa);
+		spa_config_exit(spa, FTAG);
 	}
 
 	/*
@@ -479,8 +613,36 @@
 	return (spa_open_common(name, spapp, tag, NULL));
 }
 
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+	spa_t *spa;
+
+	mutex_enter(&spa_namespace_lock);
+	if ((spa = spa_lookup(name)) == NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (NULL);
+	}
+	spa->spa_inject_ref++;
+	mutex_exit(&spa_namespace_lock);
+
+	return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+	mutex_enter(&spa_namespace_lock);
+	spa->spa_inject_ref--;
+	mutex_exit(&spa_namespace_lock);
+}
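
A usage sketch for the two helpers above, assuming a hypothetical handler type and registration functions (the real callers are the fault-injection paths added elsewhere in this changeset):

typedef struct example_handler {
	spa_t	*eh_spa;		/* pool pinned for this handler */
	/* ... injection criteria would live here ... */
} example_handler_t;

static int
example_add_handler(char *pool, example_handler_t *eh)
{
	if ((eh->eh_spa = spa_inject_addref(pool)) == NULL)
		return (ENOENT);	/* no such pool */
	/* ... link eh onto a handler list ... */
	return (0);
}

static void
example_remove_handler(example_handler_t *eh)
{
	/* ... unlink eh ... */
	spa_inject_delref(eh->eh_spa);	/* pool may now be exported again */
}
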
+
 int
-spa_get_stats(const char *name, nvlist_t **config)
+spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
 {
 	int error;
 	spa_t *spa;
@@ -488,6 +650,29 @@
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, config);
 
+	if (spa && *config != NULL)
+		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
+		    spa_get_errlog_size(spa)) == 0);
+
+	/*
+	 * We want to get the alternate root even for faulted pools, so we cheat
+	 * and call spa_lookup() directly.
+	 */
+	if (altroot) {
+		if (spa == NULL) {
+			mutex_enter(&spa_namespace_lock);
+			spa = spa_lookup(name);
+			if (spa)
+				spa_altroot(spa, altroot, buflen);
+			else
+				altroot[0] = '\0';
+			spa = NULL;
+			mutex_exit(&spa_namespace_lock);
+		} else {
+			spa_altroot(spa, altroot, buflen);
+		}
+	}
+
 	if (spa != NULL)
 		spa_close(spa, FTAG);
 
@@ -551,9 +736,11 @@
 	    DMU_OT_PACKED_NVLIST, 1 << 14,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
-	VERIFY(zap_add(spa->spa_meta_objset,
+	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
-	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);
+	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add pool config");
+	}
 
 	/*
 	 * Create the deferred-free bplist object.  Turn off compression
@@ -565,9 +752,11 @@
 	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
 	    ZIO_COMPRESS_OFF, tx);
 
-	VERIFY(zap_add(spa->spa_meta_objset,
+	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);
+	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add bplist");
+	}
 
 	dmu_tx_commit(tx);
 
@@ -619,7 +808,7 @@
 	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
 	 * so that we don't try to open the pool if the config is damaged.
 	 */
-	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);
+	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
 
 	if (error) {
 		spa_unload(spa);
@@ -694,7 +883,7 @@
 	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
 	 * so we don't try to open the pool if the config is damaged.
 	 */
-	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);
+	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
@@ -738,6 +927,16 @@
 	}
 
 	/*
+	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
+	 * reacquire the namespace lock, and see if we can export.
+	 */
+	spa_open_ref(spa, FTAG);
+	mutex_exit(&spa_namespace_lock);
+	spa_async_suspend(spa);
+	mutex_enter(&spa_namespace_lock);
+	spa_close(spa, FTAG);
+
+	/*
 	 * The pool will be in core if it's openable,
 	 * in which case we can modify its state.
 	 */
@@ -749,17 +948,20 @@
 		spa_scrub_suspend(spa);
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
-		if (!spa_refcount_zero(spa)) {
+		/*
+		 * A pool cannot be exported or destroyed if there are active
+		 * references.  If we are resetting a pool, allow references by
+		 * fault injection handlers.
+		 */
+		if (!spa_refcount_zero(spa) ||
+		    (spa->spa_inject_ref != 0 &&
+		    new_state != POOL_STATE_UNINITIALIZED)) {
 			spa_scrub_resume(spa);
+			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (EBUSY);
 		}
 
-		/*
-		 * Update the pool state.
-		 */
-		spa->spa_state = new_state;
-
 		spa_scrub_resume(spa);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
 
@@ -771,7 +973,10 @@
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
-		vdev_config_dirty(spa->spa_root_vdev);
+		if (new_state != POOL_STATE_UNINITIALIZED) {
+			spa->spa_state = new_state;
+			vdev_config_dirty(spa->spa_root_vdev);
+		}
 	}
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
@@ -779,8 +984,10 @@
 		spa_deactivate(spa);
 	}
 
-	spa_remove(spa);
-	spa_config_sync();
+	if (new_state != POOL_STATE_UNINITIALIZED) {
+		spa_remove(spa);
+		spa_config_sync();
+	}
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
@@ -805,6 +1012,17 @@
 }
 
 /*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
+}
+
+
+/*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
@@ -845,7 +1063,8 @@
 			tvd->vdev_id = rvd->vdev_children;
 			vdev_add_child(rvd, tvd);
 		}
-		vdev_init(tvd, txg);
+		if ((error = vdev_init(tvd, txg)) != 0)
+			return (spa_vdev_exit(spa, vd, txg, error));
 		vdev_config_dirty(tvd);
 	}
 
@@ -871,7 +1090,7 @@
  * is automatically detached.
  */
 int
-spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
 	uint64_t txg, open_txg;
 	int error;
@@ -881,7 +1100,7 @@
 
 	txg = spa_vdev_enter(spa);
 
-	oldvd = vdev_lookup_by_path(rvd, path);
+	oldvd = vdev_lookup_by_guid(rvd, guid);
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
@@ -954,6 +1173,12 @@
 	newvd->vdev_id = pvd->vdev_children;
 	vdev_add_child(pvd, newvd);
 
+	/*
+	 * If newvd is smaller than oldvd, but larger than its rsize,
+	 * the addition of newvd may have decreased our parent's asize.
+	 */
+	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
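
A worked example of the adjustment above, with illustrative numbers only:

/*
 * Suppose oldvd has asize = 100GB and rsize = 80GB.  A 90GB newvd is an
 * acceptable replacement (it is no smaller than oldvd's rsize), but once
 * it is attached the replacing parent can be no larger than its smallest
 * child, so pvd->vdev_asize must shrink from 100GB to 90GB -- hence the
 * MIN() above.
 */
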
+
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
@@ -962,7 +1187,6 @@
 	 * Update the config based on the new in-core state.
 	 */
 	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
 	vdev_config_dirty(tvd);
 
 	/*
@@ -976,14 +1200,14 @@
 	    open_txg - TXG_INITIAL + 1);
 	mutex_exit(&newvd->vdev_dtl_lock);
 
+	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);
+
 	/*
 	 * Mark newvd's DTL dirty in this txg.
 	 */
 	vdev_dirty(tvd, VDD_DTL, txg);
 	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
 
-	dprintf("attached %s, replacing=%d\n", path, replacing);
-
 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
 
 	/*
@@ -1000,7 +1224,7 @@
  * is a replacing vdev.
  */
 int
-spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 {
 	uint64_t txg;
 	int c, t, error;
@@ -1009,14 +1233,11 @@
 
 	txg = spa_vdev_enter(spa);
 
-	vd = vdev_lookup_by_path(rvd, path);
+	vd = vdev_lookup_by_guid(rvd, guid);
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
-	if (guid != 0 && vd->vdev_guid != guid)
-		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
 	pvd = vd->vdev_parent;
 
 	/*
@@ -1105,13 +1326,16 @@
 	/*
 	 * Reopen this top-level vdev to reassess health after detach.
 	 */
-	vdev_reopen(tvd, NULL);
+	vdev_reopen(tvd);
 
 	/*
 	 * If the device we just detached was smaller than the others,
-	 * it may be possible to add metaslabs (i.e. grow the pool).
+	 * it may be possible to add metaslabs (i.e. grow the pool).  We ignore
+	 * the error here because the detach still succeeded - we just weren't
+	 * able to reinitialize the metaslabs.  This pool is in for a world of
+	 * hurt, in any case.
 	 */
-	vdev_metaslab_init(tvd, txg);
+	(void) vdev_metaslab_init(tvd, txg);
 
 	/*
 	 * Update the config based on the new in-core state.
@@ -1133,72 +1357,59 @@
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
 
-	dprintf("detached %s\n", path);
+	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);
 
 	return (spa_vdev_exit(spa, vd, txg, 0));
 }
 
 /*
- * If there are any replacing vdevs that have finished replacing, detach them.
- * We can't hold the config lock across detaches, so we lock the config,
- * build a list of candidates, unlock the config, and try each candidate.
+ * Find any device that's done replacing, so we can detach it.
  */
-typedef struct vdev_detach_link {
-	char		*vdl_path;
-	uint64_t	vdl_guid;
-	list_node_t	vdl_node;
-} vdev_detach_link_t;
-
-static void
-spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
+static vdev_t *
+spa_vdev_replace_done_hunt(vdev_t *vd)
 {
+	vdev_t *newvd, *oldvd;
 	int c;
 
-	for (c = 0; c < vd->vdev_children; c++)
-		spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);
+	for (c = 0; c < vd->vdev_children; c++) {
+		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+		if (oldvd != NULL)
+			return (oldvd);
+	}
 
 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
-		vdev_t *cvd0 = vd->vdev_child[0];
-		vdev_t *cvd1 = vd->vdev_child[1];
-		vdev_detach_link_t *vdl;
-		int dirty1;
+		oldvd = vd->vdev_child[0];
+		newvd = vd->vdev_child[1];
 
-		mutex_enter(&cvd1->vdev_dtl_lock);
-		dirty1 = cvd1->vdev_dtl_map.sm_space |
-		    cvd1->vdev_dtl_scrub.sm_space;
-		mutex_exit(&cvd1->vdev_dtl_lock);
+		mutex_enter(&newvd->vdev_dtl_lock);
+		if (newvd->vdev_dtl_map.sm_space == 0 &&
+		    newvd->vdev_dtl_scrub.sm_space == 0) {
+			mutex_exit(&newvd->vdev_dtl_lock);
+			return (oldvd);
+		}
+		mutex_exit(&newvd->vdev_dtl_lock);
+	}
 
-		if (!dirty1) {
-			vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
-			vdl->vdl_path = spa_strdup(cvd0->vdev_path);
-			vdl->vdl_guid = cvd0->vdev_guid;
-			list_insert_tail(l, vdl);
-		}
-	}
+	return (NULL);
 }
 
-void
+static void
 spa_vdev_replace_done(spa_t *spa)
 {
-	vdev_detach_link_t *vdl;
-	list_t vdlist;
+	vdev_t *vd;
+	uint64_t guid;
 
-	list_create(&vdlist, sizeof (vdev_detach_link_t),
-	    offsetof(vdev_detach_link_t, vdl_node));
+	spa_config_enter(spa, RW_READER, FTAG);
 
-	spa_config_enter(spa, RW_READER);
-	spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
-	spa_config_exit(spa);
-
-	while ((vdl = list_head(&vdlist)) != NULL) {
-		list_remove(&vdlist, vdl);
-		(void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
-		    B_TRUE);
-		spa_strfree(vdl->vdl_path);
-		kmem_free(vdl, sizeof (*vdl));
+	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+		guid = vd->vdev_guid;
+		spa_config_exit(spa, FTAG);
+		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+			return;
+		spa_config_enter(spa, RW_READER, FTAG);
 	}
 
-	list_destroy(&vdlist);
+	spa_config_exit(spa, FTAG);
 }
 
 /*
@@ -1234,7 +1445,16 @@
  * ==========================================================================
  */
 
-static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
+void
+spa_scrub_throttle(spa_t *spa, int direction)
+{
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_throttled += direction;
+	ASSERT(spa->spa_scrub_throttled >= 0);
+	if (spa->spa_scrub_throttled == 0)
+		cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
+}
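
spa_scrub_throttle() gives the I/O path a way to push back on the scrub thread, which (further down in this diff) waits on spa_scrub_io_cv while spa_scrub_throttled is nonzero. A hypothetical caller sketch; the real throttle points live outside this file:

static void
example_process_scrub_read(spa_t *spa)	/* hypothetical */
{
	spa_scrub_throttle(spa, 1);	/* one more reason for the scrubber to wait */

	/* ... memory-hungry processing of the scrub I/O ... */

	spa_scrub_throttle(spa, -1);	/* last caller out wakes the scrub thread */
}
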
 
 static void
 spa_scrub_io_done(zio_t *zio)
@@ -1244,22 +1464,23 @@
 	zio_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
-	if (zio->io_error)
+	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		vdev_t *vd = zio->io_vd;
 		spa->spa_scrub_errors++;
-	if (--spa->spa_scrub_inflight == 0)
-		cv_broadcast(&spa->spa_scrub_io_cv);
-	mutex_exit(&spa->spa_scrub_lock);
-
-	if (zio->io_error) {
-		vdev_t *vd = zio->io_vd;
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_scrub_errors++;
 		mutex_exit(&vd->vdev_stat_lock);
 	}
+	if (--spa->spa_scrub_inflight == 0) {
+		cv_broadcast(&spa->spa_scrub_io_cv);
+		ASSERT(spa->spa_scrub_throttled == 0);
+	}
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
 static void
-spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
+    zbookmark_t *zb)
 {
 	size_t size = BP_GET_LSIZE(bp);
 	void *data = zio_buf_alloc(size);
@@ -1268,8 +1489,13 @@
 	spa->spa_scrub_inflight++;
 	mutex_exit(&spa->spa_scrub_lock);
 
+	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
+
+	flags |= ZIO_FLAG_CANFAIL;
+
 	zio_nowait(zio_read(NULL, spa, bp, data, size,
-	    spa_scrub_io_done, NULL, priority, flags));
+	    spa_scrub_io_done, NULL, priority, flags, zb));
 }
 
 /* ARGSUSED */
@@ -1319,12 +1545,11 @@
 		}
 		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
 			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
-			    ZIO_FLAG_RESILVER);
+			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
 		}
 	} else {
 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
+		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
 	}
 
 	return (0);
@@ -1348,19 +1573,25 @@
 	 */
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
-	spa_config_enter(spa, RW_WRITER);
-	vdev_reopen(rvd, NULL);		/* purge all vdev caches */
+	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
+	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
+
+	spa_config_enter(spa, RW_WRITER, FTAG);
+	vdev_reopen(rvd);		/* purge all vdev caches */
 	vdev_config_dirty(rvd);		/* rewrite all disk labels */
 	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
-	spa_config_exit(spa);
+	spa_config_exit(spa, FTAG);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_errors = 0;
 	spa->spa_scrub_active = 1;
+	ASSERT(spa->spa_scrub_inflight == 0);
+	ASSERT(spa->spa_scrub_throttled == 0);
 
 	while (!spa->spa_scrub_stop) {
 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
-		while (spa->spa_scrub_suspend) {
+		while (spa->spa_scrub_suspended) {
 			spa->spa_scrub_active = 0;
 			cv_broadcast(&spa->spa_scrub_cv);
 			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1376,6 +1607,9 @@
 		mutex_enter(&spa->spa_scrub_lock);
 		if (error != EAGAIN)
 			break;
+
+		while (spa->spa_scrub_throttled > 0)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	}
 
 	while (spa->spa_scrub_inflight)
@@ -1384,16 +1618,25 @@
 	if (spa->spa_scrub_restart_txg != 0)
 		error = ERESTART;
 
+	if (spa->spa_scrub_stop)
+		error = EINTR;
+
 	spa->spa_scrub_active = 0;
 	cv_broadcast(&spa->spa_scrub_cv);
 
 	/*
-	 * If the traverse completed, and there were no errors,
-	 * then the scrub was completely successful.
+	 * Even if there were uncorrectable errors, we consider the scrub
+	 * completed.  The downside is that if there is a transient error during
+	 * a resilver, we won't resilver the data properly to the target.  But
+	 * if the damage is permanent (more likely) we will resilver forever,
+	 * which isn't really acceptable.  Since there is enough information for
+	 * the user to know what has failed and why, this seems like a more
+	 * tractable approach.
 	 */
-	complete = (error == 0 && spa->spa_scrub_errors == 0);
+	complete = (error == 0);
 
-	dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
 	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
 	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
 
@@ -1403,31 +1646,32 @@
 	 * If the scrub/resilver completed, update all DTLs to reflect this.
 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
 	 */
-	spa_config_enter(spa, RW_WRITER);
+	spa_config_enter(spa, RW_WRITER, FTAG);
 	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
 	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
-	spa_config_exit(spa);
-
-	spa_vdev_replace_done(spa);
-
-	spa_config_enter(spa, RW_READER);
 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
-	spa_config_exit(spa);
+	spa_errlog_rotate(spa);
+	spa_config_exit(spa, FTAG);
 
 	mutex_enter(&spa->spa_scrub_lock);
 
-	spa->spa_scrub_type = POOL_SCRUB_NONE;
-	spa->spa_scrub_active = 0;
-	spa->spa_scrub_thread = NULL;
-
-	cv_broadcast(&spa->spa_scrub_cv);
+	/*
+	 * We may have finished replacing a device.
+	 * Let the async thread assess this and handle the detach.
+	 */
+	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
 
 	/*
 	 * If we were told to restart, our final act is to start a new scrub.
 	 */
 	if (error == ERESTART)
-		VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);
+		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
+		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
 
+	spa->spa_scrub_type = POOL_SCRUB_NONE;
+	spa->spa_scrub_active = 0;
+	spa->spa_scrub_thread = NULL;
+	cv_broadcast(&spa->spa_scrub_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
 	thread_exit();
 }
@@ -1436,7 +1680,7 @@
 spa_scrub_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_suspend++;
+	spa->spa_scrub_suspended++;
 	while (spa->spa_scrub_active) {
 		cv_broadcast(&spa->spa_scrub_cv);
 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1450,8 +1694,8 @@
 spa_scrub_resume(spa_t *spa)
 {
 	mutex_enter(&spa->spa_scrub_lock);
-	ASSERT(spa->spa_scrub_suspend != 0);
-	if (--spa->spa_scrub_suspend == 0)
+	ASSERT(spa->spa_scrub_suspended != 0);
+	if (--spa->spa_scrub_suspended == 0)
 		cv_broadcast(&spa->spa_scrub_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
@@ -1469,17 +1713,19 @@
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
-static int
-spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
 {
 	space_seg_t *ss;
 	uint64_t mintxg, maxtxg;
 	vdev_t *rvd = spa->spa_root_vdev;
-	int advance = 0;
+	int advance = ADVANCE_PRE | ADVANCE_ZIL;
 
 	if ((uint_t)type >= POOL_SCRUB_TYPES)
 		return (ENOTSUP);
 
+	mutex_enter(&spa->spa_scrub_lock);
+
 	/*
 	 * If there's a scrub or resilver already in progress, stop it.
 	 */
@@ -1487,9 +1733,10 @@
 		/*
 		 * Don't stop a resilver unless forced.
 		 */
-		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
+		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
+			mutex_exit(&spa->spa_scrub_lock);
 			return (EBUSY);
-
+		}
 		spa->spa_scrub_stop = 1;
 		cv_broadcast(&spa->spa_scrub_cv);
 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1503,19 +1750,36 @@
 		spa->spa_scrub_th = NULL;
 	}
 
-	spa->spa_scrub_stop = 0;
-	spa->spa_scrub_type = type;
-	spa->spa_scrub_restart_txg = 0;
+	if (rvd == NULL) {
+		ASSERT(spa->spa_scrub_stop == 0);
+		ASSERT(spa->spa_scrub_type == type);
+		ASSERT(spa->spa_scrub_restart_txg == 0);
+		mutex_exit(&spa->spa_scrub_lock);
+		return (0);
+	}
 
 	mintxg = TXG_INITIAL - 1;
 	maxtxg = spa_last_synced_txg(spa) + 1;
 
-	switch (type) {
+	mutex_enter(&rvd->vdev_dtl_lock);
 
-	case POOL_SCRUB_NONE:
-		break;
+	if (rvd->vdev_dtl_map.sm_space == 0) {
+		/*
+		 * The pool-wide DTL is empty.
+		 * If this is a resilver, there's nothing to do.
+		 */
+		if (type == POOL_SCRUB_RESILVER)
+			type = POOL_SCRUB_NONE;
+	} else {
+		/*
+		 * The pool-wide DTL is non-empty.
+		 * If this is a normal scrub, upgrade to a resilver instead.
+		 */
+		if (type == POOL_SCRUB_EVERYTHING)
+			type = POOL_SCRUB_RESILVER;
+	}
 
-	case POOL_SCRUB_RESILVER:
+	if (type == POOL_SCRUB_RESILVER) {
 		/*
 		 * Determine the resilvering boundaries.
 		 *
@@ -1525,26 +1789,22 @@
 		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
 		 * so we don't claim to resilver a txg that's still changing.
 		 */
-		mutex_enter(&rvd->vdev_dtl_lock);
 		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
-		mintxg = ss ? ss->ss_start - 1 : 0;
+		mintxg = ss->ss_start - 1;
 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
-		maxtxg = ss ? ss->ss_end : 0;
-		maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
-		mutex_exit(&rvd->vdev_dtl_lock);
+		maxtxg = MIN(ss->ss_end, maxtxg);
 
-		advance = ADVANCE_PRE | ADVANCE_PRUNE;
-		break;
-
-	case POOL_SCRUB_EVERYTHING:
-		/*
-		 * A scrub is like a resilver, but not pruned by DTL.
-		 */
-		advance = ADVANCE_PRE;
-		break;
+		advance |= ADVANCE_PRUNE;
 	}
 
-	if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
+	mutex_exit(&rvd->vdev_dtl_lock);
+
+	spa->spa_scrub_stop = 0;
+	spa->spa_scrub_type = type;
+	spa->spa_scrub_restart_txg = 0;
+
+	if (type != POOL_SCRUB_NONE) {
+		spa->spa_scrub_mintxg = mintxg;
 		spa->spa_scrub_maxtxg = maxtxg;
 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
 		    advance, ZIO_FLAG_CANFAIL);
@@ -1553,24 +1813,119 @@
 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
 	}
 
+	mutex_exit(&spa->spa_scrub_lock);
+
 	return (0);
 }
 
-int
-spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_reopen(spa_t *spa)
 {
-	int error;
-	traverse_handle_t *th;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *tvd;
+	int c;
+
+	spa_config_enter(spa, RW_WRITER, FTAG);
+
+	for (c = 0; c < rvd->vdev_children; c++) {
+		tvd = rvd->vdev_child[c];
+		if (tvd->vdev_reopen_wanted) {
+			tvd->vdev_reopen_wanted = 0;
+			vdev_reopen(tvd);
+		}
+	}
 
-	mutex_enter(&spa->spa_scrub_lock);
-	error = spa_scrub_locked(spa, type, force);
-	th = spa->spa_scrub_th;
-	mutex_exit(&spa->spa_scrub_lock);
+	spa_config_exit(spa, FTAG);
+}
+
+static void
+spa_async_thread(spa_t *spa)
+{
+	int tasks;
+
+	ASSERT(spa->spa_sync_on);
 
-	if (th == NULL && type != POOL_SCRUB_NONE)
+	mutex_enter(&spa->spa_async_lock);
+	tasks = spa->spa_async_tasks;
+	spa->spa_async_tasks = 0;
+	mutex_exit(&spa->spa_async_lock);
+
+	/*
+	 * See if any devices need to be reopened.
+	 */
+	if (tasks & SPA_ASYNC_REOPEN)
+		spa_async_reopen(spa);
+
+	/*
+	 * If any devices are done replacing, detach them.
+	 */
+	if (tasks & SPA_ASYNC_REPLACE_DONE)
 		spa_vdev_replace_done(spa);
 
-	return (error);
+	/*
+	 * Kick off a scrub.
+	 */
+	if (tasks & SPA_ASYNC_SCRUB)
+		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+
+	/*
+	 * Kick off a resilver.
+	 */
+	if (tasks & SPA_ASYNC_RESILVER)
+		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+	/*
+	 * Let the world know that we're done.
+	 */
+	mutex_enter(&spa->spa_async_lock);
+	spa->spa_async_thread = NULL;
+	cv_broadcast(&spa->spa_async_cv);
+	mutex_exit(&spa->spa_async_lock);
+	thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+	mutex_enter(&spa->spa_async_lock);
+	spa->spa_async_suspended++;
+	while (spa->spa_async_thread != NULL)
+		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+	mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+	mutex_enter(&spa->spa_async_lock);
+	ASSERT(spa->spa_async_suspended != 0);
+	spa->spa_async_suspended--;
+	mutex_exit(&spa->spa_async_lock);
+}
+
+static void
+spa_async_dispatch(spa_t *spa)
+{
+	mutex_enter(&spa->spa_async_lock);
+	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+	    spa->spa_async_thread == NULL)
+		spa->spa_async_thread = thread_create(NULL, 0,
+		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+	mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+	mutex_enter(&spa->spa_async_lock);
+	spa->spa_async_tasks |= task;
+	mutex_exit(&spa->spa_async_lock);
 }
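
The async mechanism decouples work that needs the config lock or spa_namespace_lock (reopening devices, detaching finished replacements, restarting scrubs) from the contexts that notice the need for it: producers OR a bit into spa_async_tasks, and spa_sync() calls spa_async_dispatch() at the end of every txg (see the hunk further down). A hypothetical producer sketch:

static void
example_note_device_problem(vdev_t *vd)	/* hypothetical error-path hook */
{
	/*
	 * Don't take the config lock here; just flag the device and let
	 * the async thread reopen it from a safe context.
	 */
	vd->vdev_reopen_wanted = 1;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
}
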
 
 /*
@@ -1628,17 +1983,19 @@
 
 	packed = kmem_alloc(nvsize, KM_SLEEP);
 
-	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);
+	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
+	    KM_SLEEP) == 0);
 
 	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
 	    packed, tx);
 
 	kmem_free(packed, nvsize);
 
-	db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
+	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
+	    spa->spa_config_object, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	*(uint64_t *)db->db_data = nvsize;
-	dmu_buf_rele(db);
+	dmu_buf_rele(db, FTAG);
 }
 
 /*
@@ -1651,7 +2008,6 @@
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
 	bplist_t *bpl = &spa->spa_sync_bplist;
-	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int dirty_vdevs;
@@ -1659,12 +2015,12 @@
 	/*
 	 * Lock out configuration changes.
 	 */
-	spa_config_enter(spa, RW_READER);
+	spa_config_enter(spa, RW_READER, FTAG);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
-	bplist_open(bpl, mos, spa->spa_sync_bplist_obj);
+	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
 
 	/*
 	 * If anything has changed in this txg, push the deferred frees
@@ -1685,6 +2041,8 @@
 		spa_sync_config_object(spa, tx);
 		dmu_tx_commit(tx);
 
+		spa_errlog_sync(spa, txg);
+
 		dsl_pool_sync(dp, txg);
 
 		dirty_vdevs = 0;
@@ -1707,11 +2065,7 @@
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
 	 */
-	while (spa_sync_labels(spa, txg)) {
-		dprintf("waiting for devices to heal\n");
-		delay(hz);
-		vdev_reopen(rvd, NULL);
-	}
+	VERIFY(0 == spa_sync_labels(spa, txg));
 
 	/*
 	 * Make a stable copy of the fully synced uberblock.
@@ -1748,7 +2102,12 @@
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 	ASSERT(bpl->bpl_queue == NULL);
 
-	spa_config_exit(spa);
+	spa_config_exit(spa, FTAG);
+
+	/*
+	 * If any async tasks have been requested, kick them off.
+	 */
+	spa_async_dispatch(spa);
 }
 
 /*
@@ -1800,13 +2159,13 @@
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(NULL)) != NULL) {
 		/*
-		 * Stop all scrub and resilver activity.  spa_scrub() needs to
-		 * wait for the scrub thread, which may do a detach and sync the
-		 * configs, which needs spa_namespace_lock.  Drop the lock while
-		 * maintaining a hold on the spa_t.
+		 * Stop async tasks.  The async thread may need to detach
+		 * a device that's been replaced, which requires grabbing
+		 * spa_namespace_lock, so we must drop it here.
 		 */
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
+		spa_async_suspend(spa);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
@@ -1819,3 +2178,9 @@
 	}
 	mutex_exit(&spa_namespace_lock);
 }
+
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid)
+{
+	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
+}