diff usr/src/uts/common/fs/zfs/spa.c @ 1635:0ab1193d47cb

6398664 zpool detach: missing argument to error message causes core dump
6398713 don't sync config cache until root is mounted read/write
6396049 spa_sync_labels() should try all devices before giving up
6398380 ASSERT: !vd->vdev_is_dirty, vdev.c:450, when running ziltest
6399272 scrub seems constipated; needs ADVANCE_PRUNE juice
author bonwick
date Thu, 16 Mar 2006 16:31:10 -0800
parents 438b928f80c7
children 9e3ae798af31
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Mar 16 15:25:52 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Thu Mar 16 16:31:10 2006 -0800
@@ -54,8 +54,6 @@
 #include <sys/fs/zfs.h>
 #include <sys/callb.h>
 
-static uint32_t spa_active_count;
-
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
@@ -267,25 +265,24 @@
 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 {
 	int error = 0;
-	uint64_t config_cache_txg = spa->spa_config_txg;
 	nvlist_t *nvroot = NULL;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
+	uint64_t config_cache_txg = spa->spa_config_txg;
 	uint64_t pool_guid;
 	zio_t *zio;
 
 	spa->spa_load_state = state;
+
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
-	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
+	    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &spa->spa_config_txg) && mosconfig)) {
 		error = EINVAL;
 		goto out;
 	}
 
-	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-	    &spa->spa_config_txg);
-
-	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
-	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0)) {
 		error = EEXIST;
 		goto out;
@@ -344,7 +341,8 @@
 	 * If the vdev guid sum doesn't match the uberblock, we have an
 	 * incomplete configuration.
 	 */
-	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+	if (rvd->vdev_guid_sum != ub->ub_guid_sum && (mosconfig ||
+	    state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT)) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_GUID_SUM);
 		error = ENXIO;
@@ -447,11 +445,11 @@
 	 * ZIO_FLAG_CONFIG_HELD flag.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
-	if ((error = vdev_load(rvd)) != 0) {
-		spa_config_exit(spa, FTAG);
+	error = vdev_load(rvd);
+	spa_config_exit(spa, FTAG);
+
+	if (error)
 		goto out;
-	}
-	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
@@ -469,19 +467,15 @@
 		goto out;
 	}
 
-	/*
-	 * Claim log blocks that haven't been committed yet, and update all
-	 * top-level vdevs to sync any config changes found in vdev_load().
-	 * This must all happen in a single txg.
-	 */
 	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+		dmu_tx_t *tx;
+		int need_update = B_FALSE;
 		int c;
-		dmu_tx_t *tx;
 
-		spa_config_enter(spa, RW_WRITER, FTAG);
-		vdev_config_dirty(rvd);
-		spa_config_exit(spa, FTAG);
-
+		/*
+		 * Claim log blocks that haven't been committed yet.
+		 * This must all happen in a single txg.
+		 */
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -496,45 +490,23 @@
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 		/*
-		 * If the config cache is stale relative to the mosconfig,
-		 * sync the config cache.
+		 * If the config cache is stale, or we have uninitialized
+		 * metaslabs (see spa_vdev_add()), then update the config.
 		 */
-		if (config_cache_txg != spa->spa_config_txg) {
-			uint64_t txg;
-			spa_config_enter(spa, RW_WRITER, FTAG);
-			txg = spa_last_synced_txg(spa) + 1;
-			spa_config_set(spa,
-			    spa_config_generate(spa, rvd, txg, 0));
-			spa_config_exit(spa, FTAG);
-			txg_wait_synced(spa->spa_dsl_pool, txg);
-			spa_config_sync();
-		}
+		if (config_cache_txg != spa->spa_config_txg ||
+		    state == SPA_LOAD_IMPORT)
+			need_update = B_TRUE;
+
+		for (c = 0; c < rvd->vdev_children; c++)
+			if (rvd->vdev_child[c]->vdev_ms_array == 0)
+				need_update = B_TRUE;
 
 		/*
-		 * If we have top-level vdevs that were added but have
-		 * not yet been prepared for allocation, do that now.
-		 * (It's safe now because the config cache is up to date,
-		 * so it will be able to translate the new DVAs.)
-		 * See comments in spa_vdev_add() for full details.
+		 * Update the config cache asynchronously in case we're the
+		 * root pool, in which case the config cache isn't writable yet.
 		 */
-		for (c = 0; c < rvd->vdev_children; c++) {
-			vdev_t *tvd = rvd->vdev_child[c];
-			if (tvd->vdev_ms_array == 0) {
-				uint64_t txg;
-				ASSERT(tvd->vdev_ms_shift == 0);
-				spa_config_enter(spa, RW_WRITER, FTAG);
-				txg = spa_last_synced_txg(spa) + 1;
-				vdev_init(tvd, txg);
-				vdev_config_dirty(tvd);
-				spa_config_set(spa,
-				    spa_config_generate(spa, rvd, txg, 0));
-				spa_config_exit(spa, FTAG);
-				txg_wait_synced(spa->spa_dsl_pool, txg);
-				ASSERT(tvd->vdev_ms_shift != 0);
-				ASSERT(tvd->vdev_ms_array != 0);
-				spa_config_sync();
-			}
-		}
+		if (need_update)
+			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	error = 0;
@@ -589,8 +561,7 @@
 
 		spa_activate(spa);
 
-		error = spa_load(spa, spa->spa_config,
-		    SPA_LOAD_OPEN, B_FALSE);
+		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
 
 		if (error == EBADF) {
 			/*
@@ -615,9 +586,12 @@
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
-			if (config != NULL && spa->spa_root_vdev != NULL)
+			if (config != NULL && spa->spa_root_vdev != NULL) {
+				spa_config_enter(spa, RW_READER, FTAG);
 				*config = spa_config_generate(spa, NULL, -1ULL,
 				    B_TRUE);
+				spa_config_exit(spa, FTAG);
+			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = B_TRUE;
@@ -730,12 +704,13 @@
  * Pool Creation
  */
 int
-spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
+spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
 {
 	spa_t *spa;
+	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
-	int error;
+	int c, error;
 	uint64_t txg = TXG_INITIAL;
 
 	/*
@@ -746,22 +721,36 @@
 		mutex_exit(&spa_namespace_lock);
 		return (EEXIST);
 	}
-	spa = spa_add(pool);
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
+	spa = spa_add(pool, altroot);
 	spa_activate(spa);
 
-	if (altroot != NULL) {
-		spa->spa_root = spa_strdup(altroot);
-		atomic_add_32(&spa_active_count, 1);
-	}
-
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_ubsync = spa->spa_uberblock;
 
-	error = spa_vdev_add(spa, nvroot);
+	/*
+	 * Create the root vdev.
+	 */
+	spa_config_enter(spa, RW_WRITER, FTAG);
+
+	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+	ASSERT(spa->spa_root_vdev == rvd);
+
+	if (rvd == NULL) {
+		error = EINVAL;
+	} else {
+		if ((error = vdev_create(rvd, txg)) == 0) {
+			for (c = 0; c < rvd->vdev_children; c++)
+				vdev_init(rvd->vdev_child[c], txg);
+			vdev_config_dirty(rvd);
+		}
+	}
+
+	spa_config_exit(spa, FTAG);
 
 	if (error) {
 		spa_unload(spa);
@@ -828,7 +817,7 @@
  * then call spa_load() to do the dirty work.
  */
 int
-spa_import(const char *pool, nvlist_t *config, char *altroot)
+spa_import(const char *pool, nvlist_t *config, const char *altroot)
 {
 	spa_t *spa;
 	int error;
@@ -846,25 +835,15 @@
 	}
 
 	/*
-	 * Create an initialize the spa structure
+	 * Create and initialize the spa structure.
 	 */
-	spa = spa_add(pool);
+	spa = spa_add(pool, altroot);
 	spa_activate(spa);
 
 	/*
-	 * Set the alternate root, if there is one.
+	 * Pass off the heavy lifting to spa_load().
 	 */
-	if (altroot != NULL) {
-		spa->spa_root = spa_strdup(altroot);
-		atomic_add_32(&spa_active_count, 1);
-	}
-
-	/*
-	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
-	 * so that we don't try to open the pool if the config is damaged.
-	 * Note: on success, spa_load() will update and sync the config cache.
-	 */
-	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_FALSE);
 
 	if (error) {
 		spa_unload(spa);
@@ -874,6 +853,11 @@
 		return (error);
 	}
 
+	/*
+	 * Update the config cache to include the newly-imported pool.
+	 */
+	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
 	mutex_exit(&spa_namespace_lock);
 
 	/*
@@ -905,27 +889,25 @@
 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
 		return (NULL);
 
+	/*
+	 * Create and initialize the spa structure.
+	 */
 	mutex_enter(&spa_namespace_lock);
-	spa = spa_add(TRYIMPORT_NAME);
-
-	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
-
-	/*
-	 * Initialize the spa_t structure.
-	 */
+	spa = spa_add(TRYIMPORT_NAME, NULL);
 	spa_activate(spa);
 
 	/*
-	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
-	 * so we don't try to open the pool if the config is damaged.
+	 * Pass off the heavy lifting to spa_load().
 	 */
-	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_FALSE);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
 	 */
 	if (spa->spa_root_vdev != NULL) {
+		spa_config_enter(spa, RW_READER, FTAG);
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+		spa_config_exit(spa, FTAG);
 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    poolname) == 0);
 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
@@ -1001,9 +983,6 @@
 		spa_scrub_resume(spa);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
 
-		if (spa->spa_root != NULL)
-			atomic_add_32(&spa_active_count, -1);
-
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
@@ -1012,6 +991,7 @@
 		if (new_state != POOL_STATE_UNINITIALIZED) {
 			spa_config_enter(spa, RW_WRITER, FTAG);
 			spa->spa_state = new_state;
+			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, FTAG);
 		}
@@ -1073,7 +1053,7 @@
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg;
-	int c, c0, children, error;
+	int c, error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 
@@ -1084,30 +1064,17 @@
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
-	if (rvd == NULL) {			/* spa_create() */
-		rvd = vd;
-		c0 = 0;
-	} else {
-		c0 = rvd->vdev_children;
-	}
-
-	ASSERT(spa->spa_root_vdev == rvd);
-
 	if ((error = vdev_create(vd, txg)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
-	children = vd->vdev_children;
-
 	/*
 	 * Transfer each new top-level vdev from vd to rvd.
 	 */
-	for (c = 0; c < children; c++) {
+	for (c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
-		if (vd != rvd) {
-			vdev_remove_child(vd, tvd);
-			tvd->vdev_id = c0 + c;
-			vdev_add_child(rvd, tvd);
-		}
+		vdev_remove_child(vd, tvd);
+		tvd->vdev_id = rvd->vdev_children;
+		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
@@ -1118,29 +1085,19 @@
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
-	 * initialize the metaslabs; and sync the config cache again.
+	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
-	if (vd != rvd) {
-		(void) spa_vdev_exit(spa, vd, txg, 0);
-		txg = spa_vdev_enter(spa);
-		vd = NULL;
-	}
+	(void) spa_vdev_exit(spa, vd, txg, 0);
 
-	/*
-	 * Now that the config is safely on disk, we can use the new space.
-	 */
-	for (c = 0; c < children; c++) {
-		tvd = rvd->vdev_child[c0 + c];
-		ASSERT(tvd->vdev_ms_array == 0);
-		vdev_init(tvd, txg);
-		vdev_config_dirty(tvd);
-	}
+	mutex_enter(&spa_namespace_lock);
+	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+	mutex_exit(&spa_namespace_lock);
 
-	return (spa_vdev_exit(spa, vd, txg, 0));
+	return (0);
 }
 
 /*
@@ -1795,7 +1752,6 @@
 	space_seg_t *ss;
 	uint64_t mintxg, maxtxg;
 	vdev_t *rvd = spa->spa_root_vdev;
-	int advance = ADVANCE_PRE | ADVANCE_ZIL;
 
 	if ((uint_t)type >= POOL_SCRUB_TYPES)
 		return (ENOTSUP);
@@ -1869,8 +1825,6 @@
 		mintxg = ss->ss_start - 1;
 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
 		maxtxg = MIN(ss->ss_end, maxtxg);
-
-		advance |= ADVANCE_PRUNE;
 	}
 
 	mutex_exit(&rvd->vdev_dtl_lock);
@@ -1883,7 +1837,8 @@
 		spa->spa_scrub_mintxg = mintxg;
 		spa->spa_scrub_maxtxg = maxtxg;
 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
-		    advance, ZIO_FLAG_CANFAIL);
+		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
+		    ZIO_FLAG_CANFAIL);
 		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
 		spa->spa_scrub_thread = thread_create(NULL, 0,
 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
@@ -1933,6 +1888,15 @@
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
+	 * See if the config needs to be updated.
+	 */
+	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+		mutex_exit(&spa_namespace_lock);
+	}
+
+	/*
 	 * See if any devices need to be reopened.
 	 */
 	if (tasks & SPA_ASYNC_REOPEN)
@@ -1990,7 +1954,8 @@
 {
 	mutex_enter(&spa->spa_async_lock);
 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
-	    spa->spa_async_thread == NULL)
+	    spa->spa_async_thread == NULL &&
+	    rootdir != NULL && !vn_is_readonly(rootdir))
 		spa->spa_async_thread = thread_create(NULL, 0,
 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 	mutex_exit(&spa->spa_async_lock);
@@ -2053,7 +2018,9 @@
 
 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
 
-	spa_config_set(spa, config);
+	if (spa->spa_config_syncing)
+		nvlist_free(spa->spa_config_syncing);
+	spa->spa_config_syncing = config;
 
 	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);
 
@@ -2084,6 +2051,7 @@
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
 	bplist_t *bpl = &spa->spa_sync_bplist;
+	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int dirty_vdevs;
@@ -2140,8 +2108,45 @@
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
+	 *
+	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
+	 * Otherwise, pick a random top-level vdev that's known to be
+	 * visible in the config cache (see spa_vdev_add() for details).
+	 * If the write fails, try the next vdev until we've tried them all.
 	 */
-	VERIFY(0 == spa_sync_labels(spa, txg));
+	if (!list_is_empty(&spa->spa_dirty_list)) {
+		VERIFY(vdev_config_sync(rvd, txg) == 0);
+	} else {
+		int children = rvd->vdev_children;
+		int c0 = spa_get_random(children);
+		int c;
+
+		for (c = 0; c < children; c++) {
+			vd = rvd->vdev_child[(c0 + c) % children];
+			if (vd->vdev_ms_array == 0)
+				continue;
+			if (vdev_config_sync(vd, txg) == 0)
+				break;
+		}
+		if (c == children)
+			VERIFY(vdev_config_sync(rvd, txg) == 0);
+	}
+
+	/*
+	 * Clear the dirty config list.
+	 */
+	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
+		vdev_config_clean(vd);
+
+	/*
+	 * Now that the new config has synced transactionally,
+	 * let it become visible to the config cache.
+	 */
+	if (spa->spa_config_syncing != NULL) {
+		spa_config_set(spa, spa->spa_config_syncing);
+		spa->spa_config_txg = txg;
+		spa->spa_config_syncing = NULL;
+	}
 
 	/*
 	 * Make a stable copy of the fully synced uberblock.
@@ -2214,12 +2219,6 @@
  * ==========================================================================
  */
 
-int
-spa_busy(void)
-{
-	return (spa_active_count != 0);
-}
-
 /*
  * Remove all pools in the system.
  */