changeset 1585:4ad213e858a9

6395480 ztest ASSERT: rbt.bt_objset == wbt.bt_objset, line 2041 6395485 ensure the guid in vdev_guid is unique 6395487 config cache can become stale relative to mosconfig 6395488 vdev addition must sync to config cache before allocation begins
author bonwick
date Thu, 09 Mar 2006 16:56:05 -0800
parents 889b0aa2153e
children 9657e3dadaf6
files usr/src/cmd/ztest/ztest.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/vdev.c
diffstat 5 files changed, 169 insertions(+), 62 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/ztest/ztest.c	Thu Mar 09 16:31:45 2006 -0800
+++ b/usr/src/cmd/ztest/ztest.c	Thu Mar 09 16:56:05 2006 -0800
@@ -2038,6 +2038,9 @@
 
 			bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
 
+			if (rbt.bt_objset == 0)		/* concurrent free */
+				continue;
+
 			ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
 			ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
 			ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
@@ -2970,9 +2973,11 @@
 	 * Verify that we can export the pool and reimport it under a
 	 * different name.
 	 */
-	(void) snprintf(name, 100, "%s_import", pool);
-	ztest_spa_import_export(pool, name);
-	ztest_spa_import_export(name, pool);
+	if (ztest_random(2) == 0) {
+		(void) snprintf(name, 100, "%s_import", pool);
+		ztest_spa_import_export(pool, name);
+		ztest_spa_import_export(name, pool);
+	}
 
 	/*
 	 * Verify that we can loop over all pools.
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Mar 09 16:31:45 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Thu Mar 09 16:56:05 2006 -0800
@@ -252,10 +252,9 @@
 	/*
 	 * Close all vdevs.
 	 */
-	if (spa->spa_root_vdev) {
+	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
-		spa->spa_root_vdev = NULL;
-	}
+	ASSERT(spa->spa_root_vdev == NULL);
 
 	spa->spa_async_suspended = 0;
 }
@@ -268,6 +267,7 @@
 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 {
 	int error = 0;
+	uint64_t config_cache_txg = spa->spa_config_txg;
 	nvlist_t *nvroot = NULL;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
@@ -303,7 +303,7 @@
 		goto out;
 	}
 
-	spa->spa_root_vdev = rvd;
+	ASSERT(spa->spa_root_vdev == rvd);
 	ASSERT(spa_guid(spa) == pool_guid);
 
 	/*
@@ -475,6 +475,7 @@
 	 * This must all happen in a single txg.
 	 */
 	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+		int c;
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -488,6 +489,38 @@
 		 * Wait for all claims to sync.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
+
+		/*
+		 * If the config cache is stale relative to the mosconfig,
+		 * sync the config cache.
+		 */
+		if (config_cache_txg != spa->spa_config_txg)
+			spa_config_sync();
+
+		/*
+		 * If we have top-level vdevs that were added but have
+		 * not yet been prepared for allocation, do that now.
+		 * (It's safe now because the config cache is up to date,
+		 * so it will be able to translate the new DVAs.)
+		 * See comments in spa_vdev_add() for full details.
+		 */
+		for (c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *tvd = rvd->vdev_child[c];
+			if (tvd->vdev_ms_array == 0) {
+				uint64_t txg = spa_last_synced_txg(spa) + 1;
+				ASSERT(tvd->vdev_ms_shift == 0);
+				spa_config_enter(spa, RW_WRITER, FTAG);
+				vdev_init(tvd, txg);
+				vdev_config_dirty(tvd);
+				spa_config_set(spa,
+				    spa_config_generate(spa, rvd, txg, 0));
+				spa_config_exit(spa, FTAG);
+				txg_wait_synced(spa->spa_dsl_pool, txg);
+				ASSERT(tvd->vdev_ms_shift != 0);
+				ASSERT(tvd->vdev_ms_array != 0);
+				spa_config_sync();
+			}
+		}
 	}
 
 	error = 0;
@@ -1035,9 +1068,9 @@
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg;
-	int c, error;
+	int c, c0, children, error;
 	vdev_t *rvd = spa->spa_root_vdev;
-	vdev_t *vd;
+	vdev_t *vd, *tvd;
 
 	txg = spa_vdev_enter(spa);
 
@@ -1046,32 +1079,61 @@
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
-	if (rvd == NULL)			/* spa_create() */
-		spa->spa_root_vdev = rvd = vd;
+	if (rvd == NULL) {			/* spa_create() */
+		rvd = vd;
+		c0 = 0;
+	} else {
+		c0 = rvd->vdev_children;
+	}
+
+	ASSERT(spa->spa_root_vdev == rvd);
 
 	if ((error = vdev_create(vd, txg)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
+	children = vd->vdev_children;
+
 	/*
-	 * Transfer each top-level vdev from the temporary root
-	 * to the spa's root and initialize its metaslabs.
+	 * Transfer each new top-level vdev from vd to rvd.
 	 */
-	for (c = 0; c < vd->vdev_children; c++) {
-		vdev_t *tvd = vd->vdev_child[c];
+	for (c = 0; c < children; c++) {
+		tvd = vd->vdev_child[c];
 		if (vd != rvd) {
 			vdev_remove_child(vd, tvd);
-			tvd->vdev_id = rvd->vdev_children;
+			tvd->vdev_id = c0 + c;
 			vdev_add_child(rvd, tvd);
 		}
-		if ((error = vdev_init(tvd, txg)) != 0)
-			return (spa_vdev_exit(spa, vd, txg, error));
 		vdev_config_dirty(tvd);
 	}
 
 	/*
-	 * Update the config based on the new in-core state.
+	 * We have to be careful when adding new vdevs to an existing pool.
+	 * If other threads start allocating from these vdevs before we
+	 * sync the config cache, and we lose power, then upon reboot we may
+	 * fail to open the pool because there are DVAs that the config cache
+	 * can't translate.  Therefore, we first add the vdevs without
+	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+	 * initialize the metaslabs; and sync the config cache again.
+	 *
+	 * spa_load() checks for added-but-not-initialized vdevs, so that
+	 * if we lose power at any point in this sequence, the remaining
+	 * steps will be completed the next time we load the pool.
 	 */
-	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+	if (vd != rvd) {
+		(void) spa_vdev_exit(spa, vd, txg, 0);
+		txg = spa_vdev_enter(spa);
+		vd = NULL;
+	}
+
+	/*
+	 * Now that the config is safely on disk, we can use the new space.
+	 */
+	for (c = 0; c < children; c++) {
+		tvd = rvd->vdev_child[c0 + c];
+		ASSERT(tvd->vdev_ms_array == 0);
+		vdev_init(tvd, txg);
+		vdev_config_dirty(tvd);
+	}
 
 	return (spa_vdev_exit(spa, vd, txg, 0));
 }
@@ -1105,6 +1167,9 @@
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
+	if (!oldvd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
 	pvd = oldvd->vdev_parent;
 
 	/*
@@ -1183,10 +1248,6 @@
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
-	/*
-	 * Update the config based on the new in-core state.
-	 */
-	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
 	vdev_config_dirty(tvd);
 
 	/*
@@ -1238,6 +1299,9 @@
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
 	pvd = vd->vdev_parent;
 
 	/*
@@ -1337,11 +1401,6 @@
 	 */
 	(void) vdev_metaslab_init(tvd, txg);
 
-	/*
-	 * Update the config based on the new in-core state.
-	 */
-	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
 	vdev_config_dirty(tvd);
 
 	/*
@@ -1429,11 +1488,12 @@
 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
 
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
 	spa_strfree(vd->vdev_path);
 	vd->vdev_path = spa_strdup(newpath);
 
-	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
 	vdev_config_dirty(vd->vdev_top);
 
 	return (spa_vdev_exit(spa, NULL, txg, 0));
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Mar 09 16:31:45 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Mar 09 16:56:05 2006 -0800
@@ -439,14 +439,29 @@
 int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
-	ASSERT(txg != 0);
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t next_txg = spa_last_synced_txg(spa) + 1;
+	int config_changed = B_FALSE;
+
+	/*
+	 * Usually txg == next_txg, but spa_vdev_attach()
+	 * actually needs to wait for the open txg to sync.
+	 */
+	ASSERT(txg >= next_txg);
 
 	/*
-	 * Reassess the DTLs.  spa_scrub() looks at the DTLs without
-	 * taking the config lock at all, so keep it safe.
+	 * Reassess the DTLs.
 	 */
-	if (spa->spa_root_vdev)
-		vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+	if (rvd != NULL)
+		vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+
+	/*
+	 * Update the in-core config if it changed.
+	 */
+	if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
+		config_changed = B_TRUE;
+		spa_config_set(spa, spa_config_generate(spa, rvd, next_txg, 0));
+	}
 
 	spa_config_exit(spa, spa);
 
@@ -457,7 +472,7 @@
 	spa_scrub_restart(spa, txg);
 	spa_scrub_resume(spa);
 
-	if (vd == spa->spa_root_vdev)		/* spa_create() */
+	if (vd == rvd)				/* spa_create() */
 		return (error);
 
 	/*
@@ -474,10 +489,9 @@
 	}
 
 	/*
-	 * If we're in the middle of export or destroy, don't sync the
-	 * config -- it will do that anyway, and we deadlock if we try.
+	 * If the config changed, update the config cache.
 	 */
-	if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE)
+	if (config_changed)
 		spa_config_sync();
 
 	mutex_exit(&spa_namespace_lock);
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Thu Mar 09 16:31:45 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Thu Mar 09 16:56:05 2006 -0800
@@ -60,7 +60,7 @@
 extern int vdev_open(vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg);
-extern int vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_init(vdev_t *, uint64_t txg);
 extern void vdev_reopen(vdev_t *);
 
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
--- a/usr/src/uts/common/fs/zfs/vdev.c	Thu Mar 09 16:31:45 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Thu Mar 09 16:56:05 2006 -0800
@@ -143,7 +143,7 @@
 	int c;
 	vdev_t *mvd;
 
-	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
+	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (c = 0; c < vd->vdev_children; c++)
@@ -266,10 +266,31 @@
 {
 	vdev_t *vd;
 
-	while (guid == 0)
-		guid = spa_get_random(-1ULL);
+	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+
+	if (spa->spa_root_vdev == NULL) {
+		ASSERT(ops == &vdev_root_ops);
+		spa->spa_root_vdev = vd;
+	}
 
-	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+	if (guid == 0) {
+		if (spa->spa_root_vdev == vd) {
+			/*
+			 * The root vdev's guid will also be the pool guid,
+			 * which must be unique among all pools.
+			 */
+			while (guid == 0 || spa_guid_exists(guid, 0))
+				guid = spa_get_random(-1ULL);
+		} else {
+			/*
+			 * Any other vdev's guid must be unique within the pool.
+			 */
+			while (guid == 0 ||
+			    spa_guid_exists(spa_guid(spa), guid))
+				guid = spa_get_random(-1ULL);
+		}
+		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
@@ -297,6 +318,8 @@
 static void
 vdev_free_common(vdev_t *vd)
 {
+	spa_t *spa = vd->vdev_spa;
+
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
@@ -313,6 +336,9 @@
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_dirty_lock);
 
+	if (vd == spa->spa_root_vdev)
+		spa->spa_root_vdev = NULL;
+
 	kmem_free(vd, sizeof (vdev_t));
 }
 
@@ -596,6 +622,9 @@
 	metaslab_t **mspp = vd->vdev_ms;
 	int ret;
 
+	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
+		return (0);
+
 	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
 
 	ASSERT(oldc <= newc);
@@ -925,7 +954,7 @@
  * For creation, we want to try to create all vdevs at once and then undo it
  * if anything fails; this is much harder if we have pending transactions.
  */
-int
+void
 vdev_init(vdev_t *vd, uint64_t txg)
 {
 	/*
@@ -935,9 +964,10 @@
 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
 
 	/*
-	 * Initialize the vdev's metaslabs.
+	 * Initialize the vdev's metaslabs.  This can't fail because
+	 * there's nothing to read when creating all new metaslabs.
 	 */
-	return (vdev_metaslab_init(vd, txg));
+	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 }
 
 void
@@ -1226,15 +1256,11 @@
 	}
 
 	/*
-	 * If this is a top-level vdev, make sure its allocation parameters
-	 * exist and initialize its metaslabs.
+	 * If this is a top-level vdev, initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top) {
 
-		if (vd->vdev_ms_array == 0 ||
-		    vd->vdev_ms_shift == 0 ||
-		    vd->vdev_ashift == 0 ||
-		    vd->vdev_asize == 0) {
+		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			return (0);
@@ -1364,17 +1390,19 @@
 	txg = spa_vdev_enter(spa);
 
 	rvd = spa->spa_root_vdev;
+
 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
 	dprintf("ONLINE: %s\n", vdev_description(vd));
 
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vdev_reopen(vd->vdev_top);
 
-	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
 	vdev_config_dirty(vd->vdev_top);
 
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
@@ -1393,9 +1421,13 @@
 	txg = spa_vdev_enter(spa);
 
 	rvd = spa->spa_root_vdev;
+
 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
 	dprintf("OFFLINE: %s\n", vdev_description(vd));
 
 	/* vdev is already offlined, do nothing */
@@ -1426,12 +1458,8 @@
 	}
 
 	vd->vdev_tmpoffline = istmp;
-	if (istmp)
-		return (spa_vdev_exit(spa, NULL, txg, 0));
-
-	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
-	vdev_config_dirty(vd->vdev_top);
+	if (!istmp)
+		vdev_config_dirty(vd->vdev_top);
 
 	return (spa_vdev_exit(spa, NULL, txg, 0));
 }