diff usr/src/uts/common/fs/zfs/spa.c @ 3377:a2fa338530c1

6393525 vdev_reopen() should verify that it's still the same device
6414648 zfs allows overlapping devices to be added
6435943 assertion failed: spare != 0L, file: ../../common/fs/zfs/spa_misc.c
6436000 import of actively spared device returns EBUSY
6478316 nfs/server doesn't respect auto_enable setting
6483675 want a private property to return number of clones
6485728 zpool iostat should flush output periodically
6494072 A device which was set as spare disk is not detach
6497563 zfs double-spared an already-spared disk on reboot
6503724 adding spare that is in use in another pool should be allowed
6505225 zpool(1M) can give misleading error when removing active spare
author eschrock
date Mon, 08 Jan 2007 11:15:07 -0800
parents 256464cbb73c
children 5340a4d98e0b
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Jan 08 02:45:56 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Jan 08 11:15:07 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -301,14 +301,22 @@
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
+	vdev_t *vd, *tvd;
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	for (i = 0; i < spa->spa_nspares; i++) {
-		vdev_close(spa->spa_spares[i]);
-		vdev_free(spa->spa_spares[i]);
+		vd = spa->spa_spares[i];
+
+		/* Undo the call to spa_activate() below */
+		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
+		    tvd->vdev_isspare)
+			spa_spare_remove(tvd);
+		vdev_close(vd);
+		vdev_free(vd);
 	}
+
 	if (spa->spa_spares)
 		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
 
@@ -326,18 +334,42 @@
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
-	 * process.
+	 * process.  For each spare, there are potentially two different vdev_t
+	 * structures associated with it: one in the list of spares (used only
+	 * for basic validation purposes) and one in the active vdev
+	 * configuration (if it's spared in).  During this phase we open and
+	 * validate each vdev on the spare list.  If the vdev also exists in the
+	 * active configuration, then we also mark this vdev as an active spare.
 	 */
 	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < spa->spa_nspares; i++) {
-		vdev_t *vd;
-
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares[i] = vd;
 
+		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
+			if (!tvd->vdev_isspare)
+				spa_spare_add(tvd);
+
+			/*
+			 * We only mark the spare active if we were successfully
+			 * able to load the vdev.  Otherwise, importing a pool
+			 * with a bad active spare would result in strange
+			 * behavior, because multiple pools would think the spare
+			 * is actively in use.
+			 *
+			 * There is a vulnerability here to an equally bizarre
+			 * circumstance, where a dead active spare is later
+			 * brought back to life (onlined or otherwise).  Given
+			 * the rarity of this scenario, and the extra complexity
+			 * it adds, we ignore the possibility.
+			 */
+			if (!vdev_is_dead(tvd))
+				spa_spare_activate(tvd);
+		}
+
 		if (vdev_open(vd) != 0)
 			continue;
 
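
A note on the bookkeeping above: each hot spare can be backed by two vdev_t structures at once, one hanging off spa_spares[] (used only for validation and status reporting) and one in the active vdev tree when the spare has actually been spared in. The spa_spare_add(), spa_spare_remove(), spa_spare_activate(), and spa_spare_exists() calls maintain a guid-keyed registry shared by all pools, so any pool can ask whether a given spare is merely listed somewhere or actively in use. Those routines live in spa_misc.c and are not part of this diff; the sketch below is only a minimal model of their apparent semantics, with the type and names (spare_entry_t, spare_find, and friends) invented for illustration.

    /* Illustration only; the real implementation lives in spa_misc.c, not shown here. */
    #include <stdint.h>
    #include <stdlib.h>

    typedef struct spare_entry {
    	uint64_t		se_guid;	/* device guid of the spare */
    	uint64_t		se_pool;	/* pool actively using it, or 0 if idle */
    	struct spare_entry	*se_next;
    } spare_entry_t;

    static spare_entry_t *spare_list;

    static spare_entry_t *
    spare_find(uint64_t guid)
    {
    	spare_entry_t *se;

    	for (se = spare_list; se != NULL; se = se->se_next)
    		if (se->se_guid == guid)
    			return (se);
    	return (NULL);
    }

    void
    spare_add(uint64_t guid)			/* device becomes a known spare */
    {
    	spare_entry_t *se = calloc(1, sizeof (*se));

    	se->se_guid = guid;			/* se_pool stays 0: not in use yet */
    	se->se_next = spare_list;
    	spare_list = se;
    }

    void
    spare_activate(uint64_t guid, uint64_t pool)	/* a pool has spared it in */
    {
    	spare_entry_t *se = spare_find(guid);

    	if (se != NULL)
    		se->se_pool = pool;
    }

    int
    spare_exists(uint64_t guid, uint64_t *pool)	/* known spare? who is using it? */
    {
    	spare_entry_t *se = spare_find(guid);

    	if (se == NULL)
    		return (0);
    	*pool = se->se_pool;
    	return (1);
    }

With a model like this, the check in the next hunk reads naturally: a spare is reported as shared only when the lookup succeeds and the owning pool guid is non-zero.
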
@@ -867,6 +899,7 @@
 	uint64_t guid;
 	vdev_stat_t *vs;
 	uint_t vsc;
+	uint64_t pool;
 
 	if (spa->spa_nspares == 0)
 		return;
@@ -889,7 +922,7 @@
 		for (i = 0; i < nspares; i++) {
 			VERIFY(nvlist_lookup_uint64(spares[i],
 			    ZPOOL_CONFIG_GUID, &guid) == 0);
-			if (spa_spare_inuse(guid)) {
+			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
 				VERIFY(nvlist_lookup_uint64_array(
 				    spares[i], ZPOOL_CONFIG_STATS,
 				    (uint64_t **)&vs, &vsc) == 0);
@@ -943,7 +976,9 @@
 
 /*
  * Validate that the 'spares' array is well formed.  We must have an array of
- * nvlists, each which describes a valid leaf vdev.
+ * nvlists, each of which describes a valid leaf vdev.  If this is an import (mode
+ * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
+ * as they are well-formed.
  */
 static int
 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
@@ -970,34 +1005,45 @@
 	if (spa_version(spa) < ZFS_VERSION_SPARES)
 		return (ENOTSUP);
 
+	/*
+	 * Set the pending spare list so we correctly handle device in-use
+	 * checking.
+	 */
+	spa->spa_pending_spares = spares;
+	spa->spa_pending_nspares = nspares;
+
 	for (i = 0; i < nspares; i++) {
 		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    mode)) != 0)
-			return (error);
+			goto out;
 
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
-			return (EINVAL);
-		}
-
-		if ((error = vdev_open(vd)) != 0) {
-			vdev_free(vd);
-			return (error);
+			error = EINVAL;
+			goto out;
 		}
 
 		vd->vdev_top = vd;
-		if ((error = vdev_label_spare(vd, crtxg)) != 0) {
-			vdev_free(vd);
-			return (error);
+
+		if ((error = vdev_open(vd)) == 0 &&
+		    (error = vdev_label_init(vd, crtxg,
+		    VDEV_LABEL_SPARE)) == 0) {
+			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
+			    vd->vdev_guid) == 0);
 		}
 
-		VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
-		    vd->vdev_guid) == 0);
-
 		vdev_free(vd);
+
+		if (error && mode != VDEV_ALLOC_SPARE)
+			goto out;
+		else
+			error = 0;
 	}
 
-	return (0);
+out:
+	spa->spa_pending_spares = NULL;
+	spa->spa_pending_nspares = 0;
+	return (error);
 }
 
 /*
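
spa_validate_spares() now follows a pattern worth calling out: it publishes the candidate spares through spa_pending_spares before doing any per-device work, funnels every failure through the single 'out' label so the pending pointers are always cleared, and in import mode (VDEV_ALLOC_SPARE) tolerates spares that fail to open or label rather than failing the whole operation. The same publish-then-clear discipline appears again below with spa_pending_vdev in the add path. The fragment below is a self-contained sketch of that shape only; the ctx structure, validate_all(), and the callback are invented for this example and are not ZFS interfaces.

    #include <stddef.h>

    struct ctx {
    	void	**pending;	/* devices visible to concurrent in-use checks */
    	int	npending;
    };

    int
    validate_all(struct ctx *c, void **items, int n, int import_mode,
        int (*validate_one)(void *))
    {
    	int i, error = 0;

    	c->pending = items;		/* publish before any work is done */
    	c->npending = n;

    	for (i = 0; i < n; i++) {
    		if ((error = validate_one(items[i])) != 0) {
    			if (!import_mode)
    				goto out;	/* adding: any failure aborts */
    			error = 0;		/* importing: skip the bad entry */
    		}
    	}
    out:
    	c->pending = NULL;		/* cleared on every path, success or failure */
    	c->npending = 0;
    	return (error);
    }
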
@@ -1455,33 +1501,47 @@
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
-	if ((error = spa_validate_spares(spa, nvroot, txg,
-	    VDEV_ALLOC_ADD)) != 0)
-		return (spa_vdev_exit(spa, vd, txg, error));
+	spa->spa_pending_vdev = vd;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) != 0)
 		nspares = 0;
 
-	if (vd->vdev_children == 0 && nspares == 0)
+	if (vd->vdev_children == 0 && nspares == 0) {
+		spa->spa_pending_vdev = NULL;
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
+	}
 
 	if (vd->vdev_children != 0) {
-		if ((error = vdev_create(vd, txg, B_FALSE)) != 0)
+		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
+			spa->spa_pending_vdev = NULL;
 			return (spa_vdev_exit(spa, vd, txg, error));
-
-		/*
-		 * Transfer each new top-level vdev from vd to rvd.
-		 */
-		for (c = 0; c < vd->vdev_children; c++) {
-			tvd = vd->vdev_child[c];
-			vdev_remove_child(vd, tvd);
-			tvd->vdev_id = rvd->vdev_children;
-			vdev_add_child(rvd, tvd);
-			vdev_config_dirty(tvd);
 		}
 	}
 
+	/*
+	 * We must validate the spares after checking the children.  Otherwise,
+	 * vdev_inuse() will blindly overwrite the spare.
+	 */
+	if ((error = spa_validate_spares(spa, nvroot, txg,
+	    VDEV_ALLOC_ADD)) != 0) {
+		spa->spa_pending_vdev = NULL;
+		return (spa_vdev_exit(spa, vd, txg, error));
+	}
+
+	spa->spa_pending_vdev = NULL;
+
+	/*
+	 * Transfer each new top-level vdev from vd to rvd.
+	 */
+	for (c = 0; c < vd->vdev_children; c++) {
+		tvd = vd->vdev_child[c];
+		vdev_remove_child(vd, tvd);
+		tvd->vdev_id = rvd->vdev_children;
+		vdev_add_child(rvd, tvd);
+		vdev_config_dirty(tvd);
+	}
+
 	if (nspares != 0) {
 		if (spa->spa_sparelist != NULL) {
 			nvlist_t **oldspares;
@@ -1613,10 +1673,16 @@
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
-		 * want to create a replacing vdev.
+		 * want to create a replacing vdev.  The user is not allowed to
+		 * attach to a spared vdev child unless the 'isspare' state is
+		 * the same (spare replaces spare, non-spare replaces
+		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+		else if (pvd->vdev_ops == &vdev_spare_ops &&
+		    newvd->vdev_isspare != oldvd->vdev_isspare)
+			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		else if (pvd->vdev_ops != &vdev_spare_ops &&
 		    newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
@@ -1695,7 +1761,8 @@
 	    open_txg - TXG_INITIAL + 1);
 	mutex_exit(&newvd->vdev_dtl_lock);
 
-	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);
+	if (newvd->vdev_isspare)
+		spa_spare_activate(newvd);
 
 	/*
 	 * Mark newvd's DTL dirty in this txg.
@@ -1818,9 +1885,7 @@
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 */
-	error = vdev_label_init(vd, 0, B_FALSE);
-	if (error)
-		dprintf("unable to erase labels on %s\n", vdev_description(vd));
+	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
@@ -1841,8 +1906,7 @@
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
-		spa_spare_remove(cvd->vdev_guid);
-		cvd->vdev_isspare = B_FALSE;
+		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 	}
 
@@ -1861,39 +1925,37 @@
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
-	 * Reopen this top-level vdev to reassess health after detach.
+	 * Reevaluate the parent vdev state.
 	 */
-	vdev_reopen(tvd);
+	vdev_propagate_state(cvd->vdev_parent);
 
 	/*
-	 * If the device we just detached was smaller than the others,
-	 * it may be possible to add metaslabs (i.e. grow the pool).
-	 * vdev_metaslab_init() can't fail because the existing metaslabs
-	 * are already in core, so there's nothing to read from disk.
+	 * If the device we just detached was smaller than the others, it may be
+	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
+	 * can't fail because the existing metaslabs are already in core, so
+	 * there's nothing to read from disk.
 	 */
 	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
 
 	vdev_config_dirty(tvd);
 
 	/*
-	 * Mark vd's DTL as dirty in this txg.
-	 * vdev_dtl_sync() will see that vd->vdev_detached is set
-	 * and free vd's DTL object in syncing context.
-	 * But first make sure we're not on any *other* txg's DTL list,
-	 * to prevent vd from being accessed after it's freed.
+	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
+	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
+	 * But first make sure we're not on any *other* txg's DTL list, to
+	 * prevent vd from being accessed after it's freed.
 	 */
 	for (t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
-	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);
-
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	/*
-	 * If we are supposed to remove the given vdev from the list of spares,
-	 * iterate over all pools in the system and replace it if it's present.
+	 * If this was the removal of the original device in a hot spare vdev,
+	 * then we want to go through and remove the device from the hot spare
+	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa = NULL;
@@ -3021,10 +3083,18 @@
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	int i;
+	uint64_t spareguid;
 
 	for (i = 0; i < spa->spa_nspares; i++)
 		if (spa->spa_spares[i]->vdev_guid == guid)
 			return (B_TRUE);
 
+	for (i = 0; i < spa->spa_pending_nspares; i++) {
+		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
+		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
+		    spareguid == guid)
+			return (B_TRUE);
+	}
+
 	return (B_FALSE);
 }
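
spa_has_spare() now answers "does this pool claim this guid as a spare" from two sources: the committed spa_spares array and the spa_pending_spares list published by spa_validate_spares() above. Including the pending list means device in-use checking (the stated purpose of the pending list in the earlier hunk) also sees spares that belong to a request still being validated, not just those already committed to the configuration. Below is a minimal standalone model of the two-phase lookup, with the struct and names invented for this example.

    #include <stdbool.h>
    #include <stdint.h>

    struct pool_spares {
    	const uint64_t	*active;	/* guids of committed spares */
    	int		nactive;
    	const uint64_t	*pending;	/* guids from a validation in progress */
    	int		npending;
    };

    bool
    pool_has_spare(const struct pool_spares *ps, uint64_t guid)
    {
    	int i;

    	for (i = 0; i < ps->nactive; i++)
    		if (ps->active[i] == guid)
    			return (true);
    	for (i = 0; i < ps->npending; i++)
    		if (ps->pending[i] == guid)
    			return (true);
    	return (false);
    }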