diff usr/src/uts/common/fs/zfs/spa.c @ 4451:24fbf2d7a5d7

PSARC 2007/197 ZFS hotplug
PSARC 2007/283 FMA for ZFS Phase 2
6401126 ZFS DE should verify that diagnosis is still valid before solving cases
6500545 ZFS does not handle changes in devids
6508521 zpool online should warn when it is being used incorrectly
6509807 ZFS checksum ereports are not being posted
6514712 zfs_nicenum() doesn't work with perfectly-sized buffers
6520510 media state doesn't get updated properly on device removal
6520513 ZFS should have better support for device removal
6520514 vdev state should be controlled through a single ioctl()
6520519 ZFS should diagnose faulty devices
6520947 ZFS DE should close cases which no longer apply
6521393 ZFS case timeout should be FMD_TYPE_TIME
6521624 fmd_hash_walk() can dump core when given a bad address
6521946 ZFS DE needlessly subscribes to faults
6522085 ZFS dictionary files contain spelling errors
6523185 vdev_reopen() doesn't correctly propagate state
6523555 'zpool online' should be less chatty unless something goes wrong
6527379 zpool(1M) should not try to open faulted devices
6527700 ZFS should post a sysevent when topology changes
6528194 lofi should support force unmap and DKIO_DEV_GONE
6528732 ZFS should store physical device path in addition to /dev path
6532635 ZFS keeps devices open unnecessarily
6532979 bad argument to ZFS_IOC_VDEV_ATTACH can panic system
6567983 deadlock with spa_scrub_thread() and spa_namespace_lock
author eschrock
date Tue, 12 Jun 2007 13:18:17 -0700
parents 3dfde0f4662d
children 5d5b6ba91b17
--- a/usr/src/uts/common/fs/zfs/spa.c	Tue Jun 12 11:51:07 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Tue Jun 12 13:18:17 2007 -0700
@@ -424,6 +424,24 @@
 }
 
 /*
+ * Checks to see if the given vdev could not be opened, in which case we post a
+ * sysevent to notify the autoreplace code that the device has been removed.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		spa_check_removed(vd->vdev_child[c]);
+
+	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
+		zfs_post_autoreplace(vd->vdev_spa, vd);
+		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
+	}
+}
+
+/*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
@@ -438,6 +456,7 @@
 	uint64_t pool_guid;
 	uint64_t version;
 	zio_t *zio;
+	uint64_t autoreplace = 0;
 
 	spa->spa_load_state = state;
 
@@ -711,11 +730,25 @@
 	if (error == 0) {
 		(void) zap_lookup(spa->spa_meta_objset,
 		    spa->spa_pool_props_object,
-		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
+		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
 		    sizeof (uint64_t), 1, &spa->spa_bootfs);
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
+		    sizeof (uint64_t), 1, &autoreplace);
 	}
 
 	/*
+	 * If the 'autoreplace' property is set, then post a resource notifying
+	 * the ZFS DE that it should not issue any faults for unopenable
+	 * devices.  We also iterate over the vdevs, and post a sysevent for any
+	 * unopenable vdevs so that the normal autoreplace handler can take
+	 * over.
+	 */
+	if (autoreplace)
+		spa_check_removed(spa->spa_root_vdev);
+
+	/*
 	 * Load the vdev state for all toplevel vdevs.
 	 */
 	vdev_load(rvd);
@@ -795,7 +828,7 @@
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
- * POOL_STATE_UNITIALIZED state.
+ * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
@@ -879,6 +912,13 @@
 	}
 
 	spa_open_ref(spa, tag);
+
+	/*
+	 * If we just loaded the pool, resilver anything that's out of date.
+	 */
+	if (loaded && (spa_mode & FWRITE))
+		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
 	if (locked)
 		mutex_exit(&spa_namespace_lock);
 
@@ -890,12 +930,6 @@
 		spa_config_exit(spa, FTAG);
 	}
 
-	/*
-	 * If we just loaded the pool, resilver anything that's out of date.
-	 */
-	if (loaded && (spa_mode & FWRITE))
-		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
 	return (0);
 }
 
@@ -1219,7 +1253,7 @@
 
 	dmu_tx_commit(tx);
 
-	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
+	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(spa->spa_dsl_pool);
 
@@ -1325,14 +1359,14 @@
 	 */
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
-	mutex_exit(&spa_namespace_lock);
-
 	/*
 	 * Resilver anything that's out of date.
 	 */
 	if (spa_mode & FWRITE)
 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
 
+	mutex_exit(&spa_namespace_lock);
+
 	return (0);
 }
 
@@ -1476,6 +1510,8 @@
 		}
 	}
 
+	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
+
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
@@ -1657,7 +1693,7 @@
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
- * mirror using the 'replacing' vdev, which is functionally idendical to
+ * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
@@ -1685,7 +1721,10 @@
 	pvd = oldvd->vdev_parent;
 
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
-	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
+	    VDEV_ALLOC_ADD)) != 0)
+		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
@@ -1818,9 +1857,12 @@
 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
 
 	/*
-	 * Kick off a resilver to update newvd.
+	 * Kick off a resilver to update newvd.  We need to grab the namespace
+	 * lock because spa_scrub() needs to post a sysevent with the pool name.
 	 */
+	mutex_enter(&spa_namespace_lock);
 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
@@ -1973,7 +2015,7 @@
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
-	vdev_propagate_state(cvd->vdev_parent);
+	vdev_propagate_state(cvd);
 
 	/*
 	 * If the device we just detached was smaller than the others, it may be
@@ -1996,6 +2038,8 @@
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
+	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	/*
@@ -2098,20 +2142,24 @@
 }
 
 /*
- * Find any device that's done replacing, so we can detach it.
+ * Find any device that's done replacing, or a vdev marked 'unspare' that's
+ * currently spared, so we can detach it.
  */
 static vdev_t *
-spa_vdev_replace_done_hunt(vdev_t *vd)
+spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
 	int c;
 
 	for (c = 0; c < vd->vdev_children; c++) {
-		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
+	/*
+	 * Check for a completed replacement.
+	 */
 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
 		oldvd = vd->vdev_child[0];
 		newvd = vd->vdev_child[1];
@@ -2125,11 +2173,29 @@
 		mutex_exit(&newvd->vdev_dtl_lock);
 	}
 
+	/*
+	 * Check for a completed resilver with the 'unspare' flag set.
+	 */
+	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
+		newvd = vd->vdev_child[0];
+		oldvd = vd->vdev_child[1];
+
+		mutex_enter(&newvd->vdev_dtl_lock);
+		if (newvd->vdev_unspare &&
+		    newvd->vdev_dtl_map.sm_space == 0 &&
+		    newvd->vdev_dtl_scrub.sm_space == 0) {
+			newvd->vdev_unspare = 0;
+			mutex_exit(&newvd->vdev_dtl_lock);
+			return (oldvd);
+		}
+		mutex_exit(&newvd->vdev_dtl_lock);
+	}
+
 	return (NULL);
 }
 
 static void
-spa_vdev_replace_done(spa_t *spa)
+spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd;
 	vdev_t *pvd;
@@ -2138,7 +2204,7 @@
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
-	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 		guid = vd->vdev_guid;
 		/*
 		 * If we have just finished replacing a hot spared device, then
@@ -2449,6 +2515,9 @@
 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
 	spa_errlog_rotate(spa);
 
+	if (scrub_type == POOL_SCRUB_RESILVER && complete)
+		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
+
 	spa_config_exit(spa, FTAG);
 
 	mutex_enter(&spa->spa_scrub_lock);
@@ -2457,7 +2526,7 @@
 	 * We may have finished replacing a device.
 	 * Let the async thread assess this and handle the detach.
 	 */
-	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 	/*
 	 * If we were told to restart, our final act is to start a new scrub.
@@ -2568,7 +2637,7 @@
 		 */
 		if (type == POOL_SCRUB_RESILVER) {
 			type = POOL_SCRUB_NONE;
-			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 		}
 	} else {
 		/*
@@ -2593,6 +2662,8 @@
 		mintxg = ss->ss_start - 1;
 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
 		maxtxg = MIN(ss->ss_end, maxtxg);
+
+		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
 	}
 
 	mutex_exit(&rvd->vdev_dtl_lock);
@@ -2624,29 +2695,29 @@
  */
 
 static void
-spa_async_reopen(spa_t *spa)
+spa_async_remove(spa_t *spa, vdev_t *vd)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *tvd;
 	int c;
 
-	spa_config_enter(spa, RW_WRITER, FTAG);
-
-	for (c = 0; c < rvd->vdev_children; c++) {
-		tvd = rvd->vdev_child[c];
-		if (tvd->vdev_reopen_wanted) {
-			tvd->vdev_reopen_wanted = 0;
-			vdev_reopen(tvd);
+	for (c = 0; c < vd->vdev_children; c++) {
+		tvd = vd->vdev_child[c];
+		if (tvd->vdev_remove_wanted) {
+			tvd->vdev_remove_wanted = 0;
+			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
+			    VDEV_AUX_NONE);
+			vdev_clear(spa, tvd);
+			vdev_config_dirty(tvd->vdev_top);
 		}
+		spa_async_remove(spa, tvd);
 	}
-
-	spa_config_exit(spa, FTAG);
 }
 
 static void
 spa_async_thread(spa_t *spa)
 {
 	int tasks;
+	uint64_t txg;
 
 	ASSERT(spa->spa_sync_on);
 
@@ -2665,28 +2736,40 @@
 	}
 
 	/*
-	 * See if any devices need to be reopened.
+	 * See if any devices need to be marked REMOVED.
 	 */
-	if (tasks & SPA_ASYNC_REOPEN)
-		spa_async_reopen(spa);
+	if (tasks & SPA_ASYNC_REMOVE) {
+		txg = spa_vdev_enter(spa);
+		spa_async_remove(spa, spa->spa_root_vdev);
+		(void) spa_vdev_exit(spa, NULL, txg, 0);
+	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
-	if (tasks & SPA_ASYNC_REPLACE_DONE)
-		spa_vdev_replace_done(spa);
+	if (tasks & SPA_ASYNC_RESILVER_DONE)
+		spa_vdev_resilver_done(spa);
 
 	/*
-	 * Kick off a scrub.
+	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
+	 * scrub which can become a resilver), we need to hold
+	 * spa_namespace_lock() because the sysevent we post via
+	 * spa_event_notify() needs to get the name of the pool.
 	 */
-	if (tasks & SPA_ASYNC_SCRUB)
+	if (tasks & SPA_ASYNC_SCRUB) {
+		mutex_enter(&spa_namespace_lock);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+		mutex_exit(&spa_namespace_lock);
+	}
 
 	/*
 	 * Kick off a resilver.
 	 */
-	if (tasks & SPA_ASYNC_RESILVER)
+	if (tasks & SPA_ASYNC_RESILVER) {
+		mutex_enter(&spa_namespace_lock);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+		mutex_exit(&spa_namespace_lock);
+	}
 
 	/*
 	 * Let the world know that we're done.
@@ -2810,7 +2893,7 @@
 	/*
 	 * Update the MOS nvlist describing the list of available spares.
 	 * spa_validate_spares() will have already made sure this nvlist is
-	 * valid and the vdevs are labelled appropriately.
+	 * valid and the vdevs are labeled appropriately.
 	 */
 	if (spa->spa_spares_object == 0) {
 		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
@@ -2869,6 +2952,7 @@
 	nvpair_t *nvpair;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zapobj;
+	uint64_t intval;
 
 	mutex_enter(&spa->spa_props_lock);
 	if (spa->spa_pool_props_object == 0) {
@@ -2886,14 +2970,23 @@
 	nvpair = NULL;
 	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
 		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
-		case ZFS_PROP_BOOTFS:
+		case ZPOOL_PROP_BOOTFS:
 			VERIFY(nvlist_lookup_uint64(nvp,
 			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
 			VERIFY(zap_update(mos,
 			    spa->spa_pool_props_object,
-			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
+			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1,
 			    &spa->spa_bootfs, tx) == 0);
 			break;
+
+		case ZPOOL_PROP_AUTOREPLACE:
+			VERIFY(nvlist_lookup_uint64(nvp,
+			    nvpair_name(nvpair), &intval) == 0);
+			VERIFY(zap_update(mos,
+			    spa->spa_pool_props_object,
+			    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1,
+			    &intval, tx) == 0);
+			break;
 		}
 	}
 }
@@ -3191,7 +3284,7 @@
 	zap_attribute_t za;
 	objset_t *mos = spa->spa_meta_objset;
 	zfs_source_t src;
-	zfs_prop_t prop;
+	zpool_prop_t prop;
 	nvlist_t *propval;
 	uint64_t value;
 	int err;
@@ -3215,14 +3308,14 @@
 		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		switch (za.za_integer_length) {
 		case 8:
-			if (zfs_prop_default_numeric(prop) ==
+			if (zpool_prop_default_numeric(prop) ==
 			    za.za_first_integer)
 				src = ZFS_SRC_DEFAULT;
 			else
 				src = ZFS_SRC_LOCAL;
 			value = za.za_first_integer;
 
-			if (prop == ZFS_PROP_BOOTFS) {
+			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_pool_t *dp;
 				dsl_dataset_t *ds = NULL;
 				char strval[MAXPATHLEN];
@@ -3274,7 +3367,61 @@
 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 		VERIFY(zap_remove(spa->spa_meta_objset,
 		    spa->spa_pool_props_object,
-		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
+		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 		spa->spa_bootfs = 0;
 	}
 }
+
+/*
+ * Post a sysevent corresponding to the given event.  The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
+ * filled in from the spa and (optionally) the vdev.  This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+{
+#ifdef _KERNEL
+	sysevent_t		*ev;
+	sysevent_attr_list_t	*attr = NULL;
+	sysevent_value_t	value;
+	sysevent_id_t		eid;
+
+	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
+	    SE_SLEEP);
+
+	value.value_type = SE_DATA_TYPE_STRING;
+	value.value.sv_string = spa_name(spa);
+	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
+		goto done;
+
+	value.value_type = SE_DATA_TYPE_UINT64;
+	value.value.sv_uint64 = spa_guid(spa);
+	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
+		goto done;
+
+	if (vd) {
+		value.value_type = SE_DATA_TYPE_UINT64;
+		value.value.sv_uint64 = vd->vdev_guid;
+		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
+		    SE_SLEEP) != 0)
+			goto done;
+
+		if (vd->vdev_path) {
+			value.value_type = SE_DATA_TYPE_STRING;
+			value.value.sv_string = vd->vdev_path;
+			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
+			    &value, SE_SLEEP) != 0)
+				goto done;
+		}
+	}
+
+	(void) log_sysevent(ev, SE_SLEEP, &eid);
+
+done:
+	if (attr)
+		sysevent_free_attr(attr);
+	sysevent_free(ev);
+#endif
+}
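
For context on how the EC_ZFS sysevents posted by spa_event_notify() might be observed from userland, here is a minimal consumer sketch. It is not part of this changeset: the handler name, the particular subclasses chosen, the build line, and the use of the general-purpose libsysevent(3LIB) calls (sysevent_bind_handle(), sysevent_subscribe_event(), sysevent_get_attr_list()) are illustrative assumptions; the real agents (syseventd modules and the ZFS DE) have their own plumbing. The ZFS_EV_* attribute names and ESC_ZFS_* subclasses are the ones added by this change.

/*
 * Illustrative sysevent consumer sketch (assumption, not part of this
 * changeset): subscribe to EC_ZFS events and print the pool name that
 * spa_event_notify() attaches to each event.
 * Assumed build line: cc zfs_evwatch.c -lsysevent -lnvpair
 */
#include <stdio.h>
#include <unistd.h>
#include <libsysevent.h>
#include <libnvpair.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/fs/zfs.h>

static void
zfs_event_handler(sysevent_t *ev)
{
	nvlist_t *attrs;
	char *pool;

	/* Attribute list carries ZFS_EV_POOL_NAME et al. from the kernel. */
	if (sysevent_get_attr_list(ev, &attrs) != 0)
		return;

	if (nvlist_lookup_string(attrs, ZFS_EV_POOL_NAME, &pool) == 0)
		(void) printf("%s: pool %s\n",
		    sysevent_get_subclass_name(ev), pool);

	nvlist_free(attrs);
}

int
main(void)
{
	sysevent_handle_t *handle;
	const char *subclasses[] = {
		ESC_ZFS_VDEV_REMOVE, ESC_ZFS_VDEV_CHECK,
		ESC_ZFS_RESILVER_START, ESC_ZFS_RESILVER_FINISH
	};

	if ((handle = sysevent_bind_handle(zfs_event_handler)) == NULL)
		return (1);

	if (sysevent_subscribe_event(handle, EC_ZFS, subclasses, 4) != 0) {
		sysevent_unbind_handle(handle);
		return (1);
	}

	(void) pause();		/* events arrive on the handler's thread */
	sysevent_unbind_handle(handle);
	return (0);
}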