changeset 19377:ce42a2836d9d

10241 ZFS not detecting faulty spares in a timely manner
12132 zfs-retire agent crashes fmd on systems without vdev devids
12034 zfs test send_encrypted_props can fail
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
author Kody A Kantor <kody@kkantor.com>
date Wed, 06 Feb 2019 19:30:08 +0000
parents 50a678a2c678
children bbdb2eb6f168
files usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c usr/src/lib/libzfs/common/libzfs_pool.c usr/src/pkg/manifests/system-test-zfstest.mf usr/src/test/zfs-tests/include/libtest.shlib usr/src/test/zfs-tests/runfiles/delphix.run usr/src/test/zfs-tests/runfiles/omnios.run usr/src/test/zfs-tests/runfiles/openindiana.run usr/src/test/zfs-tests/tests/functional/cli_root/zpool_replace/setup.ksh usr/src/test/zfs-tests/tests/functional/cli_root/zpool_replace/zpool_replace_002_neg.ksh usr/src/test/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/spa_impl.h
diffstat 15 files changed, 268 insertions(+), 31 deletions(-)
--- a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c	Wed Feb 06 19:30:08 2019 +0000
@@ -22,6 +22,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
  */
 
 #include <assert.h>
@@ -38,9 +39,9 @@
 #include <sys/fm/fs/zfs.h>
 
 /*
- * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
- * #define reserves enough space for two 64-bit hex values plus the length of
- * the longest string.
+ * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,probe}'.
+ * This #define reserves enough space for two 64-bit hex values plus the length
+ * of the longest string.
  */
 #define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
 
@@ -59,6 +60,7 @@
 	char		zc_serd_checksum[MAX_SERDLEN];
 	char		zc_serd_io[MAX_SERDLEN];
 	int		zc_has_remove_timer;
+	char		zc_serd_probe[MAX_SERDLEN];
 } zfs_case_data_t;
 
 /*
@@ -88,12 +90,16 @@
 #define	CASE_DATA_VERSION_INITIAL	1
 #define	CASE_DATA_VERSION_SERD		2
 
+/* Buffer size for the maximum uint64 rendered as a decimal string + NUL. */
+#define	MAX_ULL_STR 21
+
 typedef struct zfs_de_stats {
 	fmd_stat_t	old_drops;
 	fmd_stat_t	dev_drops;
 	fmd_stat_t	vdev_drops;
 	fmd_stat_t	import_drops;
 	fmd_stat_t	resource_drops;
+	fmd_stat_t	pool_drops;
 } zfs_de_stats_t;
 
 zfs_de_stats_t zfs_stats = {
@@ -101,7 +107,8 @@
 	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
 	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
 	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
-	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
+	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" },
+	{ "pool_drops", FMD_TYPE_UINT64, "ereports dropped (pool iter failed)"},
 };
 
 static hrtime_t zfs_remove_timeout;
@@ -279,6 +286,29 @@
 	return (0);
 }
 
+/*
+ * Find a pool with a matching GUID.
+ */
+typedef struct find_cbdata {
+	uint64_t	cb_guid;
+	zpool_handle_t	*cb_zhp;
+} find_cbdata_t;
+
+static int
+find_pool(zpool_handle_t *zhp, void *data)
+{
+	find_cbdata_t *cbp = data;
+
+	if (cbp->cb_guid ==
+	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
+		cbp->cb_zhp = zhp;
+		return (0);
+	}
+
+	zpool_close(zhp);
+	return (0);
+}
+
 struct load_time_arg {
 	uint64_t lt_guid;
 	er_timeval_t *lt_time;
@@ -370,8 +400,8 @@
 }
 
 /*
- * Construct the name of a serd engine given the pool/vdev GUID and type (io or
- * checksum).
+ * Construct the name of a serd engine given the pool/vdev GUID and type (io,
+ * checksum, or probe).
  */
 static void
 zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
@@ -527,6 +557,9 @@
 zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 {
 	zfs_case_t *zcp, *dcp;
+	libzfs_handle_t *zhdl;
+	zpool_handle_t *zhp;
+
 	int32_t pool_state;
 	uint64_t ena, pool_guid, vdev_guid;
 	er_timeval_t pool_load;
@@ -534,7 +567,10 @@
 	nvlist_t *detector;
 	boolean_t pool_found = B_FALSE;
 	boolean_t isresource;
-	char *fru, *type;
+	boolean_t is_inactive_spare = B_FALSE, islog, iscache;
+	nvlist_t *vd_nvl = NULL;
+	char *fru, *type, *vdg;
+	find_cbdata_t cb;
 
 	/*
 	 * We subscribe to notifications for vdev or pool removal.  In these
@@ -627,7 +663,8 @@
 			pool_found = B_TRUE;
 			pool_load = zcp->zc_when;
 		}
-		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
+		if (zcp->zc_data.zc_vdev_guid == vdev_guid &&
+		    zcp->zc_data.zc_pool_guid == pool_guid)
 			break;
 	}
 
@@ -775,6 +812,8 @@
 			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
 				fmd_serd_reset(hdl,
 				    zcp->zc_data.zc_serd_checksum);
+			if (zcp->zc_data.zc_serd_probe[0] != '\0')
+				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_probe);
 		}
 		zfs_stats.resource_drops.fmds_value.ui64++;
 		return;
@@ -791,12 +830,48 @@
 	if (fmd_case_solved(hdl, zcp->zc_case))
 		return;
 
+	zhdl = fmd_hdl_getspecific(hdl);
+
+	/*
+	 * Find the corresponding pool.
+	 */
+	cb.cb_guid = pool_guid;
+	cb.cb_zhp = NULL;
+	if (zhdl != NULL && zpool_iter(zhdl, find_pool, &cb) != 0) {
+		zfs_stats.pool_drops.fmds_value.ui64++;
+		return;
+	}
+
+	zhp = cb.cb_zhp; /* NULL if pool was not found. */
+	if (zhp != NULL) {
+		/*
+		 * The libzfs API takes a string representation of a base-10
+		 * guid here instead of a number, likely because the primary
+		 * libzfs consumers are the CLI tools.
+		 */
+		vdg = fmd_hdl_zalloc(hdl, MAX_ULL_STR, FMD_SLEEP);
+		(void) snprintf(vdg, MAX_ULL_STR, "%" PRIu64, vdev_guid);
+
+		/*
+		 * According to libzfs the 'spare' bit is set when the spare is
+		 * unused, and unset when in use.
+		 *
+		 * We don't really care about the returned nvlist. We're only
+		 * interested in the boolean flags.
+		 */
+		if ((vd_nvl = zpool_find_vdev(zhp, vdg,
+		    &is_inactive_spare, &islog, &iscache)) != NULL) {
+			nvlist_free(vd_nvl);
+		}
+		fmd_hdl_free(hdl, vdg, MAX_ULL_STR);
+	}
+
 	/*
 	 * Determine if we should solve the case and generate a fault.  We solve
 	 * a case if:
 	 *
-	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
-	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
+	 *	a. A pool failed to open (ereport.fs.zfs.pool)
+	 *	b. A device failed to open (ereport.fs.zfs.pool) while a pool
 	 *	   was up and running.
 	 *
 	 * We may see a series of ereports associated with a pool open, all
@@ -843,8 +918,8 @@
 		boolean_t checkremove = B_FALSE;
 
 		/*
-		 * If this is a checksum or I/O error, then toss it into the
-		 * appropriate SERD engine and check to see if it has fired.
+		 * If this is a checksum, I/O, or probe error, then toss it into
+		 * the appropriate SERD engine and check to see if it has fired.
 		 * Ideally, we want to do something more sophisticated,
 		 * (persistent errors for a single data block, etc).  For now,
 		 * a single SERD engine is sufficient.
@@ -894,7 +969,24 @@
 			}
 		} else if (fmd_nvl_class_match(hdl, nvl,
 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
-			checkremove = B_TRUE;
+			if (zcp->zc_data.zc_serd_probe[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_probe,
+				    pool_guid, vdev_guid, "probe");
+				fmd_serd_create(hdl, zcp->zc_data.zc_serd_probe,
+				    fmd_prop_get_int32(hdl, "probe_N"),
+				    fmd_prop_get_int64(hdl, "probe_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+
+			/*
+			 * We only want to wait for SERD triggers for spare
+			 * vdevs. Normal pool vdevs should be diagnosed
+			 * immediately if a probe failure is received.
+			 */
+			if (!is_inactive_spare || fmd_serd_record(hdl,
+			    zcp->zc_data.zc_serd_probe, ep)) {
+				checkremove = B_TRUE;
+			}
 		}
 
 		/*
@@ -938,6 +1030,8 @@
 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
 	if (zcp->zc_data.zc_serd_io[0] != '\0')
 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+	if (zcp->zc_data.zc_serd_probe[0] != '\0')
+		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_probe);
 	if (zcp->zc_data.zc_has_remove_timer)
 		fmd_timer_remove(hdl, zcp->zc_remove_timer);
 	uu_list_remove(zfs_cases, zcp);
@@ -967,6 +1061,8 @@
 	{ "checksum_T", FMD_TYPE_TIME, "10min" },
 	{ "io_N", FMD_TYPE_UINT32, "10" },
 	{ "io_T", FMD_TYPE_TIME, "10min" },
+	{ "probe_N", FMD_TYPE_UINT32, "5" },
+	{ "probe_T", FMD_TYPE_TIME, "24hour" },
 	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
 	{ NULL, 0, NULL }
 };
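
Note (illustrative only, not part of this changeset): a minimal standalone sketch of
the SERD engine naming scheme described above, 'zfs_<pool_guid>_<vdev_guid>_probe'.
The exact format string of zfs_serd_name() is not shown in the hunks, so rendering
both GUIDs as unpadded hex is an assumption inferred from the MAX_SERDLEN sizing
(two 16-character 64-bit hex values).

#include <stdio.h>
#include <inttypes.h>

#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))

/* Hypothetical stand-in for zfs_serd_name(); the format string is assumed. */
static void
serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%" PRIx64 "_%" PRIx64 "_%s",
	    pool_guid, vdev_guid, type);
}

int
main(void)
{
	char name[MAX_SERDLEN];

	serd_name(name, 0xdeadbeefULL, 0xcafef00dULL, "probe");
	(void) printf("%s\n", name);	/* prints zfs_deadbeef_cafef00d_probe */
	return (0);
}
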
--- a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c	Wed Feb 06 19:30:08 2019 +0000
@@ -124,13 +124,21 @@
 	}
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
-	    &child, &children) != 0)
-		return (NULL);
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if ((ret = find_vdev(zhdl, child[c], search_fru,
+			    search_guid)) != NULL)
+				return (ret);
+		}
+	}
 
-	for (c = 0; c < children; c++) {
-		if ((ret = find_vdev(zhdl, child[c], search_fru,
-		    search_guid)) != NULL)
-			return (ret);
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if ((ret = find_vdev(zhdl, child[c], search_fru,
+			    search_guid)) != NULL)
+				return (ret);
+		}
 	}
 
 	return (NULL);
@@ -227,6 +235,8 @@
 	char *dev_name;
 	zprop_source_t source;
 	int ashift;
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+	libzfs_handle_t *zhdl = zdp->zrd_hdl;
 
 	config = zpool_get_config(zhp, NULL);
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
@@ -250,7 +260,7 @@
 	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT);
 
-	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+	dev_name = zpool_vdev_name(zhdl, zhp, vdev, B_FALSE);
 
 	/*
 	 * Try to replace each spare, ending when we successfully
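
Note (illustrative only, not part of this changeset): a minimal standalone sketch of
the nvlist pattern the find_vdev() change relies on. A vdev config carries its hot
spares in a ZPOOL_CONFIG_SPARES nvlist array, and each entry can be matched by its
ZPOOL_CONFIG_GUID. The toy config built here is hypothetical; error checking is
omitted for brevity.

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>	/* ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_GUID */

int
main(void)
{
	nvlist_t *root, *spare, **spares;
	uint_t nspares;
	uint64_t guid;

	/* Build a toy vdev tree containing a single hot spare (guid 42). */
	(void) nvlist_alloc(&spare, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(spare, ZPOOL_CONFIG_GUID, 42);
	(void) nvlist_alloc(&root, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_nvlist_array(root, ZPOOL_CONFIG_SPARES, &spare, 1);

	/* Walk the spares array the way the retire agent now does. */
	if (nvlist_lookup_nvlist_array(root, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (uint_t c = 0; c < nspares; c++) {
			if (nvlist_lookup_uint64(spares[c], ZPOOL_CONFIG_GUID,
			    &guid) == 0)
				(void) printf("spare guid: %llu\n",
				    (u_longlong_t)guid);
		}
	}

	nvlist_free(spare);
	nvlist_free(root);
	return (0);
}
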
--- a/usr/src/lib/libzfs/common/libzfs_pool.c	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c	Wed Feb 06 19:30:08 2019 +0000
@@ -3046,7 +3046,7 @@
 	zfs_cmd_t zc = { 0 };
 	char msg[1024];
 	int ret;
-	nvlist_t *tgt;
+	nvlist_t *tgt, *newvd;
 	boolean_t avail_spare, l2cache, islog;
 	uint64_t val;
 	char *newname;
@@ -3090,14 +3090,14 @@
 	if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL)
 		return (-1);
 
+	newvd = zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, NULL);
 	/*
 	 * If the target is a hot spare that has been swapped in, we can only
 	 * replace it with another hot spare.
 	 */
 	if (replacing &&
 	    nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
-	    (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
-	    NULL) == NULL || !avail_spare) &&
+	    (newvd == NULL || !avail_spare) &&
 	    is_replacing_spare(config_root, tgt, 1)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "can only be replaced by another hot spare"));
@@ -3107,6 +3107,11 @@
 
 	free(newname);
 
+	if (replacing && avail_spare && !vdev_is_online(newvd)) {
+		(void) zpool_standard_error(hdl, ENXIO, msg);
+		return (-1);
+	}
+
 	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
 		return (-1);
 
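
Note (illustrative only, not part of this changeset): a minimal sketch, assuming the
libzfs headers and an imported pool, of the zpool_find_vdev() convention the hunks
above depend on: the avail_spare flag is set for an unused hot spare and clear once
the spare has been swapped in. The pool and vdev names come from the command line
and are hypothetical.

#include <stdio.h>
#include <libzfs.h>

int
main(int argc, char **argv)
{
	libzfs_handle_t *hdl;
	zpool_handle_t *zhp;
	boolean_t avail_spare = B_FALSE, l2cache = B_FALSE, islog = B_FALSE;

	if (argc != 3) {
		(void) fprintf(stderr, "usage: %s <pool> <vdev|guid>\n",
		    argv[0]);
		return (1);
	}
	if ((hdl = libzfs_init()) == NULL)
		return (1);
	if ((zhp = zpool_open(hdl, argv[1])) == NULL) {
		libzfs_fini(hdl);
		return (1);
	}

	/* The vdev may be named by device path or by its guid as a string. */
	if (zpool_find_vdev(zhp, argv[2], &avail_spare, &l2cache,
	    &islog) != NULL)
		(void) printf("avail_spare=%d l2cache=%d log=%d\n",
		    avail_spare, l2cache, islog);
	else
		(void) printf("vdev not found\n");

	zpool_close(zhp);
	libzfs_fini(hdl);
	return (0);
}
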
--- a/usr/src/pkg/manifests/system-test-zfstest.mf	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf	Wed Feb 06 19:30:08 2019 +0000
@@ -1941,6 +1941,9 @@
 file \
     path=opt/zfs-tests/tests/functional/cli_root/zpool_replace/zpool_replace_001_neg \
     mode=0555
+file \
+    path=opt/zfs-tests/tests/functional/cli_root/zpool_replace/zpool_replace_002_neg \
+    mode=0555
 file path=opt/zfs-tests/tests/functional/cli_root/zpool_resilver/cleanup \
     mode=0555
 file path=opt/zfs-tests/tests/functional/cli_root/zpool_resilver/setup \
--- a/usr/src/test/zfs-tests/include/libtest.shlib	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/test/zfs-tests/include/libtest.shlib	Wed Feb 06 19:30:08 2019 +0000
@@ -323,6 +323,13 @@
 	log_pass
 }
 
+function default_mirror_2way_setup
+{
+	default_mirror_setup_noexit $1 $2
+
+	log_pass
+}
+
 #
 # Given a pair of disks, set up a storage pool and dataset for the mirror
 # @parameters: $1 the primary side of the mirror
--- a/usr/src/test/zfs-tests/runfiles/delphix.run	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run	Wed Feb 06 19:30:08 2019 +0000
@@ -363,7 +363,8 @@
     'zpool_remove_003_pos']
 
 [/opt/zfs-tests/tests/functional/cli_root/zpool_replace]
-tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
+tests = ['zpool_replace_001_neg', 'zpool_replace_002_neg', 'replace-o_ashift',
+    'replace_prop_ashift']
 
 [/opt/zfs-tests/tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart']
--- a/usr/src/test/zfs-tests/runfiles/omnios.run	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/test/zfs-tests/runfiles/omnios.run	Wed Feb 06 19:30:08 2019 +0000
@@ -363,7 +363,8 @@
     'zpool_remove_003_pos']
 
 [/opt/zfs-tests/tests/functional/cli_root/zpool_replace]
-tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
+tests = ['zpool_replace_001_neg', 'zpool_replace_002_neg', 'replace-o_ashift',
+    'replace_prop_ashift']
 
 [/opt/zfs-tests/tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart']
--- a/usr/src/test/zfs-tests/runfiles/openindiana.run	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/test/zfs-tests/runfiles/openindiana.run	Wed Feb 06 19:30:08 2019 +0000
@@ -363,7 +363,8 @@
     'zpool_remove_003_pos']
 
 [/opt/zfs-tests/tests/functional/cli_root/zpool_replace]
-tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
+tests = ['zpool_replace_001_neg', 'zpool_replace_002_neg', 'replace-o_ashift',
+    'replace_prop_ashift']
 
 [/opt/zfs-tests/tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart']
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_replace/setup.ksh	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_replace/setup.ksh	Wed Feb 06 19:30:08 2019 +0000
@@ -25,11 +25,15 @@
 # Use is subject to license terms.
 #
 
+#
+# Copyright 2019 Joyent, Inc.
+#
+
 . $STF_SUITE/include/libtest.shlib
 
 verify_runnable "global"
-verify_disk_count "$DISKS" 2
+verify_disk_count "$DISKS" 3
 
 DISK=${DISKS%% *}
 
-default_mirror_setup $DISKS
+default_mirror_2way_setup $DISKS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_replace/zpool_replace_002_neg.ksh	Wed Feb 06 19:30:08 2019 +0000
@@ -0,0 +1,51 @@
+#!/usr/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+#
+# zpool replace returns an error when spare device is faulted.
+#
+# STRATEGY:
+# 1. Add hot spare to pool
+# 2. Fault the hot spare device
+# 3. Attempt to replace a device in a pool with the faulted spare
+# 4. Verify the 'zpool replace' command fails
+#
+
+SPARE=${DISKS##* }
+DISK=${DISKS%% *}
+
+verify_runnable "global"
+log_must zpool add $TESTPOOL spare $SPARE
+log_assert "zpool replace returns an error when the hot spare is faulted"
+
+log_must zinject -d $SPARE -A fault $TESTPOOL
+log_mustnot zpool replace $TESTPOOL $DISK $SPARE
+
+log_pass "zpool replace returns an error when the hot spare is faulted"
--- a/usr/src/test/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh	Wed Feb 06 19:30:08 2019 +0000
@@ -191,7 +191,7 @@
 log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm"
 log_must test "$(get_prop 'keyformat' $ds)" == "passphrase"
 log_must test "$(get_prop 'mounted' $ds)" == "yes"
-recv_cksum=$(md5digest /$ds/$TESTFILE0)
+recv_cksum=$(md5sum /$ds/$TESTFILE0 | awk '{ print $1 }')
 log_must test "$recv_cksum" == "$cksum"
 log_must zfs destroy -r $ds
 
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/uts/common/fs/zfs/spa.c	Wed Feb 06 19:30:08 2019 +0000
@@ -27,7 +27,7 @@
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2017 Datto Inc.
  * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
@@ -229,6 +229,13 @@
 uint64_t	zfs_max_missing_tvds_scan = 0;
 
 /*
+ * Interval in seconds at which to poll spare vdevs for health.
+ * Setting this to zero disables spare polling.
+ * Set to three hours by default.
+ */
+uint_t		spa_spare_poll_interval_seconds = 60 * 60 * 3;
+
+/*
  * Debugging aid that pauses spa_sync() towards the end.
  */
 boolean_t	zfs_pause_spa_sync = B_FALSE;
@@ -7539,6 +7546,8 @@
 	if (tasks & SPA_ASYNC_PROBE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
+		for (int i = 0; i < spa->spa_spares.sav_count; i++)
+			spa_async_probe(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
@@ -8624,6 +8633,14 @@
 
 	spa_handle_ignored_writes(spa);
 
+	/* Mark unused spares as needing a health check. */
+	if (spa_spare_poll_interval_seconds != 0 &&
+	    NSEC2SEC(gethrtime() - spa->spa_spares_last_polled) >
+	    spa_spare_poll_interval_seconds) {
+		spa_spare_poll(spa);
+		spa->spa_spares_last_polled = gethrtime();
+	}
+
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
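
Note (illustrative only, not part of this changeset): a tiny standalone sketch of the
rate-limiting arithmetic used in the spa_sync() hunk above. Elapsed monotonic time is
converted from nanoseconds to seconds (as NSEC2SEC() does) and compared against the
interval, with zero disabling polling entirely. The constants and variable names here
are local to the sketch.

#include <stdio.h>
#include <sys/time.h>	/* gethrtime(), hrtime_t, NANOSEC (illumos) */

#define	POLL_INTERVAL_SECONDS	(60 * 60 * 3)	/* 0 disables polling */

int
main(void)
{
	hrtime_t last_polled = 0;	/* stands in for spa_spares_last_polled */
	hrtime_t now = gethrtime();

	if (POLL_INTERVAL_SECONDS != 0 &&
	    (now - last_polled) / NANOSEC > POLL_INTERVAL_SECONDS) {
		last_polled = now;	/* remember when we last polled */
		(void) printf("interval elapsed: poll spares (at %lld ns)\n",
		    (long long)last_polled);
	} else {
		(void) printf("within interval: skip this pass\n");
	}
	return (0);
}
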
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Wed Feb 06 19:30:08 2019 +0000
@@ -26,6 +26,7 @@
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2017 Datto Inc.
+ * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  */
 
@@ -1045,6 +1046,41 @@
  * be completely consistent with respect to other vdev configuration changes.
  */
 
+/*
+ * Poll the spare vdevs to make sure they are not faulty.
+ *
+ * The probe operation will raise an ENXIO error and create an FM ereport if the
+ * probe fails.
+ */
+void
+spa_spare_poll(spa_t *spa)
+{
+	boolean_t async_request = B_FALSE;
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	for (int i = 0; i < spa->spa_spares.sav_count; i++) {
+		spa_aux_t search, *found;
+		vdev_t *vd = spa->spa_spares.sav_vdevs[i];
+
+		search.aux_guid = vd->vdev_guid;
+
+		mutex_enter(&spa_spare_lock);
+		found = avl_find(&spa_spare_avl, &search, NULL);
+		/* This spare is in use by a pool. */
+		if (found != NULL && found->aux_pool != 0) {
+			mutex_exit(&spa_spare_lock);
+			continue;
+		}
+		mutex_exit(&spa_spare_lock);
+
+		vd->vdev_probe_wanted = B_TRUE;
+		async_request = B_TRUE;
+	}
+	if (async_request)
+		spa_async_request(spa, SPA_ASYNC_PROBE);
+
+	spa_config_exit(spa, SCL_STATE, FTAG);
+}
+
 static int
 spa_spare_compare(const void *a, const void *b)
 {
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Wed Feb 06 19:30:08 2019 +0000
@@ -25,7 +25,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -819,6 +819,9 @@
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);
 
+/* spare polling */
+extern void spa_spare_poll(spa_t *spa);
+
 /* L2ARC state (which is global across all pools) */
 extern void spa_l2cache_add(vdev_t *vd);
 extern void spa_l2cache_remove(vdev_t *vd);
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Thu Jan 09 10:59:44 2020 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Wed Feb 06 19:30:08 2019 +0000
@@ -26,6 +26,7 @@
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2017, Intel Corporation.
+ * Copyright 2019 Joyent, Inc.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -254,6 +255,7 @@
 
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
+	hrtime_t	spa_spares_last_polled;	/* time spares last polled */
 	nvlist_t	*spa_label_features;	/* Features for reading MOS */
 	uint64_t	spa_config_object;	/* MOS object for pool config */
 	uint64_t	spa_config_generation;	/* config generation number */