changeset 14120:e9e346400fef

3949 ztest fault injection should avoid resilvering devices 3950 ztest: deadman fires when we're doing a scan 3951 ztest hang when running dedup test 3952 ztest: ztest_reguid test and ztest_fault_inject don't play nice together Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net>
author George Wilson <george.wilson@delphix.com>
date Wed, 07 Aug 2013 10:24:34 -0800
parents d6c78587e290
children 6f5ac5d649af
files usr/src/cmd/ztest/ztest.c usr/src/lib/libzpool/common/llib-lzpool usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/vdev_file.c
diffstat 4 files changed, 60 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/ztest/ztest.c	Mon Aug 05 15:59:45 2013 -0400
+++ b/usr/src/cmd/ztest/ztest.c	Wed Aug 07 10:24:34 2013 -0800
@@ -184,6 +184,7 @@
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
+extern uint64_t zfs_deadman_synctime;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -363,7 +364,7 @@
 	{ ztest_fault_inject,			1,	&zopt_sometimes	},
 	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
-	{ ztest_reguid,				1,	&zopt_sometimes },
+	{ ztest_reguid,				1,	&zopt_rarely	},
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
 	{ ztest_scrub,				1,	&zopt_rarely	},
 	{ ztest_spa_upgrade,			1,	&zopt_rarely	},
@@ -4754,6 +4755,14 @@
 	ASSERT(leaves >= 1);
 
 	/*
+	 * Grab the name lock as reader. There are some operations
+	 * which don't like to have their vdevs changed while
+	 * they are in progress (e.g. spa_change_guid). Those
+	 * operations will have grabbed the name lock as writer.
+	 */
+	(void) rw_rdlock(&ztest_name_lock);
+
+	/*
 	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
@@ -4782,7 +4791,14 @@
 		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
 			islog = B_TRUE;
 
-		if (vd0 != NULL && maxfaults != 1) {
+		/*
+		 * If the top-level vdev needs to be resilvered
+		 * then we only allow faults on the device that is
+		 * resilvering.
+		 */
+		if (vd0 != NULL && maxfaults != 1 &&
+		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+		    vd0->vdev_resilvering)) {
 			/*
 			 * Make vd0 explicitly claim to be unreadable,
 			 * or unwriteable, or reach behind its back
@@ -4813,6 +4829,7 @@
 
 		if (sav->sav_count == 0) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
+			(void) rw_unlock(&ztest_name_lock);
 			return;
 		}
 		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
@@ -4826,6 +4843,7 @@
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
+	(void) rw_unlock(&ztest_name_lock);
 
 	/*
 	 * If we can tolerate two or more faults, or we're dealing
@@ -5290,16 +5308,33 @@
 ztest_deadman_thread(void *arg)
 {
 	ztest_shared_t *zs = arg;
-	int grace = 300;
-	hrtime_t delta;
-
-	delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
-
-	(void) poll(NULL, 0, (int)(1000 * delta));
-
-	fatal(0, "failed to complete within %d seconds of deadline", grace);
-
-	return (NULL);
+	spa_t *spa = ztest_spa;
+	hrtime_t delta, total = 0;
+
+	for (;;) {
+		delta = (zs->zs_thread_stop - zs->zs_thread_start) /
+		    NANOSEC + zfs_deadman_synctime;
+
+		(void) poll(NULL, 0, (int)(1000 * delta));
+
+		/*
+		 * If the pool is suspended then fail immediately. Otherwise,
+		 * check to see if the pool is making any progress. If
+		 * vdev_deadman() discovers that there haven't been any recent
+		 * I/Os then it will end up aborting the tests.
+		 */
+		if (spa_suspended(spa)) {
+			fatal(0, "aborting test after %llu seconds because "
+			    "pool has transitioned to a suspended state.",
+			    zfs_deadman_synctime);
+			return (NULL);
+		}
+		vdev_deadman(spa->spa_root_vdev);
+
+		total += zfs_deadman_synctime;
+		(void) printf("ztest has been running for %lld seconds\n",
+		    total);
+	}
 }
 
 static void
@@ -6024,6 +6059,7 @@
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	dprintf_setup(&argc, argv);
+	zfs_deadman_synctime = 300;
 
 	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
 	ASSERT3S(ztest_fd_rand, >=, 0);
--- a/usr/src/lib/libzpool/common/llib-lzpool	Mon Aug 05 15:59:45 2013 -0400
+++ b/usr/src/lib/libzpool/common/llib-lzpool	Wed Aug 07 10:24:34 2013 -0800
@@ -64,3 +64,4 @@
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern boolean_t zfeature_checks_disable;
+extern uint64_t zfs_deadman_synctime;
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Aug 05 15:59:45 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/spa.c	Wed Aug 07 10:24:34 2013 -0800
@@ -757,6 +757,7 @@
 	int error;
 	uint64_t guid;
 
+	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 	guid = spa_generate_guid(NULL);
 
@@ -769,6 +770,7 @@
 	}
 
 	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
@@ -4674,7 +4676,6 @@
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
-		cvd->vdev_resilvering = B_FALSE;
 	}
 
 
@@ -5302,6 +5303,13 @@
 			return (oldvd);
 	}
 
+	if (vd->vdev_resilvering && vdev_dtl_empty(vd, DTL_MISSING) &&
+	    vdev_dtl_empty(vd, DTL_OUTAGE)) {
+		ASSERT(vd->vdev_ops->vdev_op_leaf);
+		vd->vdev_resilvering = B_FALSE;
+		vdev_config_dirty(vd->vdev_top);
+	}
+
 	/*
 	 * Check for a completed replacement.  We always consider the first
 	 * vdev in the list to be the oldest vdev, and the last one to be
--- a/usr/src/uts/common/fs/zfs/vdev_file.c	Mon Aug 05 15:59:45 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c	Wed Aug 07 10:24:34 2013 -0800
@@ -185,7 +185,6 @@
 static int
 vdev_file_io_start(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 	vdev_buf_t *vb;
@@ -224,8 +223,8 @@
 	bp->b_private = vf->vf_vnode;
 	bp->b_iodone = (int (*)())vdev_file_io_intr;
 
-	spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
-	    vdev_file_io_strategy, bp, 0, &zio->io_tqent);
+	VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
+	    TQ_SLEEP), !=, 0);
 
 	return (ZIO_PIPELINE_STOP);
 }