changeset 13869:921a99998bb4

3246 ZFS I/O deadman thread Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com> Reviewed by: Eric Schrock <eric.schrock@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org>
author George.Wilson <george.wilson@delphix.com>
date Mon, 29 Oct 2012 12:08:09 -0500
parents cba64774a360
children 387db3e6d543
files usr/src/cmd/zinject/translate.c usr/src/cmd/zinject/zinject.c usr/src/lib/libzpool/common/kernel.c usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/spa_boot.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/sys/vdev_impl.h usr/src/uts/common/fs/zfs/sys/zfs_context.h usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_queue.c usr/src/uts/common/fs/zfs/zio_inject.c usr/src/uts/intel/zfs/spa_boot.c usr/src/uts/sparc/zfs/spa_boot.c
diffstat 19 files changed, 332 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/zinject/translate.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/cmd/zinject/translate.c	Mon Oct 29 12:08:09 2012 -0500
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <libzfs.h>
@@ -472,6 +473,20 @@
 		    &record->zi_guid) == 0);
 	}
 
+	/*
+	 * Device faults can take on three different forms:
+	 * 1). delayed or hanging I/O
+	 * 2). zfs label faults
+	 * 3). generic disk faults
+	 */
+	if (record->zi_timer != 0) {
+		record->zi_cmd = ZINJECT_DELAY_IO;
+	} else if (label_type != TYPE_INVAL) {
+		record->zi_cmd = ZINJECT_LABEL_FAULT;
+	} else {
+		record->zi_cmd = ZINJECT_DEVICE_FAULT;
+	}
+
 	switch (label_type) {
 	case TYPE_LABEL_UBERBLOCK:
 		record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);
--- a/usr/src/cmd/zinject/zinject.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/cmd/zinject/zinject.c	Mon Oct 29 12:08:09 2012 -0500
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -600,7 +601,7 @@
 	}
 
 	while ((c = getopt(argc, argv,
-	    ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+	    ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
@@ -626,6 +627,15 @@
 		case 'd':
 			device = optarg;
 			break;
+		case 'D':
+			record.zi_timer = strtoull(optarg, &end, 10);
+			if (errno != 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid i/o delay "
+				    "value: '%s'\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 'e':
 			if (strcasecmp(optarg, "io") == 0) {
 				error = EIO;
@@ -690,6 +700,7 @@
 		case 'p':
 			(void) strlcpy(record.zi_func, optarg,
 			    sizeof (record.zi_func));
+			record.zi_cmd = ZINJECT_PANIC;
 			break;
 		case 'q':
 			quiet = 1;
@@ -763,13 +774,15 @@
 	argc -= optind;
 	argv += optind;
 
+	if (record.zi_duration != 0)
+		record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
 	if (cancel != NULL) {
 		/*
 		 * '-c' is invalid with any other options.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || record.zi_func[0] != '\0' ||
-		    record.zi_duration != 0) {
+		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
 			(void) fprintf(stderr, "cancel (-c) incompatible with "
 			    "any other options\n");
 			usage();
@@ -801,8 +814,7 @@
 		 * for doing injection, so handle it separately here.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || record.zi_func[0] != '\0' ||
-		    record.zi_duration != 0) {
+		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
 			(void) fprintf(stderr, "device (-d) incompatible with "
 			    "data error injection\n");
 			usage();
@@ -836,7 +848,7 @@
 
 	} else if (raw != NULL) {
 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
-		    record.zi_func[0] != '\0' || record.zi_duration != 0) {
+		    record.zi_cmd != ZINJECT_UNINITIALIZED) {
 			(void) fprintf(stderr, "raw (-b) format with "
 			    "any other options\n");
 			usage();
@@ -859,13 +871,14 @@
 			return (1);
 		}
 
+		record.zi_cmd = ZINJECT_DATA_FAULT;
 		if (translate_raw(raw, &record) != 0)
 			return (1);
 		if (!error)
 			error = EIO;
-	} else if (record.zi_func[0] != '\0') {
+	} else if (record.zi_cmd == ZINJECT_PANIC) {
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || device != NULL || record.zi_duration != 0) {
+		    level != 0 || device != NULL) {
 			(void) fprintf(stderr, "panic (-p) incompatible with "
 			    "other options\n");
 			usage();
@@ -883,7 +896,7 @@
 		if (argv[1] != NULL)
 			record.zi_type = atoi(argv[1]);
 		dataset[0] = '\0';
-	} else if (record.zi_duration != 0) {
+	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
 		if (nowrites == 0) {
 			(void) fprintf(stderr, "-s or -g meaningless "
 			    "without -I (ignore writes)\n");
@@ -937,6 +950,7 @@
 			return (1);
 		}
 
+		record.zi_cmd = ZINJECT_DATA_FAULT;
 		if (translate_record(type, argv[0], range, level, &record, pool,
 		    dataset) != 0)
 			return (1);
--- a/usr/src/lib/libzpool/common/kernel.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/lib/libzpool/common/kernel.c	Mon Oct 29 12:08:09 2012 -0500
@@ -46,6 +46,7 @@
 uint64_t physmem;
 vnode_t *rootdir = (vnode_t *)0xabcd1234;
 char hw_serial[HW_HOSTID_LEN];
+kmutex_t cpu_lock;
 vmem_t *zio_arena = NULL;
 
 struct utsname utsname = {
@@ -790,6 +791,26 @@
 	return (0);
 }
 
+/* ARGSUSED */
+cyclic_id_t
+cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
+{
+	return (1);
+}
+
+/* ARGSUSED */
+void
+cyclic_remove(cyclic_id_t id)
+{
+}
+
+/* ARGSUSED */
+int
+cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
+{
+	return (1);
+}
+
 /*
  * =========================================================================
  * kernel emulation setup & teardown
@@ -823,6 +844,8 @@
 
 	system_taskq_init();
 
+	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
+
 	spa_init(mode);
 }
 
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Mon Oct 29 12:08:09 2012 -0500
@@ -453,6 +453,9 @@
 
 extern void delay(clock_t ticks);
 
+#define	SEC_TO_TICK(sec)	((sec) * hz)
+#define	NSEC_TO_TICK(usec)	((usec) / (NANOSEC / hz))
+
 #define	gethrestime_sec() time(NULL)
 #define	gethrestime(t) \
 	do {\
@@ -572,6 +575,34 @@
 #define	ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
 	sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
 
+/*
+ * Cyclic information
+ */
+extern kmutex_t cpu_lock;
+
+typedef uintptr_t cyclic_id_t;
+typedef uint16_t cyc_level_t;
+typedef void (*cyc_func_t)(void *);
+
+#define	CY_LOW_LEVEL	0
+#define	CY_INFINITY	INT64_MAX
+#define	CYCLIC_NONE	((cyclic_id_t)0)
+
+typedef struct cyc_time {
+	hrtime_t cyt_when;
+	hrtime_t cyt_interval;
+} cyc_time_t;
+
+typedef struct cyc_handler {
+	cyc_func_t cyh_func;
+	void *cyh_arg;
+	cyc_level_t cyh_level;
+} cyc_handler_t;
+
+extern cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *);
+extern void cyclic_remove(cyclic_id_t);
+extern int cyclic_reprogram(cyclic_id_t, hrtime_t);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Oct 29 12:08:09 2012 -0500
@@ -5983,6 +5983,10 @@
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
+	spa->spa_sync_starttime = gethrtime();
+	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
+	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
+
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
@@ -6111,6 +6115,8 @@
 	}
 	dmu_tx_commit(tx);
 
+	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+
 	/*
 	 * Clear the dirty config list.
 	 */
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Oct 29 12:08:09 2012 -0500
@@ -26,6 +26,7 @@
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
+#include <sys/spa_boot.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
@@ -249,6 +250,26 @@
  */
 int zfs_recover = 0;
 
+extern int zfs_txg_synctime_ms;
+
+/*
+ * Expiration time in units of zfs_txg_synctime_ms. This value has two
+ * meanings. First it is used to determine when the spa_deadman logic
+ * should fire. By default the spa_deadman will fire if spa_sync has
+ * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * Secondly, the value determines if an I/O is considered "hung".
+ * Any I/O that has not completed in zfs_deadman_synctime is considered
+ * "hung" resulting in a system panic.
+ * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
+ */
+uint64_t zfs_deadman_synctime = 1000ULL;
+
+/*
+ * Override the zfs deadman behavior via /etc/system. By default the
+ * deadman is enabled except on VMware and sparc deployments.
+ */
+int zfs_deadman_enabled = -1;
+
 
 /*
  * ==========================================================================
@@ -418,6 +439,23 @@
 }
 
 /*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+	spa_t *spa = arg;
+
+	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+	    ++spa->spa_deadman_calls);
+	if (zfs_deadman_enabled)
+		vdev_deadman(spa->spa_root_vdev);
+}
+
+/*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
@@ -427,6 +465,8 @@
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
+	cyc_handler_t hdlr;
+	cyc_time_t when;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
@@ -458,6 +498,25 @@
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 
+	hdlr.cyh_func = spa_deadman;
+	hdlr.cyh_arg = spa;
+	hdlr.cyh_level = CY_LOW_LEVEL;
+
+	spa->spa_deadman_synctime = zfs_deadman_synctime *
+	    zfs_txg_synctime_ms * MICROSEC;
+
+	/*
+	 * This determines how often we need to check for hung I/Os after
+	 * the cyclic has already fired. Since checking for hung I/Os is
+	 * an expensive operation we don't want to check too frequently.
+	 * Instead wait for 5 synctimes before checking again.
+	 */
+	when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+	when.cyt_when = CY_INFINITY;
+	mutex_enter(&cpu_lock);
+	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
+	mutex_exit(&cpu_lock);
+
 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
 
@@ -540,6 +599,12 @@
 	nvlist_free(spa->spa_load_info);
 	spa_config_set(spa, NULL);
 
+	mutex_enter(&cpu_lock);
+	if (spa->spa_deadman_cycid != CYCLIC_NONE)
+		cyclic_remove(spa->spa_deadman_cycid);
+	mutex_exit(&cpu_lock);
+	spa->spa_deadman_cycid = CYCLIC_NONE;
+
 	refcount_destroy(&spa->spa_refcount);
 
 	spa_config_lock_destroy(spa);
@@ -1507,6 +1572,12 @@
 }
 
 uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+	return (spa->spa_deadman_synctime);
+}
+
+uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
 	uint64_t asize = DVA_GET_ASIZE(dva);
@@ -1600,7 +1671,9 @@
 
 	spa_mode_global = mode;
 
-#ifndef _KERNEL
+#ifdef _KERNEL
+	spa_arch_init();
+#else
 	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
 		arc_procfd = open("/proc/self/ctl", O_WRONLY);
 		if (arc_procfd == -1) {
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Oct 29 12:08:09 2012 -0500
@@ -604,6 +604,7 @@
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
--- a/usr/src/uts/common/fs/zfs/sys/spa_boot.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/spa_boot.h	Mon Oct 29 12:08:09 2012 -0500
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_SPA_BOOT_H
 #define	_SYS_SPA_BOOT_H
 
@@ -35,6 +39,8 @@
 extern char *spa_get_bootprop(char *prop);
 extern void spa_free_bootprop(char *prop);
 
+extern void spa_arch_init(void);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Oct 29 12:08:09 2012 -0500
@@ -227,6 +227,10 @@
 	uint64_t	spa_feat_for_write_obj;	/* required to write to pool */
 	uint64_t	spa_feat_for_read_obj;	/* required to read from pool */
 	uint64_t	spa_feat_desc_obj;	/* Feature descriptions */
+	cyclic_id_t	spa_deadman_cycid;	/* cyclic id */
+	uint64_t	spa_deadman_calls;	/* number of deadman calls */
+	uint64_t	spa_sync_starttime;	/* starting time fo spa_sync */
+	uint64_t	spa_deadman_synctime;	/* deadman expiration timer */
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Oct 29 12:08:09 2012 -0500
@@ -79,6 +79,7 @@
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
 
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Mon Oct 29 12:08:09 2012 -0500
@@ -104,6 +104,8 @@
 	avl_tree_t	vq_read_tree;
 	avl_tree_t	vq_write_tree;
 	avl_tree_t	vq_pending_tree;
+	uint64_t	vq_io_complete_ts;
+	uint64_t	vq_io_delta_ts;
 	kmutex_t	vq_lock;
 };
 
--- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h	Mon Oct 29 12:08:09 2012 -0500
@@ -22,8 +22,10 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+
 /*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
@@ -67,6 +69,7 @@
 #include <sys/sysevent/dev.h>
 #include <sys/fm/util.h>
 #include <sys/sunddi.h>
+#include <sys/cyclic.h>
 
 #define	CPU_SEQID	(CPU->cpu_seqid)
 
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Mon Oct 29 12:08:09 2012 -0500
@@ -240,12 +240,24 @@
 	uint32_t	zi_iotype;
 	int32_t		zi_duration;
 	uint64_t	zi_timer;
+	uint32_t	zi_cmd;
+	uint32_t	zi_pad;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
 
+typedef enum zinject_type {
+	ZINJECT_UNINITIALIZED,
+	ZINJECT_DATA_FAULT,
+	ZINJECT_DEVICE_FAULT,
+	ZINJECT_LABEL_FAULT,
+	ZINJECT_IGNORED_WRITES,
+	ZINJECT_PANIC,
+	ZINJECT_DELAY_IO,
+} zinject_type_t;
+
 typedef struct zfs_share {
 	uint64_t	z_exportdata;
 	uint64_t	z_sharedata;
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Oct 29 12:08:09 2012 -0500
@@ -21,8 +21,6 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-/*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
@@ -403,6 +401,7 @@
 
 	uint64_t	io_offset;
 	uint64_t	io_deadline;
+	uint64_t	io_timestamp;
 	avl_node_t	io_offset_node;
 	avl_node_t	io_deadline_node;
 	avl_tree_t	*io_vdev_tree;
@@ -548,6 +547,7 @@
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
+extern uint64_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
--- a/usr/src/uts/common/fs/zfs/vdev.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Oct 29 12:08:09 2012 -0500
@@ -3153,3 +3153,41 @@
 	}
 	vdev_propagate_state(cvd);
 }
+
+void
+vdev_deadman(vdev_t *vd)
+{
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		vdev_deadman(cvd);
+	}
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		vdev_queue_t *vq = &vd->vdev_queue;
+
+		mutex_enter(&vq->vq_lock);
+		if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+			spa_t *spa = vd->vdev_spa;
+			zio_t *fio;
+			uint64_t delta;
+
+			/*
+			 * Look at the head of all the pending queues,
+			 * if any I/O has been outstanding for longer than
+			 * the spa_deadman_synctime we panic the system.
+			 */
+			fio = avl_first(&vq->vq_pending_tree);
+			delta = ddi_get_lbolt64() - fio->io_timestamp;
+			if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
+				zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
+				    "delta %llu, last io %llu",
+				    fio->io_timestamp, delta,
+				    vq->vq_io_complete_ts);
+				fm_panic("I/O to pool '%s' appears to be "
+				    "hung.", spa_name(spa));
+			}
+		}
+		mutex_exit(&vq->vq_lock);
+	}
+}
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Mon Oct 29 12:08:09 2012 -0500
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
@@ -288,6 +292,7 @@
 		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
 		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 		    vdev_queue_agg_io_done, NULL);
+		aio->io_timestamp = fio->io_timestamp;
 
 		nio = fio;
 		do {
@@ -359,7 +364,8 @@
 
 	mutex_enter(&vq->vq_lock);
 
-	zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+	zio->io_timestamp = ddi_get_lbolt64();
+	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
 	    zio->io_priority;
 
 	vdev_queue_io_add(vq, zio);
@@ -384,10 +390,16 @@
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 
+	if (zio_injection_enabled)
+		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
+
 	mutex_enter(&vq->vq_lock);
 
 	avl_remove(&vq->vq_pending_tree, zio);
 
+	vq->vq_io_complete_ts = ddi_get_lbolt64();
+	vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
 	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
 		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
 		if (nio == NULL)
--- a/usr/src/uts/common/fs/zfs/zio_inject.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c	Mon Oct 29 12:08:09 2012 -0500
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -147,14 +148,8 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore errors not destined for this pool */
-		if (zio->io_spa != handler->zi_spa)
-			continue;
-
-		/* Ignore device errors and panic injection */
-		if (handler->zi_record.zi_guid != 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
 			continue;
 
 		/* If this handler matches, return EIO */
@@ -197,10 +192,7 @@
 		uint64_t start = handler->zi_record.zi_start;
 		uint64_t end = handler->zi_record.zi_end;
 
-		/* Ignore device only faults or panic injection */
-		if (handler->zi_record.zi_start == 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
 			continue;
 
 		/*
@@ -246,13 +238,7 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/*
-		 * Ignore label specific faults, panic injection
-		 * or fake writes
-		 */
-		if (handler->zi_record.zi_start != 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -316,10 +302,8 @@
 	    handler = list_next(&inject_handlers, handler)) {
 
 		/* Ignore errors not destined for this pool */
-		if (zio->io_spa != handler->zi_spa)
-			continue;
-
-		if (handler->zi_record.zi_duration == 0)
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		/*
@@ -355,11 +339,8 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore errors not destined for this pool */
-		if (spa != handler->zi_spa)
-			continue;
-
-		if (handler->zi_record.zi_duration == 0)
+		if (spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		if (handler->zi_record.zi_duration > 0) {
@@ -379,6 +360,34 @@
 	rw_exit(&inject_lock);
 }
 
+uint64_t
+zio_handle_io_delay(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	inject_handler_t *handler;
+	uint64_t seconds = 0;
+
+	if (zio_injection_enabled == 0)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+
+		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+			continue;
+
+		if (vd->vdev_guid == handler->zi_record.zi_guid) {
+			seconds = handler->zi_record.zi_timer;
+			break;
+		}
+
+	}
+	rw_exit(&inject_lock);
+	return (seconds);
+}
+
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
--- a/usr/src/uts/intel/zfs/spa_boot.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/intel/zfs/spa_boot.c	Mon Oct 29 12:08:09 2012 -0500
@@ -24,9 +24,16 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/sunddi.h>
+#include <sys/x86_archext.h>
+
+extern int zfs_deadman_enabled;
 
 char *
 spa_get_bootprop(char *propname)
@@ -44,3 +51,21 @@
 {
 	ddi_prop_free(value);
 }
+
+void
+spa_arch_init(void)
+{
+	/*
+	 * Configure the default settings for the zfs deadman unless
+	 * overriden by /etc/system.
+	 */
+	if (zfs_deadman_enabled == -1) {
+		/*
+		 * Disable the zfs deadman logic on VMware deployments.
+		 */
+		if (get_hwenv() == HW_VMWARE)
+			zfs_deadman_enabled = 0;
+		else
+			zfs_deadman_enabled = 1;
+	}
+}
--- a/usr/src/uts/sparc/zfs/spa_boot.c	Thu Oct 25 02:27:19 2012 +0000
+++ b/usr/src/uts/sparc/zfs/spa_boot.c	Mon Oct 29 12:08:09 2012 -0500
@@ -24,10 +24,16 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/bootconf.h>
 
+extern int zfs_deadman_enabled;
+
 char *
 spa_get_bootprop(char *propname)
 {
@@ -52,3 +58,13 @@
 {
 	kmem_free(propname, strlen(propname) + 1);
 }
+
+void
+spa_arch_init(void)
+{
+	/*
+	 * The deadman is disabled by default on sparc.
+	 */
+	if (zfs_deadman_enabled == -1)
+		zfs_deadman_enabled = 0;
+}