changeset 14030:99b3fd6e18da

3621 ZFS LU stuck in the offlining state Reviewed by: Sebastien Roy <sebastien.roy@delphix.com> Reviewed by: Jeff Biseda <jeff.biseda@delphix.com> Reviewed by: Dan McDonald <danmcd@nexenta.com> Approved by: Christopher Siden <christopher.siden@delphix.com>
author Saso Kiselkov <skiselkov@gmail.com>
date Thu, 23 May 2013 08:52:46 -0800
parents 11aad50aea32
children e4eb37f33d60
files usr/src/uts/common/io/comstar/stmf/stmf.c usr/src/uts/common/io/comstar/stmf/stmf_impl.h
diffstat 2 files changed, 55 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/io/comstar/stmf/stmf.c	Thu May 23 09:51:05 2013 -0400
+++ b/usr/src/uts/common/io/comstar/stmf/stmf.c	Thu May 23 08:52:46 2013 -0800
@@ -23,6 +23,8 @@
  */
 /*
  * Copyright 2012, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/conf.h>
@@ -65,6 +67,15 @@
 #define	MSG_ID_TM_BIT	0x8000000000000000
 #define	ALIGNED_TO_8BYTE_BOUNDARY(i)	(((i) + 7) & ~7)
 
+/*
+ * When stmf_io_deadman_enabled is B_TRUE, finishing up outstanding I/O on
+ * an offlining LU must not take longer than stmf_io_deadman seconds; if it
+ * does, we panic to inform the user of hung I/O blocking us for too long.
+ * When it is B_FALSE, the deadline is re-armed and the wait just continues.
+ */
+boolean_t stmf_io_deadman_enabled = B_TRUE;
+int stmf_io_deadman = 1000;			/* seconds */
+
 struct stmf_svc_clocks;
 
 static int stmf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
@@ -91,6 +102,7 @@
 void stmf_svc_init();
 stmf_status_t stmf_svc_fini();
 void stmf_svc(void *arg);
+static void stmf_wait_ilu_tasks_finish(stmf_i_lu_t *ilu);
 void stmf_svc_queue(int cmd, void *obj, stmf_state_change_info_t *info);
 static void stmf_svc_kill_obj_requests(void *obj);
 static void stmf_svc_timeout(struct stmf_svc_clocks *);
@@ -3059,6 +3071,7 @@
 	}
 	ilu->ilu_cur_task_cntr = &ilu->ilu_task_cntr1;
 	STMF_EVENT_ALLOC_HANDLE(ilu->ilu_event_hdl);
+	cv_init(&ilu->ilu_offline_pending_cv, NULL, CV_DRIVER, NULL);
 	stmf_create_kstat_lu(ilu);
 	/*
 	 * register with proxy module if available and logical unit
@@ -3197,6 +3210,7 @@
 		mutex_destroy(&ilu->ilu_kstat_lock);
 	}
 	stmf_delete_itl_kstat_by_guid(ilu->ilu_ascii_hex_guid);
+	cv_destroy(&ilu->ilu_offline_pending_cv);
 	mutex_exit(&stmf_state.stmf_lock);
 	return (STMF_SUCCESS);
 }
@@ -4447,6 +4461,8 @@
 		rw_exit(iss->iss_lockp);
 		return (NULL);
 	}
+	ASSERT(lu == dlun0 || (ilu->ilu_state != STMF_STATE_OFFLINING &&
+	    ilu->ilu_state != STMF_STATE_OFFLINE));
 	do {
 		if (ilu->ilu_free_tasks == NULL) {
 			new_task = 1;
@@ -4575,6 +4591,8 @@
 	itask->itask_lu_free_next = ilu->ilu_free_tasks;
 	ilu->ilu_free_tasks = itask;
 	ilu->ilu_ntasks_free++;
+	if (ilu->ilu_ntasks == ilu->ilu_ntasks_free)
+		cv_signal(&ilu->ilu_offline_pending_cv);
 	mutex_exit(&ilu->ilu_task_lock);
 	atomic_add_32(itask->itask_ilu_task_cntr, -1);
 }
@@ -7838,8 +7856,7 @@
 			    STMF_ABORTED);
 			lu = (stmf_lu_t *)req->svc_obj;
 			ilu = (stmf_i_lu_t *)lu->lu_stmf_private;
-			if (ilu->ilu_ntasks != ilu->ilu_ntasks_free)
-				break;
+			stmf_wait_ilu_tasks_finish(ilu);
 			lu->lu_ctl(lu, req->svc_cmd, &req->svc_info);
 			break;
 		default:
@@ -7991,6 +8008,40 @@
 	stmf_state.stmf_svc_flags |= STMF_SVC_ACTIVE;
 }
 
+/*
+ * Waits for ongoing I/O tasks to finish on an LU (that is, until ilu_ntasks
+ * equals ilu_ntasks_free) in preparation for the LU's offlining. The LU
+ * should already be in an Offlining state (otherwise I/O to the LU might
+ * never end). The wait is additionally policed by a deadman timer check.
+ */
+static void
+stmf_wait_ilu_tasks_finish(stmf_i_lu_t *ilu)
+{
+	clock_t start, now, deadline;
+
+	start = now = ddi_get_lbolt();
+	deadline = start + drv_usectohz(stmf_io_deadman * 1000000llu);
+	mutex_enter(&ilu->ilu_task_lock);
+	while (ilu->ilu_ntasks != ilu->ilu_ntasks_free) {
+		(void) cv_timedwait(&ilu->ilu_offline_pending_cv,
+		    &ilu->ilu_task_lock, deadline);
+		now = ddi_get_lbolt();	/* timeout is judged by the clock */
+		if (now > deadline) {
+			if (stmf_io_deadman_enabled) {
+				cmn_err(CE_PANIC, "stmf_svc: I/O deadman hit "
+				    "on STMF_CMD_LU_OFFLINE after %d seconds",
+				    stmf_io_deadman);
+			} else {
+				/* deadman disabled: re-arm and keep waiting */
+				deadline = now + drv_usectohz(stmf_io_deadman *
+				    1000000llu);
+			}
+		}
+	}
+	mutex_exit(&ilu->ilu_task_lock);
+	DTRACE_PROBE1(deadman__timeout__wait, clock_t, now - start);
+}
+
 void
 stmf_svc_queue(int cmd, void *obj, stmf_state_change_info_t *info)
 {
--- a/usr/src/uts/common/io/comstar/stmf/stmf_impl.h	Thu May 23 09:51:05 2013 -0400
+++ b/usr/src/uts/common/io/comstar/stmf/stmf_impl.h	Thu May 23 08:52:46 2013 -0800
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 #ifndef _STMF_IMPL_H
 #define	_STMF_IMPL_H
@@ -97,6 +98,7 @@
 	kstat_t		*ilu_kstat_info;
 	kstat_t		*ilu_kstat_io;
 	kmutex_t	ilu_kstat_lock;
+	kcondvar_t	ilu_offline_pending_cv;
 
 	/* point to the luid entry in stmf_state.stmf_luid_list */
 	void		*ilu_luid;