changeset 11087:af762f250609

6887350 device configuration creeps to halt when faulty devices are involved; 6896951 Potential mutex error in pmcs_dev_state_recovery()
author Srikanth, Ramana <Ramana.Srikanth@Sun.COM>
date Tue, 17 Nov 2009 20:17:54 -0500
parents 77c085f864d5
children 650086a6e474
files usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_scsa.c usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_subr.c usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_def.h
diffstat 3 files changed, 72 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_scsa.c	Wed Nov 18 09:20:22 2009 +0800
+++ b/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_scsa.c	Tue Nov 17 20:17:54 2009 -0500
@@ -423,6 +423,15 @@
 	ASSERT(target->phy);
 	phyp = target->phy;
 
+	if (target->recover_wait) {
+		mutex_exit(&target->statlock);
+		mutex_exit(&pwp->lock);
+		pmcs_prt(pwp, PMCS_PRT_DEBUG_CONFIG, phyp, target, "%s: "
+		    "Target 0x%p in device state recovery, fail tran_tgt_free",
+		    __func__, (void *)target);
+		return;
+	}
+
 	/*
 	 * If this target still has a PHY pointer and that PHY's target pointer
 	 * has been cleared, then that PHY has been reaped. In that case, there
--- a/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_subr.c	Wed Nov 18 09:20:22 2009 +0800
+++ b/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_subr.c	Tue Nov 17 20:17:54 2009 -0500
@@ -2875,6 +2875,9 @@
 	/* keep phynum */
 	pptr->width = 0;
 	pptr->ds_recovery_retries = 0;
+	pptr->ds_prev_good_recoveries = 0;
+	pptr->last_good_recovery = 0;
+	pptr->prev_recovery = 0;
 	/* keep dtype */
 	pptr->config_stop = 0;
 	pptr->spinup_hold = 0;
@@ -6982,20 +6985,25 @@
 		}
 
 		tgt = pptr->target;
-		if (tgt == NULL || tgt->dev_gone) {
-			if (pptr->dtype != NOTHING) {
-				pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt,
-				    "%s: no target for DS error recovery for "
-				    "PHY 0x%p", __func__, (void *)pptr);
+
+		if (tgt != NULL) {
+			mutex_enter(&tgt->statlock);
+			if (tgt->recover_wait == 0) {
+				goto next_phy;
 			}
-			goto next_phy;
-		}
-
-		mutex_enter(&tgt->statlock);
-
-		if (tgt->recover_wait == 0) {
-			goto next_phy;
-		}
+		}
+
+		if (pptr->prev_recovery) {
+			if (ddi_get_lbolt() - pptr->prev_recovery <
+			    drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) {
+				pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt,
+				    "%s: DS recovery on PHY %s "
+				    "re-invoked too soon. Skipping...",
+				    __func__, pptr->path);
+				goto next_phy;
+			}
+		}
+		pptr->prev_recovery = ddi_get_lbolt();
 
 		/*
 		 * Step 1: Put the device into the IN_RECOVERY state
@@ -7101,7 +7109,22 @@
 		    PMCS_DEVICE_STATE_OPERATIONAL);
 		if (rc == 0) {
 			tgt->recover_wait = 0;
+
 			pptr->ds_recovery_retries = 0;
+			if ((pptr->ds_prev_good_recoveries == 0) ||
+			    (ddi_get_lbolt() - pptr->last_good_recovery >
+			    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) {
+				pptr->last_good_recovery = ddi_get_lbolt();
+				pptr->ds_prev_good_recoveries = 1;
+			} else if (ddi_get_lbolt() < pptr->last_good_recovery +
+			    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) {
+				pptr->ds_prev_good_recoveries++;
+			} else {
+				pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
+				    __func__, __LINE__, "Max recovery"
+				    "attempts reached. Declaring PHY dead");
+			}
+
 			/*
 			 * Don't bother to run the work queues if the PHY
 			 * is dead.
@@ -8260,6 +8283,7 @@
     pmcs_hw_t *pwp, const char *func_name, int line, char *reason_string)
 {
 	ASSERT(mutex_owned(&phyp->phy_lock));
+	ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock));
 
 	phyp->ds_recovery_retries++;
 
@@ -8267,7 +8291,22 @@
 		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt,
 		    "%s: retry limit reached after %s to PHY %s failed",
 		    func_name, reason_string, phyp->path);
-		tgt->recover_wait = 0;
+		if (tgt != NULL) {
+			tgt->recover_wait = 0;
+		}
+		phyp->dead = 1;
+		PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line);
+		RESTART_DISCOVERY(pwp);
+	} else if ((phyp->ds_prev_good_recoveries >
+	    PMCS_MAX_DS_RECOVERY_RETRIES) &&
+	    (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)
+	    < ddi_get_lbolt())) {
+		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of "
+		    "successful recoveries reached, declaring PHY %s dead",
+		    __func__, phyp->path);
+		if (tgt != NULL) {
+			tgt->recover_wait = 0;
+		}
 		phyp->dead = 1;
 		PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line);
 		RESTART_DISCOVERY(pwp);
--- a/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_def.h	Wed Nov 18 09:20:22 2009 +0800
+++ b/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_def.h	Tue Nov 17 20:17:54 2009 -0500
@@ -72,6 +72,10 @@
 	uint8_t		phynum;		/* phy number on parent expander */
 	uint8_t		width;		/* how many phys wide */
 	uint8_t		ds_recovery_retries; /* # error retry attempts */
+	uint8_t		ds_prev_good_recoveries; /* # successful recoveries */
+	clock_t		prev_recovery;	/* lbolt of last recovery attempt */
+	clock_t		last_good_recovery; /* oldest successful recovery */
+			/* within PMCS_MAX_DS_RECOVERY_TIME time frame */
 	pmcs_dtype_t	dtype;		/* current dtype of the phy */
 	pmcs_dtype_t	pend_dtype;	/* new dtype (pending change) */
 	uint32_t
@@ -113,6 +117,12 @@
 /* maximum number of ds recovery retries (ds_recovery_retries) */
 #define	PMCS_MAX_DS_RECOVERY_RETRIES	4
 
+/* max time allowed for successful recovery */
+#define	PMCS_MAX_DS_RECOVERY_TIME	(60 * 1000000) /* 60 seconds */
+
+/* ds recovery on the same phy is not allowed within this interval */
+#define	PMCS_DS_RECOVERY_INTERVAL	(1000000) /* 1 second */
+
 
 /*
  * Inbound and Outbound Queue Related Definitions.