changeset 11692:8795ed2df6db

6898573 Watch dog support for PMCS 6922946 potential null dereference in pmcs_smp_release() 6922947 potential null dereference in pmcs_remove_phy_from_iport() 6923443 Inserted disk not shown in BUI but is listed as part of pool from 'zpool status'
author Jesse Butler <Jesse.Butler@Sun.COM>
date Thu, 18 Feb 2010 12:52:39 -0700
parents 60b9aa653af2
children 0a223da9570a
files usr/src/cmd/mdb/common/modules/pmcs/pmcs.c usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_attach.c usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_nvram.c usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_scsa.c usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_subr.c usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs.h usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_param.h usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_proto.h
diffstat 8 files changed, 357 insertions(+), 113 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/modules/pmcs/pmcs.c	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/cmd/mdb/common/modules/pmcs/pmcs.c	Thu Feb 18 12:52:39 2010 -0700
@@ -2363,6 +2363,9 @@
 	case STATE_DEAD:
 		state_str = "Dead";
 		break;
+	case STATE_IN_RESET:
+		state_str = "In Reset";
+		break;
 	}
 
 	mdb_printf("%16p %9s %4d %1d %1d 0x%08x 0x%04x 0x%04x %16p\n", addr,
@@ -2537,6 +2540,9 @@
 	case STATE_DEAD:
 		state_str = "Dead";
 		break;
+	case STATE_IN_RESET:
+		state_str = "In Reset";
+		break;
 	}
 
 	mdb_printf("%16p %9s %4d %1d %1d 0x%08x 0x%04x 0x%04x %16p\n", addr,
--- a/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_attach.c	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_attach.c	Thu Feb 18 12:52:39 2010 -0700
@@ -895,6 +895,7 @@
 		if (pmcs_soft_reset(pwp, B_FALSE)) {
 			goto failure;
 		}
+		pwp->last_reset_reason = PMCS_LAST_RST_ATTACH;
 	}
 
 	/*
@@ -1375,6 +1376,7 @@
 		 * Reset chip
 		 */
 		(void) pmcs_soft_reset(pwp, B_FALSE);
+		pwp->last_reset_reason = PMCS_LAST_RST_DETACH;
 	}
 
 	/*
@@ -1611,6 +1613,7 @@
 	/* Stop MPI & Reset chip (no need to re-initialize) */
 	(void) pmcs_stop_mpi(pwp);
 	(void) pmcs_soft_reset(pwp, B_TRUE);
+	pwp->last_reset_reason = PMCS_LAST_RST_QUIESCE;
 
 	return (DDI_SUCCESS);
 }
@@ -1836,6 +1839,74 @@
 	return (0);
 }
 
+static void
+pmcs_check_forward_progress(pmcs_hw_t *pwp)
+{
+	uint32_t	cur_iqci;
+	uint32_t	cur_msgu_tick;
+	uint32_t	cur_iop_tick;
+	int 		i;
+
+	mutex_enter(&pwp->lock);
+
+	if (pwp->state == STATE_IN_RESET) {
+		mutex_exit(&pwp->lock);
+		return;
+	}
+
+	/* IQ stall: iqci != shadow_iqpi[] but unchanged since last check */
+	for (i = 0; i < PMCS_NIQ; i++) {
+		cur_iqci = pmcs_rd_iqci(pwp, i);
+		if (cur_iqci == pwp->shadow_iqpi[i]) {
+			pwp->last_iqci[i] = cur_iqci;
+			continue;
+		}
+		if (cur_iqci == pwp->last_iqci[i]) {
+			pmcs_prt(pwp, PMCS_PRT_WARN, NULL, NULL,
+			    "Inbound Queue stall detected, issuing reset");
+			goto hot_reset;
+		}
+		pwp->last_iqci[i] = cur_iqci;
+	}
+
+	/* MSGU and IOP heartbeat ticks must each change between checks */
+	cur_msgu_tick = pmcs_rd_gst_tbl(pwp, PMCS_GST_MSGU_TICK);
+	if (cur_msgu_tick == pwp->last_msgu_tick) {
+		pmcs_prt(pwp, PMCS_PRT_WARN, NULL, NULL,
+		    "Stall detected on MSGU, issuing reset");
+		goto hot_reset;
+	}
+	pwp->last_msgu_tick = cur_msgu_tick;
+
+	cur_iop_tick  = pmcs_rd_gst_tbl(pwp, PMCS_GST_IOP_TICK);
+	if (cur_iop_tick == pwp->last_iop_tick) {
+		pmcs_prt(pwp, PMCS_PRT_WARN, NULL, NULL,
+		    "Stall detected on IOP, issuing reset");
+		goto hot_reset;
+	}
+	pwp->last_iop_tick = cur_iop_tick;
+
+	mutex_exit(&pwp->lock);
+	return;
+
+hot_reset:
+	pwp->state = STATE_DEAD;
+	/*
+	 * Stall detected; pwp->lock is still held here.  Try to recover
+	 * service via pmcs_hot_reset(), which issues any required FM
+	 * notifications itself if the soft reset fails (see pmcs_subr.c).
+	 * last_reset_reason is only updated when the hot reset succeeds.
+	 */
+	if (pmcs_hot_reset(pwp)) {
+		pmcs_prt(pwp, PMCS_PRT_ERR, NULL, NULL,
+		    "%s: hot reset failure", __func__);
+	} else {
+		pmcs_prt(pwp, PMCS_PRT_ERR, NULL, NULL,
+		    "%s: hot reset complete", __func__);
+		pwp->last_reset_reason = PMCS_LAST_RST_STALL;
+	}
+	mutex_exit(&pwp->lock);
+}
 
 static void
 pmcs_check_commands(pmcs_hw_t *pwp)
@@ -2018,6 +2089,14 @@
 	    pwp->config_changed);
 
 	/*
+	 * Check forward progress on the chip
+	 */
+	if (++pwp->watchdog_count == PMCS_FWD_PROG_TRIGGER) {
+		pwp->watchdog_count = 0;
+		pmcs_check_forward_progress(pwp);
+	}
+
+	/*
 	 * Check to see if we need to kick discovery off again
 	 */
 	mutex_enter(&pwp->config_lock);
@@ -2032,7 +2111,6 @@
 	mutex_exit(&pwp->config_lock);
 
 	mutex_enter(&pwp->lock);
-
 	if (pwp->state != STATE_RUNNING) {
 		mutex_exit(&pwp->lock);
 		return;
@@ -2047,7 +2125,9 @@
 	}
 	pwp->wdhandle = timeout(pmcs_watchdog, pwp,
 	    drv_usectohz(PMCS_WATCH_INTERVAL));
+
 	mutex_exit(&pwp->lock);
+
 	pmcs_check_commands(pwp);
 	pmcs_handle_dead_phys(pwp);
 }
@@ -2570,18 +2650,24 @@
 pmcs_fatal_handler(pmcs_hw_t *pwp)
 {
 	pmcs_prt(pwp, PMCS_PRT_ERR, NULL, NULL, "Fatal Interrupt caught");
+
 	mutex_enter(&pwp->lock);
 	pwp->state = STATE_DEAD;
-	pmcs_register_dump_int(pwp);
-	pmcs_wr_msgunit(pwp, PMCS_MSGU_OBDB_MASK, 0xffffffff);
-	pmcs_wr_msgunit(pwp, PMCS_MSGU_OBDB_CLEAR, 0xffffffff);
+
+	/*
+	 * Attempt a hot reset. In case of failure, pmcs_hot_reset() will
+	 * handle the failure and issue any required FM notifications.
+	 * See pmcs_subr.c for more details.
+	 */
+	if (pmcs_hot_reset(pwp)) {
+		pmcs_prt(pwp, PMCS_PRT_ERR, NULL, NULL,
+		    "%s: hot reset failure", __func__);
+	} else {
+		pmcs_prt(pwp, PMCS_PRT_ERR, NULL, NULL,
+		    "%s: hot reset complete", __func__);
+		pwp->last_reset_reason = PMCS_LAST_RST_FATAL_ERROR;
+	}
 	mutex_exit(&pwp->lock);
-	pmcs_fm_ereport(pwp, DDI_FM_DEVICE_NO_RESPONSE);
-	ddi_fm_service_impact(pwp->dip, DDI_SERVICE_LOST);
-
-#ifdef	DEBUG
-	cmn_err(CE_PANIC, "PMCS Fatal Firmware Error");
-#endif
 }
 
 /*
--- a/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_nvram.c	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_nvram.c	Thu Feb 18 12:52:39 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  *
  *
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -215,6 +215,7 @@
 	} else {
 		pmcs_prt(pwp, PMCS_PRT_WARN, NULL, NULL,
 		    "%s: Firmware successfully upgraded.", __func__);
+		pwp->last_reset_reason = PMCS_LAST_RST_FW_UPGRADE;
 	}
 	return (0);
 }
--- a/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_scsa.c	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_scsa.c	Thu Feb 18 12:52:39 2010 -0700
@@ -225,7 +225,7 @@
 	/*
 	 * See if there's already a target softstate.  If not, allocate one.
 	 */
-	tgt = pmcs_get_target(iport, tgt_port);
+	tgt = pmcs_get_target(iport, tgt_port, B_TRUE);
 
 	if (tgt == NULL) {
 		goto tgt_init_fail;
@@ -435,7 +435,6 @@
 	pwp = ITRAN2PMC(tran);
 	mutex_enter(&pwp->lock);
 	mutex_enter(&target->statlock);
-	ASSERT(target->phy);
 	phyp = target->phy;
 
 	pmcs_prt(pwp, PMCS_PRT_DEBUG_CONFIG, phyp, target,
@@ -1239,7 +1238,7 @@
 	mutex_enter(&pwp->lock);
 
 	/* Retrieve softstate using unit-address */
-	tgt = pmcs_get_target(iport, tgt_port);
+	tgt = pmcs_get_target(iport, tgt_port, B_TRUE);
 	if (tgt == NULL) {
 		pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
 		    "%s: tgt softstate not found", __func__);
@@ -1410,8 +1409,10 @@
 		    (void *)tgt, tgt->target_num);
 		pwp->targets[tgt->target_num] = NULL;
 		tgt->target_num = PMCS_INVALID_TARGET_NUM;
-		tgt->phy->target = NULL;
-		tgt->phy = NULL;
+		if (tgt->phy) {
+			tgt->phy->target = NULL;
+			tgt->phy = NULL;
+		}
 		pmcs_destroy_target(tgt);
 	} else {
 		mutex_exit(&tgt->statlock);
@@ -1442,7 +1443,7 @@
 	}
 
 	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL, "%s called", __func__);
-	pwp->blocked = 1;
+	pwp->quiesced = pwp->blocked = 1;
 	while (totactive) {
 		totactive = 0;
 		for (target = 0; target < pwp->max_dev; target++) {
@@ -1502,7 +1503,7 @@
 		return (-1);
 	}
 	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL, "%s called", __func__);
-	pwp->blocked = 0;
+	pwp->blocked = pwp->quiesced = 0;
 	mutex_exit(&pwp->lock);
 
 	/*
@@ -2161,7 +2162,6 @@
 	}
 
 out:
-	pmcs_pwork(pwp, pwrk);
 	pmcs_dma_unload(pwp, sp);
 
 	/*
@@ -2175,6 +2175,17 @@
 	mutex_enter(&xp->statlock);
 
 	/*
+	 * If the device no longer has a PHY pointer, clear the PHY pointer
+	 * from the work structure before we free it.  Otherwise, pmcs_pwork
+	 * may decrement the ref_count on a PHY that's been freed.
+	 */
+	if (xp->phy == NULL) {
+		pwrk->phy = NULL;
+	}
+
+	pmcs_pwork(pwp, pwrk);
+
+	/*
 	 * If the device is gone, we only put this command on the completion
 	 * queue if the work structure is not marked dead.  If it's marked
 	 * dead, it will already have been put there.
@@ -2185,7 +2196,7 @@
 			mutex_enter(&xp->aqlock);
 			STAILQ_REMOVE(&xp->aq, sp, pmcs_cmd, cmd_next);
 			mutex_exit(&xp->aqlock);
-			pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, xp,
+			pmcs_prt(pwp, PMCS_PRT_DEBUG3, pptr, xp,
 			    "%s: Removing cmd 0x%p (htag 0x%x) from aq",
 			    __func__, (void *)sp, sp->cmd_tag);
 			mutex_enter(&pwp->cq_lock);
@@ -2220,7 +2231,7 @@
 #else
 		mutex_enter(&xp->aqlock);
 #endif
-		pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, xp,
+		pmcs_prt(pwp, PMCS_PRT_DEBUG3, pptr, xp,
 		    "%s: Removing cmd 0x%p (htag 0x%x) from aq", __func__,
 		    (void *)sp, sp->cmd_tag);
 		STAILQ_REMOVE(&xp->aq, sp, pmcs_cmd, cmd_next);
@@ -2633,7 +2644,6 @@
 	}
 
 out:
-	pmcs_pwork(pwp, pwrk);
 	pmcs_dma_unload(pwp, sp);
 
 	/*
@@ -2647,13 +2657,24 @@
 	mutex_enter(&xp->statlock);
 	xp->tagmap &= ~(1 << sp->cmd_satltag);
 
+	/*
+	 * If the device no longer has a PHY pointer, clear the PHY pointer
+	 * from the work structure before we free it.  Otherwise, pmcs_pwork
+	 * may decrement the ref_count on a PHY that's been freed.
+	 */
+	if (xp->phy == NULL) {
+		pwrk->phy = NULL;
+	}
+
+	pmcs_pwork(pwp, pwrk);
+
 	if (xp->dev_gone) {
 		mutex_exit(&xp->statlock);
 		if (!dead) {
 			mutex_enter(&xp->aqlock);
 			STAILQ_REMOVE(&xp->aq, sp, pmcs_cmd, cmd_next);
 			mutex_exit(&xp->aqlock);
-			pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, xp,
+			pmcs_prt(pwp, PMCS_PRT_DEBUG3, pptr, xp,
 			    "%s: Removing cmd 0x%p (htag 0x%x) from aq",
 			    __func__, (void *)sp, sp->cmd_tag);
 			mutex_enter(&pwp->cq_lock);
@@ -3053,10 +3074,11 @@
 /*
  * Return the existing target softstate if there is one.  If there is,
  * the PHY is locked as well and that lock must be freed by the caller
- * after the target/PHY linkage is established.
+ * after the target/PHY linkage is established.  If there isn't one, and
+ * alloc_tgt is TRUE, then allocate one.
  */
 pmcs_xscsi_t *
-pmcs_get_target(pmcs_iport_t *iport, char *tgt_port)
+pmcs_get_target(pmcs_iport_t *iport, char *tgt_port, boolean_t alloc_tgt)
 {
 	pmcs_hw_t *pwp = iport->pwp;
 	pmcs_phy_t *phyp;
@@ -3109,6 +3131,14 @@
 	}
 
 	/*
+	 * If this was just a lookup (i.e. alloc_tgt is false), return now.
+	 */
+	if (alloc_tgt == B_FALSE) {
+		pmcs_unlock_phy(phyp);
+		return (NULL);
+	}
+
+	/*
 	 * Allocate the new softstate
 	 */
 	wwn = pmcs_barray2wwn(phyp->sas_address);
--- a/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_subr.c	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/io/scsi/adapters/pmcs/pmcs_subr.c	Thu Feb 18 12:52:39 2010 -0700
@@ -1423,6 +1423,11 @@
 	pwp->blocked = 1;
 
 	/*
+	 * Clear our softstate copies of the MSGU and IOP heartbeats.
+	 */
+	pwp->last_msgu_tick = pwp->last_iop_tick = 0;
+
+	/*
 	 * Step 1
 	 */
 	s2 = pmcs_rd_msgunit(pwp, PMCS_MSGU_SCRATCH2);
@@ -1653,6 +1658,29 @@
 		return (-1);
 	}
 
+	/* Clear the firmware log */
+	if (pwp->fwlogp) {
+		bzero(pwp->fwlogp, PMCS_FWLOG_SIZE);
+	}
+
+	/* Reset our queue indices and entries */
+	bzero(pwp->shadow_iqpi, sizeof (pwp->shadow_iqpi));
+	bzero(pwp->last_iqci, sizeof (pwp->last_iqci));
+	for (i = 0; i < PMCS_NIQ; i++) {
+		if (pwp->iqp[i]) {
+			bzero(pwp->iqp[i], PMCS_QENTRY_SIZE * pwp->ioq_depth);
+			pmcs_wr_iqpi(pwp, i, 0);
+			pmcs_wr_iqci(pwp, i, 0);
+		}
+	}
+	for (i = 0; i < PMCS_NOQ; i++) {
+		if (pwp->oqp[i]) {
+			bzero(pwp->oqp[i], PMCS_QENTRY_SIZE * pwp->ioq_depth);
+			pmcs_wr_oqpi(pwp, i, 0);
+			pmcs_wr_oqci(pwp, i, 0);
+		}
+
+	}
 
 	if (pwp->state == STATE_DEAD || pwp->state == STATE_UNPROBING ||
 	    pwp->state == STATE_PROBING || pwp->locks_initted == 0) {
@@ -1673,18 +1701,8 @@
 	ASSERT(pwp->locks_initted != 0);
 
 	/*
-	 * Clean up various soft state.
-	 */
-	bzero(pwp->ports, sizeof (pwp->ports));
-
-	pmcs_free_all_phys(pwp, pwp->root_phys);
-
-	for (pptr = pwp->root_phys; pptr; pptr = pptr->sibling) {
-		pmcs_lock_phy(pptr);
-		pmcs_clear_phy(pwp, pptr);
-		pmcs_unlock_phy(pptr);
-	}
-
+	 * Flush the target queues and clear each target's PHY
+	 */
 	if (pwp->targets) {
 		for (i = 0; i < pwp->max_dev; i++) {
 			pmcs_xscsi_t *xp = pwp->targets[i];
@@ -1692,66 +1710,24 @@
 			if (xp == NULL) {
 				continue;
 			}
+
 			mutex_enter(&xp->statlock);
-			pmcs_clear_xp(pwp, xp);
+			pmcs_flush_target_queues(pwp, xp, PMCS_TGT_ALL_QUEUES);
+			xp->phy = NULL;
 			mutex_exit(&xp->statlock);
 		}
 	}
 
-	bzero(pwp->shadow_iqpi, sizeof (pwp->shadow_iqpi));
-	for (i = 0; i < PMCS_NIQ; i++) {
-		if (pwp->iqp[i]) {
-			bzero(pwp->iqp[i], PMCS_QENTRY_SIZE * pwp->ioq_depth);
-			pmcs_wr_iqpi(pwp, i, 0);
-			pmcs_wr_iqci(pwp, i, 0);
-		}
-	}
-	for (i = 0; i < PMCS_NOQ; i++) {
-		if (pwp->oqp[i]) {
-			bzero(pwp->oqp[i], PMCS_QENTRY_SIZE * pwp->ioq_depth);
-			pmcs_wr_oqpi(pwp, i, 0);
-			pmcs_wr_oqci(pwp, i, 0);
-		}
-
-	}
-	if (pwp->fwlogp) {
-		bzero(pwp->fwlogp, PMCS_FWLOG_SIZE);
-	}
-	STAILQ_INIT(&pwp->wf);
-	bzero(pwp->work, sizeof (pmcwork_t) * pwp->max_cmd);
-	for (i = 0; i < pwp->max_cmd - 1; i++) {
-		pmcwork_t *pwrk = &pwp->work[i];
-		STAILQ_INSERT_TAIL(&pwp->wf, pwrk, next);
-	}
-
-	/*
-	 * Clear out any leftover commands sitting in the work list
-	 */
-	for (i = 0; i < pwp->max_cmd; i++) {
-		pmcwork_t *pwrk = &pwp->work[i];
-		mutex_enter(&pwrk->lock);
-		if (pwrk->state == PMCS_WORK_STATE_ONCHIP) {
-			switch (PMCS_TAG_TYPE(pwrk->htag)) {
-			case PMCS_TAG_TYPE_WAIT:
-				mutex_exit(&pwrk->lock);
-				break;
-			case PMCS_TAG_TYPE_CBACK:
-			case PMCS_TAG_TYPE_NONE:
-				pmcs_pwork(pwp, pwrk);
-				break;
-			default:
-				break;
-			}
-		} else if (pwrk->state == PMCS_WORK_STATE_IOCOMPQ) {
-			pwrk->dead = 1;
-			mutex_exit(&pwrk->lock);
-		} else {
-			/*
-			 * The other states of NIL, READY and INTR
-			 * should not be visible outside of a lock being held.
-			 */
-			pmcs_pwork(pwp, pwrk);
-		}
+	/*
+	 * Zero out the ports list, free non root phys, clear root phys
+	 */
+	bzero(pwp->ports, sizeof (pwp->ports));
+	pmcs_free_all_phys(pwp, pwp->root_phys);
+	for (pptr = pwp->root_phys; pptr; pptr = pptr->sibling) {
+		pmcs_lock_phy(pptr);
+		pmcs_clear_phy(pwp, pptr);
+		pptr->target = NULL;
+		pmcs_unlock_phy(pptr);
 	}
 
 	/*
@@ -1760,7 +1736,6 @@
 	pmcs_wr_msgunit(pwp, PMCS_MSGU_OBDB_MASK, pwp->intr_mask);
 	pmcs_wr_msgunit(pwp, PMCS_MSGU_OBDB_CLEAR, 0xffffffff);
 
-	pwp->blocked = 0;
 	pwp->mpi_table_setup = 0;
 	mutex_exit(&pwp->lock);
 
@@ -1782,7 +1757,6 @@
 	}
 
 	mutex_enter(&pwp->lock);
-	pwp->blocked = 0;
 	SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES);
 	mutex_exit(&pwp->lock);
 
@@ -1806,6 +1780,80 @@
 	return (-1);
 }
 
+
+/*
+ * Perform a 'hot' reset: soft reset the chip, then restart the PHYs to
+ * restore pre-reset context.  Called with pwp->lock held; it is dropped
+ * across the reset and held again on return (DDI_SUCCESS/DDI_FAILURE).
+ */
+int
+pmcs_hot_reset(pmcs_hw_t *pwp)
+{
+	pmcs_iport_t	*iport;
+
+	ASSERT(mutex_owned(&pwp->lock));
+	pwp->state = STATE_IN_RESET;
+
+	/*
+	 * For each iport on this HBA, report an empty target set (set_begin
+	 * immediately followed by set_end) and then tear down its PHYs.
+	 */
+	rw_enter(&pwp->iports_lock, RW_READER);
+	for (iport = list_head(&pwp->iports); iport != NULL;
+	    iport = list_next(&pwp->iports, iport)) {
+		mutex_enter(&iport->lock);
+		(void) scsi_hba_tgtmap_set_begin(iport->iss_tgtmap);
+		(void) scsi_hba_tgtmap_set_end(iport->iss_tgtmap, 0);
+		pmcs_iport_teardown_phys(iport);
+		mutex_exit(&iport->lock);
+	}
+	rw_exit(&pwp->iports_lock);
+
+	/* Grab a register dump, in the event that reset fails */
+	pmcs_register_dump_int(pwp);
+	mutex_exit(&pwp->lock);
+
+	/* Issue soft reset (pwp->lock is dropped) and clean up softstate */
+	if (pmcs_soft_reset(pwp, B_FALSE)) {
+		/*
+		 * Disable interrupts, in case we got far enough along to
+		 * enable them, then fire off ereport and service impact.
+		 */
+		pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
+		    "%s: failed soft reset", __func__);
+		pmcs_wr_msgunit(pwp, PMCS_MSGU_OBDB_MASK, 0xffffffff);
+		pmcs_wr_msgunit(pwp, PMCS_MSGU_OBDB_CLEAR, 0xffffffff);
+		pmcs_fm_ereport(pwp, DDI_FM_DEVICE_NO_RESPONSE);
+		ddi_fm_service_impact(pwp->dip, DDI_SERVICE_LOST);
+		mutex_enter(&pwp->lock);
+		pwp->state = STATE_DEAD;
+		return (DDI_FAILURE);
+	}
+
+	mutex_enter(&pwp->lock);
+	pwp->state = STATE_RUNNING;
+	mutex_exit(&pwp->lock);
+
+	/*
+	 * Finally, restart the phys, which will bring the iports back
+	 * up and eventually result in discovery running.
+	 */
+	if (pmcs_start_phys(pwp)) {
+		/* We should be up and running now, so retry */
+		if (pmcs_start_phys(pwp)) {
+			/* PHYs still won't restart; fail with lock re-held */
+			pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
+			    "%s: failed to restart PHYs after soft reset",
+			    __func__);
+			mutex_enter(&pwp->lock);
+			return (DDI_FAILURE);
+		}
+	}
+
+	mutex_enter(&pwp->lock);
+	return (DDI_SUCCESS);
+}
+
 /*
  * Reset a device or a logical unit.
  */
@@ -1961,7 +2009,9 @@
 
 /*
  * Remove all phys from an iport's phymap and empty it's phylist.
- * Called when a port has been reset by the host (see pmcs_intr.c).
+ * Called when a port has been reset by the host (see pmcs_intr.c)
+ * or prior to issuing a soft reset if we detect a stall on the chip
+ * (see pmcs_attach.c).
  */
 void
 pmcs_iport_teardown_phys(pmcs_iport_t *iport)
@@ -1985,10 +2035,12 @@
 
 	/* Remove all phys from the phymap */
 	phys = sas_phymap_ua2phys(pwp->hss_phymap, iport->ua);
-	while ((phynum = sas_phymap_phys_next(phys)) != -1) {
-		(void) sas_phymap_phy_rem(pwp->hss_phymap, phynum);
-	}
-	sas_phymap_phys_free(phys);
+	if (phys) {
+		while ((phynum = sas_phymap_phys_next(phys)) != -1) {
+			(void) sas_phymap_phy_rem(pwp->hss_phymap, phynum);
+		}
+		sas_phymap_phys_free(phys);
+	}
 }
 
 /*
@@ -2020,6 +2072,7 @@
 	 */
 	ASSERT(list_is_empty(&iport->phys));
 	phys = sas_phymap_ua2phys(pwp->hss_phymap, iport->ua);
+	ASSERT(phys != NULL);
 	while ((phynum = sas_phymap_phys_next(phys)) != -1) {
 		/* Grab the phy pointer from root_phys */
 		pptr = pwp->root_phys + phynum;
@@ -2316,6 +2369,7 @@
 {
 	pmcs_phy_t		*pptr;
 	pmcs_phy_t		*root_phy;
+	int			phymap_active;
 
 	DTRACE_PROBE2(pmcs__discover__entry, ulong_t, pwp->work_flags,
 	    boolean_t, pwp->config_changed);
@@ -2335,6 +2389,7 @@
 		return;
 	}
 
+	phymap_active = pwp->phymap_active;
 	mutex_exit(&pwp->lock);
 
 	/*
@@ -2349,6 +2404,14 @@
 		SCHEDULE_WORK(pwp, PMCS_WORK_DISCOVER);
 		return;
 	}
+	if (pwp->num_iports != phymap_active) {
+		rw_exit(&pwp->iports_lock);
+		pmcs_prt(pwp, PMCS_PRT_DEBUG_CONFIG, NULL, NULL,
+		    "%s: phymaps or iport maps not stable; retry discovery",
+		    __func__);
+		SCHEDULE_WORK(pwp, PMCS_WORK_DISCOVER);
+		return;
+	}
 	rw_exit(&pwp->iports_lock);
 
 	mutex_enter(&pwp->config_lock);
@@ -2491,6 +2554,9 @@
 	}
 
 	pmcs_release_scratch(pwp);
+	if (!pwp->quiesced) {
+		pwp->blocked = 0;
+	}
 	pwp->configuring = 0;
 	mutex_exit(&pwp->config_lock);
 
@@ -7772,8 +7838,8 @@
 			next_pptr = list_next(&iport->phys, pptr);
 			mutex_enter(&pptr->phy_lock);
 			pptr->iport = NULL;
-			pmcs_update_phy_pm_props(phyp, phyp->att_port_pm_tmp,
-			    phyp->tgt_port_pm_tmp, B_FALSE);
+			pmcs_update_phy_pm_props(pptr, pptr->att_port_pm_tmp,
+			    pptr->tgt_port_pm_tmp, B_FALSE);
 			mutex_exit(&pptr->phy_lock);
 			pmcs_rele_iport(iport);
 			list_remove(&iport->phys, pptr);
@@ -7867,8 +7933,6 @@
 pmcs_smp_release(pmcs_iport_t *iport)
 {
 	if (iport == NULL) {
-		pmcs_prt(iport->pwp, PMCS_PRT_DEBUG_IPORT, NULL, NULL,
-		    "%s: iport is NULL...", __func__);
 		return;
 	}
 
@@ -8003,19 +8067,34 @@
     scsi_tgtmap_tgt_type_t tgt_type, void **tgt_privp)
 {
 	pmcs_iport_t *iport = (pmcs_iport_t *)tgtmap_priv;
-
-	pmcs_prt(iport->pwp, PMCS_PRT_DEBUG_IPORT, NULL, NULL,
-	    "%s: called for iport%d/%s(%d)", __func__,
-	    ddi_get_instance(iport->dip), tgt_addr, tgt_type);
+	pmcs_hw_t *pwp = iport->pwp;
+	pmcs_xscsi_t *target;
+
+	/*
+	 * Look up the target.  If there is one, and it doesn't have a PHY
+	 * pointer, re-establish that linkage here.
+	 */
+	mutex_enter(&pwp->lock);
+	target = pmcs_get_target(iport, tgt_addr, B_FALSE);
+	mutex_exit(&pwp->lock);
+
+	/*
+	 * If we got a target, it will now have a PHY pointer and the PHY
+	 * will point to the target.  The PHY will be locked, so we'll need
+	 * to unlock it.
+	 */
+	if (target) {
+		pmcs_unlock_phy(target->phy);
+	}
 
 	/*
 	 * Update config_restart_time so we don't try to restart discovery
 	 * while enumeration is still in progress.
 	 */
-	mutex_enter(&iport->pwp->config_lock);
-	iport->pwp->config_restart_time = ddi_get_lbolt() +
+	mutex_enter(&pwp->config_lock);
+	pwp->config_restart_time = ddi_get_lbolt() +
 	    drv_usectohz(PMCS_REDISCOVERY_DELAY);
-	mutex_exit(&iport->pwp->config_lock);
+	mutex_exit(&pwp->config_lock);
 }
 
 /* ARGSUSED */
--- a/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs.h	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs.h	Thu Feb 18 12:52:39 2010 -0700
@@ -292,9 +292,23 @@
 		STATE_PROBING,
 		STATE_RUNNING,
 		STATE_UNPROBING,
+		STATE_IN_RESET,
 		STATE_DEAD
 	} state;
 
+	/*
+	 * Last reason for a soft reset
+	 */
+	enum pwp_last_reset_reason {
+		PMCS_LAST_RST_UNINIT,
+		PMCS_LAST_RST_ATTACH,
+		PMCS_LAST_RST_FW_UPGRADE,
+		PMCS_LAST_RST_FATAL_ERROR,
+		PMCS_LAST_RST_STALL,
+		PMCS_LAST_RST_QUIESCE,
+		PMCS_LAST_RST_DETACH
+	} last_reset_reason;
+
 	uint32_t
 		fw_disable_update	: 1,
 		fw_force_update		: 1,
@@ -311,7 +325,8 @@
 		physpeed		: 3,
 		resource_limited	: 1,
 		configuring		: 1,
-		ds_err_recovering	: 1;
+		ds_err_recovering	: 1,
+		quiesced		: 1;
 
 	/*
 	 * This HBA instance's iportmap and list of iport states.
@@ -406,6 +421,7 @@
 	 * memory and update the card as needed.
 	 */
 	uint32_t	shadow_iqpi[PMCS_MAX_IQ];
+	uint32_t	last_iqci[PMCS_MAX_IQ];
 	uint32_t	iqpi_offset[PMCS_MAX_IQ];
 	uint32_t	*iqp[PMCS_MAX_IQ];
 	kmutex_t	iqp_lock[PMCS_NIQ];
@@ -462,6 +478,12 @@
 	uint64_t	flash_chunk_addr;
 
 	/*
+	 * Copies of the last read MSGU and IOP heartbeats.
+	 */
+	uint32_t	last_msgu_tick;
+	uint32_t	last_iop_tick;
+
+	/*
 	 * Card information, some determined during MPI setup
 	 */
 	uint32_t	fw;		/* firmware version */
@@ -473,6 +495,12 @@
 	uint16_t	max_dev;	/* max number of devices supported */
 	uint16_t	last_wq_dev;	/* last dev whose wq was serviced */
 
+	/*
+	 * Counter for the number of times watchdog fires.  We can use this
+	 * to throttle events which fire off of the watchdog, such as the
+	 * forward progress detection routine.
+	 */
+	uint8_t		watchdog_count;
 
 	/*
 	 * Interrupt Setup stuff.
--- a/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_param.h	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_param.h	Thu Feb 18 12:52:39 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  *
  *
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -78,6 +78,15 @@
 #define	PMCS_WATCH_INTERVAL	250000	/* watchdog interval in us */
 
 /*
+ * Forward progress trigger. This is the number of times we run through
+ * watchdog before checking for forward progress.  Implicitly bound to
+ * PMCS_WATCH_INTERVAL above. For example, with a PMCS_WATCH_INTERVAL of
+ * 250000, the watchdog will run every quarter second, so forward progress
+ * will be checked every 16th watchdog fire, or every four seconds.
+ */
+#define	PMCS_FWD_PROG_TRIGGER	16
+
+/*
  * Inbound Queue definitions
  */
 #define	PMCS_NIQ		9	/* 9 Inbound Queues */
--- a/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_proto.h	Thu Feb 18 12:37:30 2010 -0700
+++ b/usr/src/uts/common/sys/scsi/adapters/pmcs/pmcs_proto.h	Thu Feb 18 12:52:39 2010 -0700
@@ -247,6 +247,11 @@
 int pmcs_soft_reset(pmcs_hw_t *, boolean_t);
 
 /*
+ * This is a hot reset which will attempt reconfiguration after reset.
+ */
+int pmcs_hot_reset(pmcs_hw_t *);
+
+/*
  * Some more reset functions
  */
 int pmcs_reset_dev(pmcs_hw_t *, pmcs_phy_t *, uint64_t);
@@ -340,7 +345,7 @@
 void pmcs_worker(void *);
 
 pmcs_phy_t *pmcs_get_root_phy(pmcs_phy_t *);
-pmcs_xscsi_t *pmcs_get_target(pmcs_iport_t *, char *);
+pmcs_xscsi_t *pmcs_get_target(pmcs_iport_t *, char *, boolean_t);
 
 void pmcs_fatal_handler(pmcs_hw_t *);