changeset 3726:dfabc2cbe34c

6441349 Deadlock in PX fabric error handler. 6508432 Need to set the proper bits in AER CE Mask Register. 6510861 PCIe correctable error causes Solaris Panic due to PLX8114 HW bug.
author dwoods
date Tue, 27 Feb 2007 17:42:05 -0800
parents 06a7db15387f
children 445fe920d4f5
files usr/src/uts/sparc/os/driver_aliases usr/src/uts/sun4/io/px/px_fm.c usr/src/uts/sun4/io/px/px_pci.c usr/src/uts/sun4/io/px/px_var.h usr/src/uts/sun4u/io/px/px_err.c usr/src/uts/sun4v/io/px/px_err.c
diffstat 6 files changed, 53 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/sparc/os/driver_aliases	Tue Feb 27 13:44:50 2007 -0800
+++ b/usr/src/uts/sparc/os/driver_aliases	Tue Feb 27 17:42:05 2007 -0800
@@ -147,6 +147,7 @@
 px "pciex108e,80f8"
 px_pci "pciexclass,060400"
 pxb_bcm "pciex1166,103"
+pxb_plx "pciex10b5,8114"
 pxb_plx "pciex10b5,8532"
 pxb_plx "pciex10b5,8516"
 pxb_plx "pciex10b5,8548"
--- a/usr/src/uts/sun4/io/px/px_fm.c	Tue Feb 27 13:44:50 2007 -0800
+++ b/usr/src/uts/sun4/io/px/px_fm.c	Tue Feb 27 17:42:05 2007 -0800
@@ -232,6 +232,7 @@
 int
 px_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data)
 {
+	dev_info_t	*pdip = ddi_get_parent(dip);
 	px_t		*px_p = (px_t *)impl_data;
 	int		i, acc_type = 0;
 	int		lookup, rc_err, fab_err = PF_NO_PANIC;
@@ -240,13 +241,24 @@
 	px_ranges_t	*ranges_p;
 	int		range_len;
 
+	/*
+	 * If the current thread already owns the px_fm_mutex, then we
+	 * have encountered an error while processing a previous
+	 * error.  Attempting to take the mutex again will cause the
+	 * system to deadlock.
+	 */
+	if (px_p->px_fm_mutex_owner == curthread)
+		return (DDI_FM_FATAL);
+
+	i_ddi_fm_handler_exit(pdip);
 	mutex_enter(&px_p->px_fm_mutex);
+	px_p->px_fm_mutex_owner = curthread;
 
 	addr_high = (uint32_t)((uint64_t)derr->fme_bus_specific >> 32);
 	addr_low = (uint32_t)((uint64_t)derr->fme_bus_specific);
 
 	/*
-	 * Make sure this failed load came from this PCIe port.  Check by
+	 * Make sure this failed load came from this PCIe port.	 Check by
 	 * matching the upper 32 bits of the address with the ranges property.
 	 */
 	range_len = px_p->px_ranges_length / sizeof (px_ranges_t);
@@ -271,7 +283,9 @@
 
 	/* This address doesn't belong to this leaf, just return with OK */
 	if (!acc_type) {
+		px_p->px_fm_mutex_owner = NULL;
 		mutex_exit(&px_p->px_fm_mutex);
+		i_ddi_fm_handler_enter(pdip);
 		return (DDI_FM_OK);
 	}
 
@@ -289,7 +303,9 @@
 		    &px_p->px_dq_tail);
 	}
 
+	px_p->px_fm_mutex_owner = NULL;
 	mutex_exit(&px_p->px_fm_mutex);
+	i_ddi_fm_handler_enter(pdip);
 
 	if ((rc_err & (PX_PANIC | PX_PROTECTED)) || (fab_err & PF_PANIC) ||
 	    (lookup == PF_HDL_NOTFOUND))
@@ -320,6 +336,7 @@
 	ddi_fm_error_t	derr;
 
 	mutex_enter(&px_p->px_fm_mutex);
+	px_p->px_fm_mutex_owner = curthread;
 
 	/* Create the derr */
 	bzero(&derr, sizeof (ddi_fm_error_t));
@@ -338,6 +355,7 @@
 		    &px_p->px_dq_tail);
 	}
 
+	px_p->px_fm_mutex_owner = NULL;
 	mutex_exit(&px_p->px_fm_mutex);
 
 	px_err_panic(rc_err, PX_RC, fab_err);
--- a/usr/src/uts/sun4/io/px/px_pci.c	Tue Feb 27 13:44:50 2007 -0800
+++ b/usr/src/uts/sun4/io/px/px_pci.c	Tue Feb 27 17:42:05 2007 -0800
@@ -321,6 +321,9 @@
 	ddi_acc_handle_t	config_handle;
 	char			device_type[8];
 	uint16_t		cap_ptr;
+#ifdef PX_PLX
+	uint_t			bus_num, primary, secondary;
+#endif /* PX_PLX */
 
 	instance = ddi_get_instance(devi);
 
@@ -459,6 +462,28 @@
 	pxb->pxb_hotplug_capable = B_FALSE;
 
 #ifdef PX_PLX
+	/*
+	 * Due to a PLX HW bug we need to disable the receiver error CE on all
+	 * ports. To this end we create a property "pcie_ce_mask" with value
+	 * set to PCIE_AER_CE_RECEIVER_ERR. The pcie module will check for this
+	 * property before setting the AER CE mask.
+	 */
+	(void) ddi_prop_update_int(DDI_DEV_T_NONE, pxb->pxb_dip,
+		"pcie_ce_mask", PCIE_AER_CE_RECEIVER_ERR);
+
+	/*
+	 * There is a bug in the PLX 8114 bridge, such that an 8-bit
+	 * write to the secondary bus number register will corrupt an
+	 * internal shadow copy of the primary bus number.  Reading
+	 * out both registers and writing the same values back as one
+	 * 16-bit write resolves the problem.  This bug was reported by
+	 * PLX as erratum #19.
+	 */
+	primary = pci_config_get8(config_handle, PCI_BCNF_PRIBUS);
+	secondary = pci_config_get8(config_handle, PCI_BCNF_SECBUS);
+	bus_num = (secondary << 8) | primary;
+	pci_config_put16(config_handle, PCI_BCNF_PRIBUS, bus_num);
+
 	if (pxb->pxb_rev_id <= PXB_DEVICE_PLX_AA_REV)
 		goto hotplug_done;
 #endif /* PX_PLX */
@@ -516,17 +541,6 @@
 	}
 	pxb->pxb_init_flags |= PXB_INIT_FM;
 
-#ifdef PX_PLX
-	/*
-	 * Due to a PLX HW bug we need to disable the receiver error CE on all
-	 * ports. To this end we create a property "pcie_ce_mask" with value
-	 * set to PCIE_AER_CE_RECEIVER_ERR. The pcie module will check for this
-	 * property before setting the AER CE mask.
-	 */
-	(void) ddi_prop_update_int(DDI_DEV_T_NONE, pxb->pxb_dip,
-		"pcie_ce_mask", PCIE_AER_CE_RECEIVER_ERR);
-#endif /* PX_PLX */
-
 	ddi_report_dev(devi);
 
 	return (DDI_SUCCESS);
--- a/usr/src/uts/sun4/io/px/px_var.h	Tue Feb 27 13:44:50 2007 -0800
+++ b/usr/src/uts/sun4/io/px/px_var.h	Tue Feb 27 17:42:05 2007 -0800
@@ -128,6 +128,7 @@
 	/* FMA */
 	int		px_fm_cap;
 	kmutex_t	px_fm_mutex;
+	kthread_t	*px_fm_mutex_owner;
 	ddi_iblock_cookie_t px_fm_ibc;
 
 	uint32_t	px_dev_caps;
--- a/usr/src/uts/sun4u/io/px/px_err.c	Tue Feb 27 13:44:50 2007 -0800
+++ b/usr/src/uts/sun4u/io/px/px_err.c	Tue Feb 27 17:42:05 2007 -0800
@@ -662,11 +662,13 @@
 	derr.fme_flag = DDI_FM_ERR_UNEXPECTED;
 
 	mutex_enter(&px_p->px_fm_mutex);
+	px_p->px_fm_mutex_owner = curthread;
 
 	err = px_err_cmn_intr(px_p, &derr, PX_INTR_CALL, PX_FM_BLOCK_HOST);
 	(void) px_lib_intr_setstate(rpdip, px_fault_p->px_fh_sysino,
 	    INTR_IDLE_STATE);
 
+	px_p->px_fm_mutex_owner = NULL;
 	mutex_exit(&px_p->px_fm_mutex);
 
 	px_err_panic(err, PX_HB, PX_NO_ERROR);
@@ -700,6 +702,7 @@
 	derr.fme_flag = DDI_FM_ERR_UNEXPECTED;
 
 	mutex_enter(&px_p->px_fm_mutex);
+	px_p->px_fm_mutex_owner = curthread;
 
 	/* send ereport/handle/clear fire registers */
 	rc_err = px_err_cmn_intr(px_p, &derr, PX_INTR_CALL, PX_FM_BLOCK_PCIE);
@@ -714,6 +717,7 @@
 	(void) px_lib_intr_setstate(rpdip, px_fault_p->px_fh_sysino,
 	    INTR_IDLE_STATE);
 
+	px_p->px_fm_mutex_owner = NULL;
 	mutex_exit(&px_p->px_fm_mutex);
 
 	px_err_panic(rc_err, PX_RC, fab_err);
--- a/usr/src/uts/sun4v/io/px/px_err.c	Tue Feb 27 13:44:50 2007 -0800
+++ b/usr/src/uts/sun4v/io/px/px_err.c	Tue Feb 27 17:42:05 2007 -0800
@@ -204,6 +204,7 @@
 	ddi_fm_error_t	derr;
 
 	mutex_enter(&px_p->px_fm_mutex);
+	px_p->px_fm_mutex_owner = curthread;
 
 	/* Create the derr */
 	bzero(&derr, sizeof (ddi_fm_error_t));
@@ -225,10 +226,12 @@
 	/* Set the intr state to idle for the leaf that received the mondo */
 	if (px_lib_intr_setstate(rpdip, fault_p->px_fh_sysino,
 		INTR_IDLE_STATE) != DDI_SUCCESS) {
+		px_p->px_fm_mutex_owner = NULL;
 		mutex_exit(&px_p->px_fm_mutex);
 		return (DDI_INTR_UNCLAIMED);
 	}
 
+	px_p->px_fm_mutex_owner = NULL;
 	mutex_exit(&px_p->px_fm_mutex);
 
 	switch (epkt->rc_descr.block) {