changeset 10438:dac6771300a8

6875268 missing power supplies may be reported as faulted 6874918 sensor-transport produces ereports too aggresively 6877019 topo_node_facility tries to release lock it doesn't own
author Robert Johnston <Robert.Johnston@Sun.COM>
date Tue, 01 Sep 2009 13:28:49 -0700
parents 157ade6698b1
children 0ea4c598a710
files usr/src/cmd/fm/modules/common/sensor-transport/sensor_transport.c usr/src/lib/fm/topo/libtopo/common/topo_node.c
diffstat 2 files changed, 42 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/fm/modules/common/sensor-transport/sensor_transport.c	Tue Sep 01 21:49:27 2009 +0530
+++ b/usr/src/cmd/fm/modules/common/sensor-transport/sensor_transport.c	Tue Sep 01 13:28:49 2009 -0700
@@ -39,6 +39,7 @@
 typedef struct sensor_fault {
 	struct sensor_fault	*sf_next;
 	char			*sf_fru;
+	uint32_t		sf_num_fails;
 	boolean_t		sf_last_faulted;
 	boolean_t		sf_faulted;
 	boolean_t		sf_unknown;
@@ -51,6 +52,11 @@
 	id_t		st_timer;
 	sensor_fault_t	*st_faults;
 	boolean_t	st_first;
+	/*
+	 * The number of consecutive sensor readings indicating failure that
+	 * we'll tolerate before sending an ereport.
+	 */
+	uint32_t	st_tolerance;
 } sensor_transport_t;
 
 typedef struct st_stats {
@@ -73,7 +79,7 @@
 	const char *name = topo_node_name(node);
 	nvlist_t *nvl, *props, *rsrc, *fru;
 	char *fmri;
-	int err;
+	int err, ret;
 	int32_t last_source, source = -1;
 	boolean_t nonrecov, faulted, predictive, source_diff;
 	nvpair_t *nvp;
@@ -84,23 +90,42 @@
 	if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0)
 		return (0);
 
+	if (topo_node_resource(node, &rsrc, NULL) != 0) {
+		st_stats.st_bad_fmri.fmds_value.ui64++;
+		return (0);
+	}
+
+	/*
+	 * If the resource isn't present, don't bother invoking the sensor
+	 * failure method.  It may be that the sensors aren't part of the same
+	 * physical FRU and will report failure if the FRU is no longer there.
+	 */
+	if ((ret = topo_fmri_present(thp, rsrc, &err)) < 0) {
+		fmd_hdl_debug(hdl, "topo_fmri_present() failed for %s=%d",
+		    name, topo_node_instance(node));
+		nvlist_free(rsrc);
+		return (0);
+	}
+
+	if (!ret) {
+		fmd_hdl_debug(hdl, "%s=%d is not present, ignoring",
+		    name, topo_node_instance(node));
+		nvlist_free(rsrc);
+		return (0);
+	}
+
 	if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE,
 	    TOPO_METH_SENSOR_FAILURE_VERSION, NULL, &nvl, &err) != 0) {
 		if (err == ETOPO_METHOD_NOTSUP) {
 			fmd_hdl_debug(hdl, "Method %s not supported on %s=%d",
 			    TOPO_METH_SENSOR_FAILURE, name,
 			    topo_node_instance(node));
+			nvlist_free(rsrc);
 			return (0);
 		}
 		nvl = NULL;
 	}
 
-	if (topo_node_resource(node, &rsrc, NULL) != 0) {
-		st_stats.st_bad_fmri.fmds_value.ui64++;
-		nvlist_free(nvl);
-		return (0);
-	}
-
 	if (topo_node_fru(node, &fru, NULL, NULL) != 0) {
 		st_stats.st_bad_fmri.fmds_value.ui64++;
 		nvlist_free(nvl);
@@ -196,6 +221,9 @@
 		}
 	}
 
+	if (faulted)
+		sfp->sf_num_fails++;
+
 	if (nvl == NULL)
 		sfp->sf_unknown = B_TRUE;
 
@@ -207,7 +235,8 @@
 		 * to uniquely identify faulty resources instead and post one
 		 * per resource, even if they share the same FRU.
 		 */
-		if (!sfp->sf_last_faulted) {
+		if (!sfp->sf_last_faulted &&
+		    (sfp->sf_num_fails > stp->st_tolerance)) {
 			ena = fmd_event_ena_create(hdl);
 			event = fmd_nvl_alloc(hdl, FMD_SLEEP);
 
@@ -274,7 +303,8 @@
 	 */
 	for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) {
 		sfp->sf_unknown = B_FALSE;
-		sfp->sf_last_faulted = sfp->sf_faulted;
+		if (sfp->sf_num_fails > stp->st_tolerance)
+			sfp->sf_last_faulted = sfp->sf_faulted;
 		sfp->sf_faulted = B_FALSE;
 	}
 
@@ -312,6 +342,7 @@
 
 static const fmd_prop_t fmd_props[] = {
 	{ "interval", FMD_TYPE_TIME, "1min" },
+	{ "tolerance", FMD_TYPE_UINT32, "1" },
 	{ NULL, 0, NULL }
 };
 
@@ -326,7 +357,7 @@
 };
 
 static const fmd_hdl_info_t fmd_info = {
-	"Sensor Transport Agent", "1.0", &fmd_ops, fmd_props
+	"Sensor Transport Agent", "1.1", &fmd_ops, fmd_props
 };
 
 void
@@ -354,6 +385,7 @@
 
 	stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP);
 	stp->st_interval = fmd_prop_get_int64(hdl, "interval");
+	stp->st_tolerance = fmd_prop_get_int32(hdl, "tolerance");
 
 	fmd_hdl_setspecific(hdl, stp);
 
--- a/usr/src/lib/fm/topo/libtopo/common/topo_node.c	Tue Sep 01 21:49:27 2009 +0530
+++ b/usr/src/lib/fm/topo/libtopo/common/topo_node.c	Tue Sep 01 13:28:49 2009 -0700
@@ -690,7 +690,6 @@
 			    topo_node_name(node), topo_node_instance(node),
 			    topo_strerror(*errp));
 			topo_node_rele(tmp);
-			topo_node_unlock(node);
 			return (-1);
 		}
 		if ((nvlist_lookup_nvlist(rsrc, "facility", &fac) != 0) ||
@@ -699,7 +698,6 @@
 
 			nvlist_free(rsrc);
 			topo_node_rele(tmp);
-			topo_node_unlock(node);
 			return (-1);
 		}
 
@@ -717,9 +715,7 @@
 		 */
 		if (topo_prop_get_uint32(tmp, TOPO_PGROUP_FACILITY,
 		    TOPO_FACILITY_TYPE, &tmp_facsubtype, errp) != 0) {
-
 			topo_node_rele(tmp);
-			topo_node_unlock(node);
 			return (-1);
 		}
 		if (fac_subtype == tmp_facsubtype ||
@@ -728,7 +724,6 @@
 			    sizeof (topo_faclist_t))) == NULL) {
 				*errp = ETOPO_NOMEM;
 				topo_node_rele(tmp);
-				topo_node_unlock(node);
 				return (-1);
 			}
 			fac_ele->tf_node = tmp;
@@ -737,7 +732,6 @@
 		}
 		topo_node_rele(tmp);
 	}
-	topo_node_unlock(node);
 
 	if (list_empty) {
 		*errp = ETOPO_FAC_NOENT;