changeset 13310:fc9f3d59525e

825 Would like SMF critical restart rate to be configurable Reviewed by: danmcd@nexenta.com Reviewed by: garrett@nexenta.com Approved by: richlowe@richlowe.net
author John Sonnenschein <johns@joyent.com>
date Mon, 21 Mar 2011 19:57:30 -0400
parents b4a945297226
children de40bba6236d
files usr/src/cmd/svc/startd/method.c usr/src/man/man1m/svc.startd.1m
diffstat 2 files changed, 58 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/svc/startd/method.c	Fri Mar 18 18:30:02 2011 -0400
+++ b/usr/src/cmd/svc/startd/method.c	Mon Mar 21 19:57:30 2011 -0400
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent Inc.
  */
 
 /*
@@ -70,6 +71,7 @@
 #include <unistd.h>
 #include <atomic.h>
 #include <poll.h>
+#include <libscf_priv.h>
 
 #include "startd.h"
 
@@ -110,24 +112,51 @@
 /*
  * method_rate_critical(restarter_inst_t *)
  *    Return true if the average start interval is less than the permitted
- *    interval.  Implicit success if insufficient measurements for an
- *    average exist.
+ *    interval.  The permitted interval and the number of starts considered
+ *    default to RINST_FAILURE_RATE_NS and RINST_START_TIMES, but may be
+ *    overridden with the service properties startd/critical_failure_count
+ *    and startd/critical_failure_period, which give the number of failures
+ *    to consider and the time in seconds in which that number may occur,
+ *    respectively.  Note that this time is measured as of the transition to
+ *    'enabled' rather than wall clock time.
+ *    Implicit success if insufficient measurements for an average exist.
  */
 static int
 method_rate_critical(restarter_inst_t *inst)
 {
+	hrtime_t critical_failure_period = RINST_FAILURE_RATE_NS;
+	uint_t critical_failure_count = RINST_START_TIMES;
 	uint_t n = inst->ri_start_index;
 	hrtime_t avg_ns = 0;
+	uint64_t scf_fr, scf_st;
+	scf_propvec_t *prop = NULL;
+	scf_propvec_t restart_critical[] = {
+		{ "critical_failure_period", NULL, SCF_TYPE_INTEGER, NULL, 0 },
+		{ "critical_failure_count", NULL, SCF_TYPE_INTEGER, NULL, 0 },
+		{ NULL }
+	};
 
-	if (inst->ri_start_index < RINST_START_TIMES)
+	restart_critical[0].pv_ptr = &scf_fr;
+	restart_critical[1].pv_ptr = &scf_st;
+
+	if (scf_read_propvec(inst->ri_i.i_fmri, "startd",
+	    B_TRUE, restart_critical, &prop) != SCF_FAILED) {
+		/*
+		 * critical_failure_period is expressed
+		 * in seconds but tracked in ns
+		 */
+		critical_failure_period = (hrtime_t)scf_fr * NANOSEC;
+		critical_failure_count = (uint_t)scf_st;
+	}
+	if (inst->ri_start_index < critical_failure_count)
 		return (0);
 
 	avg_ns =
-	    (inst->ri_start_time[(n - 1) % RINST_START_TIMES] -
-	    inst->ri_start_time[n % RINST_START_TIMES]) /
-	    (RINST_START_TIMES - 1);
+	    (inst->ri_start_time[(n - 1) % critical_failure_count] -
+	    inst->ri_start_time[n % critical_failure_count]) /
+	    (critical_failure_count - 1);
 
-	return (avg_ns < RINST_FAILURE_RATE_NS);
+	return (avg_ns < critical_failure_period);
 }
 
 /*
--- a/usr/src/man/man1m/svc.startd.1m	Fri Mar 18 18:30:02 2011 -0400
+++ b/usr/src/man/man1m/svc.startd.1m	Mon Mar 21 19:57:30 2011 -0400
@@ -1,9 +1,10 @@
 '\" te
 .\" Copyright (c) 2008, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright 2011, Joyent Inc 
 .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
 .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH svc.startd 1M "5 May 2008" "SunOS 5.11" "System Administration Commands"
+.TH svc.startd 1M "18 Mar 2011" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 svc.startd \- Service Management Facility master restarter
 .SH SYNOPSIS
@@ -381,6 +382,26 @@
 .ne 2
 .mk
 .na
+\fB\fBstartd/critical_failure_count\fR\fR
+.ad
+.br
+.na
+\fB\fBstartd/critical_failure_period\fR\fR
+.ad
+.sp .6
+.RS 4n
+The \fBcritical_failure_count\fR and \fBcritical_failure_period\fR properties
+together specify the maximum number of service failures allowed in a given
+time interval before \fBsvc.startd\fR transitions the service to maintenance.
+If the number of failures exceeds \fBcritical_failure_count\fR in any period of
+\fBcritical_failure_period\fR seconds, \fBsvc.startd\fR will transition the
+service to maintenance.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
 \fB\fBstartd/duration\fR\fR
 .ad
 .sp .6