changeset 14130:3159d5f5b16a

3989 svc.startd gets stuck in a loop when HOME dir doesn't exist 3990 a misconfigured smf service can cause svc.configd to leak memory and eventually hang 3991 startd, configd spinning due to bad sac start method Reviewed by: Robert Mustacchi <rm@joyent.com> Approved by: Richard Lowe <richlowe@richlowe.net>
author Jerry Jelinek <jerry.jelinek@joyent.com>
date Thu, 05 Jul 2012 15:52:15 +0000
parents 790945ad7848
children 15f2242812c8
files usr/src/cmd/svc/startd/method.c usr/src/cmd/svc/startd/restarter.c usr/src/cmd/svc/startd/startd.h usr/src/cmd/svc/startd/wait.c usr/src/man/man1m/svc.startd.1m
diffstat 5 files changed, 106 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/svc/startd/method.c	Sun Aug 04 09:58:38 2013 -0700
+++ b/usr/src/cmd/svc/startd/method.c	Thu Jul 05 15:52:15 2012 +0000
@@ -121,10 +121,10 @@
  *    clock time.
  *    Implicit success if insufficient measurements for an average exist.
  */
-static int
+int
 method_rate_critical(restarter_inst_t *inst)
 {
-	hrtime_t critical_failure_period = RINST_FAILURE_RATE_NS;
+	hrtime_t critical_failure_period;
 	uint_t critical_failure_count = RINST_START_TIMES;
 	uint_t n = inst->ri_start_index;
 	hrtime_t avg_ns = 0;
@@ -136,6 +136,11 @@
 		{ NULL }
 	};
 
+	if (instance_is_wait_style(inst))
+		critical_failure_period = RINST_WT_SVC_FAILURE_RATE_NS;
+	else
+		critical_failure_period = RINST_FAILURE_RATE_NS;
+
 	restart_critical[0].pv_ptr = &scf_fr;
 	restart_critical[1].pv_ptr = &scf_st;
 
@@ -832,12 +837,39 @@
 		    "to root-accessible libraries\n", inst->ri_i.i_fmri);
 
 	/*
+	 * For wait-style svc, sanity check that method exists to prevent an
+	 * infinite loop.
+	 */
+	if (instance_is_wait_style(inst) && type == METHOD_START) {
+		char *pend;
+		struct stat64 sbuf;
+
+		/*
+		 * We need to handle start method strings that have arguments,
+		 * such as '/lib/svc/method/console-login %i'.
+		 */
+		if ((pend = strchr(method, ' ')) != NULL)
+			*pend = '\0';
+
+		if (stat64(method, &sbuf) == -1 && errno == ENOENT) {
+			log_instance(inst, B_TRUE, "Missing start method (%s), "
+			    "changing state to maintenance.", method);
+			restarter_free_method_context(mcp);
+			result = ENOENT;
+			goto out;
+		}
+		if (pend != NULL)
+			*pend = ' ';
+	}
+
+	/*
 	 * If the service is restarting too quickly, send it to
 	 * maintenance.
 	 */
 	if (type == METHOD_START) {
 		method_record_start(inst);
-		if (method_rate_critical(inst)) {
+		if (method_rate_critical(inst) &&
+		    !instance_is_wait_style(inst)) {
 			log_instance(inst, B_TRUE, "Restarting too quickly, "
 			    "changing state to maintenance.");
 			result = ELOOP;
--- a/usr/src/cmd/svc/startd/restarter.c	Sun Aug 04 09:58:38 2013 -0700
+++ b/usr/src/cmd/svc/startd/restarter.c	Thu Jul 05 15:52:15 2012 +0000
@@ -140,6 +140,8 @@
 
 static uu_list_pool_t *restarter_queue_pool;
 
+#define	WT_SVC_ERR_THROTTLE	1	/* 1 sec delay for erroring wait svc */
+
 /*
  * Function used to reset the restart times for an instance, when
  * an administrative task comes along and essentially makes the times
@@ -1030,6 +1032,7 @@
 	int err;
 	restarter_error_t re;
 	restarter_str_t	reason;
+	restarter_instance_state_t new_state;
 
 	assert(MUTEX_HELD(&inst->ri_lock));
 	assert(inst->ri_method_thread == 0);
@@ -1040,6 +1043,16 @@
 		reason = restarter_str_ct_ev_exit;
 		cp = "all processes in service exited";
 		break;
+	case RSTOP_ERR_CFG:
+		re = RERR_FAULT;
+		reason = restarter_str_method_failed;
+		cp = "service exited with a configuration error";
+		break;
+	case RSTOP_ERR_EXIT:
+		re = RERR_RESTART;
+		reason = restarter_str_ct_ev_exit;
+		cp = "service exited with an error";
+		break;
 	case RSTOP_CORE:
 		re = RERR_FAULT;
 		reason = restarter_str_ct_ev_core;
@@ -1107,7 +1120,10 @@
 	log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
 	    "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);
 
-	if (instance_is_wait_style(inst) && cause == RSTOP_EXIT) {
+	if (instance_is_wait_style(inst) &&
+	    (cause == RSTOP_EXIT ||
+	    cause == RSTOP_ERR_CFG ||
+	    cause == RSTOP_ERR_EXIT)) {
 		/*
 		 * No need to stop instance, as child has exited; remove
 		 * contract and move the instance to the offline state.
@@ -1123,8 +1139,26 @@
 			bad_error("restarter_instance_update_states", err);
 		}
 
-		(void) update_fault_count(inst, FAULT_COUNT_RESET);
-		reset_start_times(inst);
+		if (cause == RSTOP_ERR_EXIT) {
+			/*
+			 * The RSTOP_ERR_EXIT cause is set via the
+			 * wait_thread -> wait_remove code path when we have
+			 * a "wait" style svc that exited with an error. If
+			 * the svc is failing too quickly, we throttle it so
+			 * that we don't restart it more than once/second.
+			 * Since we know we're running in the wait thread its
+			 * ok to throttle it right here.
+			 */
+			(void) update_fault_count(inst, FAULT_COUNT_INCR);
+			if (method_rate_critical(inst)) {
+				log_instance(inst, B_TRUE, "Failing too "
+				    "quickly, throttling.");
+				(void) sleep(WT_SVC_ERR_THROTTLE);
+			}
+		} else {
+			(void) update_fault_count(inst, FAULT_COUNT_RESET);
+			reset_start_times(inst);
+		}
 
 		if (inst->ri_i.i_primary_ctid != 0) {
 			inst->ri_m_inst =
@@ -1149,7 +1183,8 @@
 			bad_error("restarter_instance_update_states", err);
 		}
 
-		return (0);
+		if (cause != RSTOP_ERR_CFG)
+			return (0);
 	} else if (instance_is_wait_style(inst) && re == RERR_RESTART) {
 		/*
 		 * Stopping a wait service through means other than the pid
@@ -1161,9 +1196,23 @@
 		wait_ignore_by_fmri(inst->ri_i.i_fmri);
 	}
 
+	/*
+	 * There are some configuration errors which we cannot detect until we
+	 * try to run the method.  For example, see exec_method() where the
+	 * restarter_set_method_context() call can return SMF_EXIT_ERR_CONFIG
+	 * in several cases. If this happens for a "wait-style" svc,
+	 * wait_remove() sets the cause as RSTOP_ERR_CFG so that we can detect
+	 * the configuration error and go into maintenance, even though it is
+	 * a "wait-style" svc.
+	 */
+	if (cause == RSTOP_ERR_CFG)
+		new_state = RESTARTER_STATE_MAINT;
+	else
+		new_state = inst->ri_i.i_enabled ?
+		    RESTARTER_STATE_OFFLINE : RESTARTER_STATE_DISABLED;
+
 	switch (err = restarter_instance_update_states(local_handle, inst,
-	    inst->ri_i.i_state, inst->ri_i.i_enabled ? RESTARTER_STATE_OFFLINE :
-	    RESTARTER_STATE_DISABLED, RERR_NONE, reason)) {
+	    inst->ri_i.i_state, new_state, RERR_NONE, reason)) {
 	case 0:
 	case ECONNRESET:
 		break;
--- a/usr/src/cmd/svc/startd/startd.h	Sun Aug 04 09:58:38 2013 -0700
+++ b/usr/src/cmd/svc/startd/startd.h	Thu Jul 05 15:52:15 2012 +0000
@@ -379,7 +379,9 @@
 	RSTOP_HWERR,		/* uncorrectable hardware error */
 	RSTOP_DEPENDENCY,	/* dependency activity caused stop */
 	RSTOP_DISABLE,		/* disabled */
-	RSTOP_RESTART		/* restart requested */
+	RSTOP_RESTART,		/* restart requested */
+	RSTOP_ERR_CFG,		/* wait svc exited with a config. error */
+	RSTOP_ERR_EXIT		/* wait svc exited with an error */
 } stop_cause_t;
 
 /*
@@ -405,6 +407,7 @@
 
 #define	RINST_START_TIMES	5		/* failures to consider */
 #define	RINST_FAILURE_RATE_NS	600000000000LL	/* 1 failure/10 minutes */
+#define	RINST_WT_SVC_FAILURE_RATE_NS	NANOSEC	/* 1 failure/second */
 
 /* Number of events in the queue when we start dropping ADMIN events. */
 #define	RINST_QUEUE_THRESHOLD	100
@@ -717,6 +720,7 @@
 /* method.c */
 void *method_thread(void *);
 void method_remove_contract(restarter_inst_t *, boolean_t, boolean_t);
+int method_rate_critical(restarter_inst_t *);
 
 /* misc.c */
 void startd_close(int);
--- a/usr/src/cmd/svc/startd/wait.c	Sun Aug 04 09:58:38 2013 -0700
+++ b/usr/src/cmd/svc/startd/wait.c	Thu Jul 05 15:52:15 2012 +0000
@@ -21,10 +21,9 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012, Joyent, Inc.  All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * wait.c - asynchronous monitoring of "wait registered" start methods
  *
@@ -90,6 +89,7 @@
 wait_remove(wait_info_t *wi, int direct)
 {
 	int status;
+	stop_cause_t cause = RSTOP_EXIT;
 
 	if (waitpid(wi->wi_pid, &status, 0) == -1) {
 		if (wi->wi_parent)
@@ -101,6 +101,10 @@
 			log_framework(LOG_NOTICE,
 			    "instance %s exited with status %d\n", wi->wi_fmri,
 			    WEXITSTATUS(status));
+			if (WEXITSTATUS(status) == SMF_EXIT_ERR_CONFIG)
+				cause = RSTOP_ERR_CFG;
+			else
+				cause = RSTOP_ERR_EXIT;
 		}
 	}
 
@@ -137,7 +141,7 @@
 
 		log_framework(LOG_DEBUG,
 		    "wait_remove requesting stop of %s\n", wi->wi_fmri);
-		(void) stop_instance_fmri(wait_hndl, wi->wi_fmri, RSTOP_EXIT);
+		(void) stop_instance_fmri(wait_hndl, wi->wi_fmri, cause);
 	}
 
 	uu_list_node_fini(wi, &wi->wi_link, wait_info_pool);
--- a/usr/src/man/man1m/svc.startd.1m	Sun Aug 04 09:58:38 2013 -0700
+++ b/usr/src/man/man1m/svc.startd.1m	Thu Jul 05 15:52:15 2012 +0000
@@ -1,6 +1,6 @@
 '\" te
 .\" Copyright (c) 2008, Sun Microsystems, Inc. All Rights Reserved.
-.\" Copyright 2011, Joyent Inc
+.\" Copyright 2012, Joyent, Inc. All Rights Reserved.
 .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
 .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
@@ -488,7 +488,9 @@
 "\fBWait\fR" model services are restarted whenever the child process associated
 with the service exits. A child process that exits is not considered an error
 for "\fBwait\fR" model services, and repeated failures do not lead to a
-transition to maintenance state.
+transition to maintenance state. However, a wait service which is repeatedly
+exiting with an error that exceeds the default rate (5 failures/second) will be
+throttled back so that the service only restarts once per second.
 .SS "LEGACY SERVICES"
 .sp
 .LP