comparison usr/src/lib/librestart/common/librestart.c @ 12979:ab9ae749152f

PSARC/2009/617 Software Events Notification Parameters CLI PSARC/2009/618 snmp-notify: SNMP Notification Daemon for Software Events PSARC/2009/619 smtp-notify: Email Notification Daemon for Software Events PSARC/2010/225 fmd for non-global Solaris zones PSARC/2010/226 Solaris Instance UUID PSARC/2010/227 nvlist_nvflag(3NVPAIR) PSARC/2010/228 libfmevent additions PSARC/2010/257 sysevent_evc_setpropnvl and sysevent_evc_getpropnvl PSARC/2010/265 FMRI and FMA Event Stabilty, 'ireport' category 1 event class, and the 'sw' FMRI scheme PSARC/2010/278 FMA/SMF integration: instance state transitions PSARC/2010/279 Modelling panics within FMA PSARC/2010/290 logadm.conf upgrade 6392476 fmdump needs to pretty-print 6393375 userland ereport/ireport event generation interfaces 6445732 Add email notification agent for FMA and software events 6804168 RFE: Allow an efficient means to monitor SMF services status changes 6866661 scf_values_destroy(3SCF) will segfault if is passed NULL 6884709 Add snmp notification agent for FMA and software events 6884712 Add private interface to tap into libfmd_msg macro expansion capabilities 6897919 fmd to run in a non-global zone 6897937 fmd use of non-private doors is not safe 6900081 add a UUID to Solaris kernel image for use in crashdump identification 6914884 model panic events as a defect diagnosis in FMA 6944862 fmd_case_open_uuid, fmd_case_uuisresolved, fmd_nvl_create_defect 6944866 log legacy sysevents in fmd 6944867 enumerate svc scheme in topo 6944868 software-diagnosis and software-response fmd modules 6944870 model SMF maintenance state as a defect diagnosis in FMA 6944876 savecore runs in foreground for systems with zfs root and dedicated dump 6965796 Implement notification parameters for SMF state transitions and FMA events 6968287 SUN-FM-MIB.mib needs to be updated to reflect Oracle information 6972331 logadm.conf upgrade PSARC/2010/290
author Gavin Maltby <gavin.maltby@oracle.com>
date Fri, 30 Jul 2010 17:04:17 +1000
parents 2522fef20c5f
children 33bb96bf701a
comparison
equal deleted inserted replaced
12978:19d842faf8e4 12979:ab9ae749152f
18 * 18 *
19 * CDDL HEADER END 19 * CDDL HEADER END
20 */ 20 */
21 21
22 /* 22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
25 */ 24 */
26 25
26 #include <libintl.h>
27 #include <librestart.h> 27 #include <librestart.h>
28 #include <librestart_priv.h> 28 #include <librestart_priv.h>
29 #include <libscf.h> 29 #include <libscf.h>
30 #include <libscf_priv.h> 30 #include <libscf_priv.h>
31 31
106 restarter_instance_state_t re_state; 106 restarter_instance_state_t re_state;
107 restarter_instance_state_t re_next_state; 107 restarter_instance_state_t re_next_state;
108 }; 108 };
109 109
110 /* 110 /*
111 * Long reasons must all parse/read correctly in the following contexts:
112 *
113 * "A service instance transitioned state: %s."
114 * "A service failed: %s."
115 * "Reason: %s."
116 * "The service transitioned state (%s) and ..."
117 *
118 * With the exception of restart_str_none they must also fit the following
119 * moulds:
120 *
121 * "An instance transitioned because %s, and ..."
122 * "An instance transitioned to <new-state> because %s, and ..."
123 *
124 * Note that whoever is rendering the long message must provide the
125 * terminal punctuation - don't include it here. Similarly, do not
126 * provide an initial capital letter in reason-long.
127 *
128 * The long reason strings are Volatile - within the grammatical constraints
129 * above we may improve them as need be. The intention is that a consumer
130 * may blindly render the string along the lines of the above examples,
131 * but has no other guarantees as to the exact wording. Long reasons
132 * are localized.
133 *
134 * We define revisions of the set of short reason strings in use. Within
135 * a given revision, all short reasons are Committed. Consumers must check
136 * the revision in use before relying on the semantics of the short reason
137 * codes - if the version exceeds that which they are familiar with they should
138 * fail gracefully. Having checked for version compatibility, a consumer
139 * is assured that
140 *
141 * "short_reason_A iff semantic_A", provided:
142 *
143 * . the restarter uses this short reason code at all,
144 * . the short reason is not "none" (which a restarter could
145 * specify for any transition semantics)
146 *
147 * To split/refine such a Committed semantic_A into further cases,
148 * we are required to bump the revision number. This should be an
149 * infrequent occurrence. If you bump the revision number you may
150 * need to make corresponding changes in any source that calls
151 * restarter_str_version (e.g., FMA event generation).
152 *
153 * To add additional reasons to the set you must also bump the version
154 * number.
155 */
156
157 /*
158 * The following describes revision 0 of the set of transition reasons.
159 * Read the preceding block comment before making any changes.
160 */
161 static const struct restarter_state_transition_reason restarter_str[] = {
162 /*
163 * Any transition for which the restarter has not provided a reason.
164 */
165 {
166 restarter_str_none,
167 "none",
168 "the restarter gave no reason"
169 },
170
171 /*
172 * A transition to maintenance state due to a
173 * 'svcadm mark maintenance <fmri>'. *Not* used if the libscf
174 * interface smf_maintain_instance(3SCF) is used to request maintenance.
175 */
176 {
177 restarter_str_administrative_request,
178 "administrative_request",
179 "maintenance was requested by an administrator"
180 },
181
182 /*
183 * A transition to maintenance state if a repository inconsistency
184 * exists when the service/instance state is first read by startd
185 * into the graph engine (this can also happen during startd restart).
186 */
187 {
188 restarter_str_bad_repo_state,
189 "bad_repo_state",
190 "an SMF repository inconsistecy exists"
191 },
192
193 /*
194 * A transition 'maintenance -> uninitialized' resulting always
195 * from 'svcadm clear <fmri>'. *Not* used if the libscf interface
196 * smf_restore_instance(3SCF) is used.
197 */
198 {
199 restarter_str_clear_request,
200 "clear_request",
201 "maintenance clear was requested by an administrator"
202 },
203
204 /*
205 * A transition 'online -> offline' due to a process core dump.
206 */
207 {
208 restarter_str_ct_ev_core,
209 "ct_ev_core",
210 "a process dumped core"
211 },
212
213 /*
214 * A transition 'online -> offline' due to an empty process contract,
215 * i.e., the last process in a contract type service has exited.
216 */
217 {
218 restarter_str_ct_ev_exit,
219 "ct_ev_exit",
220 "all processes in the service have exited"
221 },
222
223 /*
224 * A transition 'online -> offline' due to a hardware error.
225 */
226 {
227 restarter_str_ct_ev_hwerr,
228 "ct_ev_hwerr",
229 "a process was killed due to uncorrectable hardware error"
230 },
231
232 /*
233 * A transition 'online -> offline' due to a process in the service
234 * having received a fatal signal originating from outside the
235 * service process contract.
236 */
237 {
238 restarter_str_ct_ev_signal,
239 "ct_ev_signal",
240 "a process received a fatal signal from outside the service"
241 },
242
243 /*
244 * A transition 'offline -> online' when all dependencies for the
245 * service have been met.
246 */
247 {
248 restarter_str_dependencies_satisfied,
249 "dependencies_satisfied",
250 "all dependencies have been satisfied"
251 },
252
253 /*
254 * A transition 'online -> offline' because some dependency for the
255 * service is no-longer met.
256 */
257 {
258 restarter_str_dependency_activity,
259 "dependency_activity",
260 "a dependency activity required a stop"
261 },
262
263 /*
264 * A transition to maintenance state due to a cycle in the
265 * service dependencies.
266 */
267 {
268 restarter_str_dependency_cycle,
269 "dependency_cycle",
270 "a dependency cycle exists"
271 },
272
273 /*
274 * A transition 'online -> offline -> disabled' due to a
275 * 'svcadm disable [-t] <fmri>' or smf_disable_instance(3SCF) call.
276 */
277 {
278 restarter_str_disable_request,
279 "disable_request",
280 "a disable was requested"
281 },
282
283 /*
284 * A transition 'disabled -> offline' due to a
285 * 'svcadm enable [-t] <fmri>' or smf_enable_instance(3SCF) call.
286 */
287 {
288 restarter_str_enable_request,
289 "enable_request",
290 "an enable was requested"
291 },
292
293 /*
294 * A transition to maintenance state when a method fails
295 * repeatedly for a retryable reason.
296 */
297 {
298 restarter_str_fault_threshold_reached,
299 "fault_threshold_reached",
300 "a method is failing in a retryable manner but too often"
301 },
302
303 /*
304 * A transition to uninitialized state when startd reads the service
305 * configuration and inserts it into the graph engine.
306 */
307 {
308 restarter_str_insert_in_graph,
309 "insert_in_graph",
310 "the instance was inserted in the graph"
311 },
312
313 /*
314 * A transition to maintenance state due to an invalid dependency
315 * declared for the service.
316 */
317 {
318 restarter_str_invalid_dependency,
319 "invalid_dependency",
320 "a service has an invalid dependency"
321 },
322
323 /*
324 * A transition to maintenance state because the service-declared
325 * restarter is invalid.
326 */
327 {
328 restarter_str_invalid_restarter,
329 "invalid_restarter",
330 "the service restarter is invalid"
331 },
332
333 /*
334 * A transition to maintenance state because a restarter method
335 * exited with one of SMF_EXIT_ERR_CONFIG, SMF_EXIT_ERR_NOSMF,
336 * SMF_EXIT_ERR_PERM, or SMF_EXIT_ERR_FATAL.
337 */
338 {
339 restarter_str_method_failed,
340 "method_failed",
341 "a start, stop or refresh method failed"
342 },
343
344 /*
345 * A transition 'uninitialized -> {disabled|offline}' after
346 * "insert_in_graph" to match the state configured in the
347 * repository.
348 */
349 {
350 restarter_str_per_configuration,
351 "per_configuration",
352 "the SMF repository configuration specifies this state"
353 },
354
355 /*
356 * Refresh requested - no state change.
357 */
358 {
359 restarter_str_refresh,
360 NULL,
361 "a refresh was requested (no change of state)"
362 },
363
364 /*
365 * A transition 'online -> offline -> online' due to a
366 * 'svcadm restart <fmri> or equivlaent libscf API call.
367 * Both the 'online -> offline' and 'offline -> online' transtions
368 * specify this reason.
369 */
370 {
371 restarter_str_restart_request,
372 "restart_request",
373 "a restart was requested"
374 },
375
376 /*
377 * A transition to maintenance state because the start method is
378 * being executed successfully but too frequently.
379 */
380 {
381 restarter_str_restarting_too_quickly,
382 "restarting_too_quickly",
383 "the instance is restarting too quickly"
384 },
385
386 /*
387 * A transition to maintenance state due a service requesting
388 * 'svcadm mark maintenance <fmri>' or equivalent libscf API call.
389 * A command line 'svcadm mark maintenance <fmri>' does not produce
390 * this reason - it produces administrative_request instead.
391 */
392 {
393 restarter_str_service_request,
394 "service_request",
395 "maintenance was requested by another service"
396 },
397
398 /*
399 * An instanced inserted into the graph at its existing state
400 * during a startd restart - no state change.
401 */
402 {
403 restarter_str_startd_restart,
404 NULL,
405 "the instance was inserted in the graph due to startd restart"
406 }
407 };
408
409 uint32_t
410 restarter_str_version(void)
411 {
412 return (RESTARTER_STRING_VERSION);
413 }
414
415 const char *
416 restarter_get_str_short(restarter_str_t key)
417 {
418 int i;
419 for (i = 0; i < sizeof (restarter_str) /
420 sizeof (struct restarter_state_transition_reason); i++)
421 if (key == restarter_str[i].str_key)
422 return (restarter_str[i].str_short);
423 return (NULL);
424 }
425
426 const char *
427 restarter_get_str_long(restarter_str_t key)
428 {
429 int i;
430 for (i = 0; i < sizeof (restarter_str) /
431 sizeof (struct restarter_state_transition_reason); i++)
432 if (key == restarter_str[i].str_key)
433 return (dgettext(TEXT_DOMAIN,
434 restarter_str[i].str_long));
435 return (NULL);
436 }
437
438 /*
111 * A static no memory error message mc_error_t structure 439 * A static no memory error message mc_error_t structure
112 * to be used in cases when memory errors are to be returned 440 * to be used in cases when memory errors are to be returned
113 * This avoids the need to attempt to allocate memory for the 441 * This avoids the need to attempt to allocate memory for the
114 * message, therefore getting into a cycle of no memory failures. 442 * message, therefore getting into a cycle of no memory failures.
115 */ 443 */
493 821
494 /* 822 /*
495 * Commit the state, next state, and auxiliary state into the repository. 823 * Commit the state, next state, and auxiliary state into the repository.
496 * Let the graph engine know about the state change and error. On success, 824 * Let the graph engine know about the state change and error. On success,
497 * return 0. On error, return 825 * return 0. On error, return
498 * EINVAL - aux has spaces
499 * - inst is invalid or not an instance FMRI
500 * EPROTO - librestart compiled against different libscf 826 * EPROTO - librestart compiled against different libscf
501 * ENOMEM - out of memory 827 * ENOMEM - out of memory
502 * - repository server out of resources 828 * - repository server out of resources
503 * ENOTACTIVE - repository server not running 829 * ENOTACTIVE - repository server not running
504 * ECONNABORTED - repository connection established, but then broken 830 * ECONNABORTED - repository connection established, but then broken
515 restarter_set_states(restarter_event_handle_t *h, const char *inst, 841 restarter_set_states(restarter_event_handle_t *h, const char *inst,
516 restarter_instance_state_t cur_state, 842 restarter_instance_state_t cur_state,
517 restarter_instance_state_t new_cur_state, 843 restarter_instance_state_t new_cur_state,
518 restarter_instance_state_t next_state, 844 restarter_instance_state_t next_state,
519 restarter_instance_state_t new_next_state, restarter_error_t e, 845 restarter_instance_state_t new_next_state, restarter_error_t e,
520 const char *aux) 846 restarter_str_t aux)
521 { 847 {
522 nvlist_t *attr; 848 nvlist_t *attr;
523 scf_handle_t *scf_h; 849 scf_handle_t *scf_h;
524 instance_data_t id; 850 instance_data_t id;
525 int ret = 0; 851 int ret = 0;
526 char *p = (char *)aux; 852 const char *p = restarter_get_str_short(aux);
527 853
528 assert(h->reh_master_channel != NULL); 854 assert(h->reh_master_channel != NULL);
529 assert(h->reh_master_channel_name != NULL); 855 assert(h->reh_master_channel_name != NULL);
530 assert(h->reh_master_subscriber_id != NULL); 856 assert(h->reh_master_subscriber_id != NULL);
531
532 /* Validate format of auxiliary state: no spaces allowed */
533 if (p != NULL) {
534 while (*p != '\0') {
535 if (isspace(*p))
536 return (EINVAL);
537 p++;
538 }
539 }
540 857
541 if ((scf_h = scf_handle_create(SCF_VERSION)) == NULL) { 858 if ((scf_h = scf_handle_create(SCF_VERSION)) == NULL) {
542 switch (scf_error()) { 859 switch (scf_error()) {
543 case SCF_ERROR_VERSION_MISMATCH: 860 case SCF_ERROR_VERSION_MISMATCH:
544 return (EPROTO); 861 return (EPROTO);
570 if (nvlist_alloc(&attr, NV_UNIQUE_NAME, 0) != 0 || 887 if (nvlist_alloc(&attr, NV_UNIQUE_NAME, 0) != 0 ||
571 nvlist_add_int32(attr, RESTARTER_NAME_STATE, new_cur_state) != 0 || 888 nvlist_add_int32(attr, RESTARTER_NAME_STATE, new_cur_state) != 0 ||
572 nvlist_add_int32(attr, RESTARTER_NAME_NEXT_STATE, new_next_state) 889 nvlist_add_int32(attr, RESTARTER_NAME_NEXT_STATE, new_next_state)
573 != 0 || 890 != 0 ||
574 nvlist_add_int32(attr, RESTARTER_NAME_ERROR, e) != 0 || 891 nvlist_add_int32(attr, RESTARTER_NAME_ERROR, e) != 0 ||
575 nvlist_add_string(attr, RESTARTER_NAME_INSTANCE, inst) != 0) { 892 nvlist_add_string(attr, RESTARTER_NAME_INSTANCE, inst) != 0 ||
893 nvlist_add_int32(attr, RESTARTER_NAME_REASON, aux) != 0) {
576 ret = ENOMEM; 894 ret = ENOMEM;
577 } else { 895 } else {
578 id.i_fmri = inst; 896 id.i_fmri = inst;
579 id.i_state = cur_state; 897 id.i_state = cur_state;
580 id.i_next_state = next_state; 898 id.i_next_state = next_state;
581 899
582 ret = _restarter_commit_states(scf_h, &id, new_cur_state, 900 ret = _restarter_commit_states(scf_h, &id, new_cur_state,
583 new_next_state, aux); 901 new_next_state, p);
584 902
585 if (ret == 0) { 903 if (ret == 0) {
586 ret = restarter_event_publish_retry( 904 ret = restarter_event_publish_retry(
587 h->reh_master_channel, "master", "state_change", 905 h->reh_master_channel, "master", "state_change",
588 "com.sun", "librestart", attr, EVCH_NOSLEEP); 906 "com.sun", "librestart", attr, EVCH_NOSLEEP);