Mercurial > illumos > illumos-gate
changeset 6704:69d909654d1c
6533308 fmd does not properly recover from an ldc channel reset
6624294 hard coded excessive sleep on EBUSY accept failure can reduce ETM thruput
author | rb144127 |
---|---|
date | Fri, 23 May 2008 14:27:10 -0700 |
parents | b961f9b565e9 |
children | 50db7364dad5 |
files | usr/src/cmd/fm/modules/sun4v/etm/etm.c usr/src/cmd/fm/modules/sun4v/etm/etm_impl.h usr/src/cmd/fm/modules/sun4v/etm/etm_xport_api_dd.c |
diffstat | 3 files changed, 89 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/fm/modules/sun4v/etm/etm.c Fri May 23 12:19:20 2008 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/etm/etm.c Fri May 23 14:27:10 2008 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -90,7 +90,8 @@ { ETM_PROP_NM_CONSOLE, FMD_TYPE_BOOL, "false" }, { ETM_PROP_NM_SYSLOGD, FMD_TYPE_BOOL, "true" }, { ETM_PROP_NM_FACILITY, FMD_TYPE_STRING, "LOG_DAEMON" }, - { ETM_PROP_NM_MAX_RESP_Q_LEN, FMD_TYPE_INT32, "512" }, + { ETM_PROP_NM_MAX_RESP_Q_LEN, FMD_TYPE_UINT32, "512" }, + { ETM_PROP_NM_BAD_ACC_TO_SEC, FMD_TYPE_UINT32, "1" }, { NULL, 0, NULL } }; @@ -175,6 +176,9 @@ static uint32_t etm_resp_q_max_len = 0; /* max length (ele cnt) of responder queue */ +static uint32_t +etm_bad_acc_to_sec = 0; /* sleep timeout (in sec) after bad conn accept */ + static pthread_mutex_t etm_resp_q_lock = PTHREAD_MUTEX_INITIALIZER; /* protects responder queue */ @@ -329,6 +333,10 @@ fmd_stat_t etm_log_err; fmd_stat_t etm_msg_err; + /* miscellaneous stats */ + + fmd_stat_t etm_reset_xport; + } etm_stats = { /* ETM msg counters */ @@ -490,7 +498,12 @@ { "etm_log_err", FMD_TYPE_UINT64, "failed to log message to log(7D)" }, { "etm_msg_err", FMD_TYPE_UINT64, - "failed to log message to sysmsg(7D)" } + "failed to log message to sysmsg(7D)" }, + + /* miscellaneous stats */ + + { "etm_reset_xport", FMD_TYPE_UINT64, + "xport resets after xport API failure" } }; /* @@ -1555,6 +1568,25 @@ } /* etm_send_response() */ /* + * etm_reset_xport - reset the transport layer (via fini;init) + * presumably for an error condition we cannot + * otherwise recover from (ex: hung LDC channel) + * + * caveats - no checking/locking is done to ensure an existing connection + * is idle during an xport reset; we don't want to deadlock + * and presumably the transport is stuck/unusable anyway + */ + +static void +etm_reset_xport(fmd_hdl_t *hdl) +{ + (void) etm_xport_fini(hdl); + (void) etm_xport_init(hdl); + etm_stats.etm_reset_xport.fmds_value.ui64++; + +} /* etm_reset_xport() */ + +/* * etm_handle_new_conn - receive an ETM message sent from the other end via * the given open connection, pull out any FMA events * and post them to the local FMD (or handle any ETM @@ -1579,12 +1611,14 @@ nvlist_t *evp; /* ptr to unpacked FMA event */ char *class; /* FMA event class */ ssize_t i, n; /* gen use */ + int should_reset_xport; /* bool to reset xport */ if (etm_debug_lvl >= 2) { etm_show_time(hdl, "ante conn handle"); } fmd_hdl_debug(hdl, "info: handling new conn %p\n", conn); + should_reset_xport = 0; ev_hdrp = NULL; ctl_hdrp = NULL; resp_hdrp = NULL; @@ -1599,6 +1633,7 @@ if ((ev_hdrp = etm_hdr_read(hdl, conn, &hdr_sz)) == NULL) { /* errno assumed set by above call */ + should_reset_xport = (errno == ENOTACTIVE); fmd_hdl_debug(hdl, "error: FMA event dropped: " "bad hdr read errno %d\n", errno); etm_stats.etm_rd_drop_fmaevent.fmds_value.ui64++; @@ -1636,6 +1671,7 @@ if ((n = etm_io_op(hdl, "FMA event dropped: " "bad io read on event bodies", conn, body_buf, body_sz, ETM_IO_OP_RD)) < 0) { + should_reset_xport = (n == -ENOTACTIVE); etm_stats.etm_rd_drop_fmaevent.fmds_value.ui64++; goto func_ret; } @@ -1729,6 +1765,7 @@ if ((n = etm_io_op(hdl, "bad io read on ctl body", conn, body_buf, body_sz, ETM_IO_OP_RD)) < 0) { + should_reset_xport = (n == -ENOTACTIVE); goto func_ret; } @@ -1768,6 +1805,7 @@ if ((n = etm_io_op(hdl, "bad io read on resp len", conn, body_buf, body_sz, ETM_IO_OP_RD)) < 0) { + should_reset_xport = (n == -ENOTACTIVE); goto func_ret; } @@ -1811,6 +1849,7 @@ if ((n = etm_io_op(hdl, "bad io read on sa body", conn, body_buf, body_sz, ETM_IO_OP_RD)) < 0) { + should_reset_xport = (n == -ENOTACTIVE); goto func_ret; } @@ -1863,9 +1902,30 @@ if (body_buf != NULL) { fmd_hdl_free(hdl, body_buf, body_sz); } + if (should_reset_xport) { + etm_reset_xport(hdl); + } } /* etm_handle_new_conn() */ /* + * etm_handle_bad_accept - recover from a failed connection acceptance + */ + +static void +etm_handle_bad_accept(fmd_hdl_t *hdl, int nev) +{ + int should_reset_xport; /* bool to reset xport */ + + should_reset_xport = (nev == -ENOTACTIVE); + fmd_hdl_debug(hdl, "error: bad conn accept errno %d\n", (-nev)); + etm_stats.etm_xport_accept_fail.fmds_value.ui64++; + (void) etm_sleep(etm_bad_acc_to_sec); /* avoid spinning CPU */ + if (should_reset_xport) { + etm_reset_xport(hdl); + } +} /* etm_handle_bad_accept() */ + +/* * etm_server - loop forever accepting new connections * using the given FMD handle, * handling any ETM msgs sent from the other side @@ -1876,7 +1936,7 @@ etm_server(void *arg) { etm_xport_conn_t conn; /* connection handle */ - ssize_t n; /* gen use */ + int nev; /* -errno val */ fmd_hdl_t *hdl; /* FMD handle */ hdl = arg; @@ -1887,15 +1947,11 @@ if ((conn = etm_xport_accept(hdl, NULL)) == NULL) { /* errno assumed set by above call */ - n = errno; + nev = (-errno); if (etm_is_dying) { break; } - fmd_hdl_debug(hdl, - "error: bad conn accept errno %d\n", n); - etm_stats.etm_xport_accept_fail.fmds_value.ui64++; - /* avoid spinning CPU */ - (void) etm_sleep(ETM_SLEEP_SLOW); + etm_handle_bad_accept(hdl, nev); continue; } @@ -2078,6 +2134,8 @@ etm_resp_q_max_len = fmd_prop_get_int32(hdl, ETM_PROP_NM_MAX_RESP_Q_LEN); etm_stats.etm_resp_q_max_len.fmds_value.ui64 = etm_resp_q_max_len; + etm_bad_acc_to_sec = fmd_prop_get_int32(hdl, + ETM_PROP_NM_BAD_ACC_TO_SEC); /* obtain an FMD transport handle so we can post FMA events later */
--- a/usr/src/cmd/fm/modules/sun4v/etm/etm_impl.h Fri May 23 12:19:20 2008 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/etm/etm_impl.h Fri May 23 14:27:10 2008 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -94,6 +94,8 @@ #define ETM_PROP_NM_MAX_RESP_Q_LEN "etm_resp_q_max_len" +#define ETM_PROP_NM_BAD_ACC_TO_SEC "etm_bad_acc_to_sec" + /* * --------------------------------- prolog ---------------------------------- */
--- a/usr/src/cmd/fm/modules/sun4v/etm/etm_xport_api_dd.c Fri May 23 12:19:20 2008 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/etm/etm_xport_api_dd.c Fri May 23 14:27:10 2008 -0700 @@ -660,8 +660,12 @@ pollfd.revents = 0; pollfd.fd = _conn->fd; - if (poll(&pollfd, 1, -1) < 1) - return (-EIO); + if ((n = poll(&pollfd, 1, -1)) < 1) { + if (n == 0) + return (-EIO); + else + return (-errno); + } /* * set i to the maximum size --- read(..., i) below will @@ -846,7 +850,7 @@ (void) close(_conn->fd); etm_xport_free_addr(hdl, _addr); fmd_hdl_free(hdl, _conn, sizeof (_etm_xport_conn_t)); - etm_xport_stats.xport_os_open_fail.fmds_value.ui64++; + etm_xport_stats.xport_os_ioctl_fail.fmds_value.ui64++; (void) pthread_mutex_unlock(&etm_xport_vldc_lock); return (NULL); } @@ -956,8 +960,10 @@ pollfd.revents = 0; pollfd.fd = _conn->fd; - if (poll(&pollfd, 1, -1) < 1) { - errno = -EIO; + if ((n = poll(&pollfd, 1, -1)) < 1) { + if (n == 0) { + errno = EIO; + } goto func_ret; } } else { @@ -1263,6 +1269,12 @@ etm_xport_irb_tail = NULL; etm_xport_irb_mtu_sz = 0; + /* cleanup statistics from FMD */ + + (void) fmd_stat_destroy(hdl, + sizeof (etm_xport_stats) / sizeof (fmd_stat_t), + (fmd_stat_t *)&etm_xport_stats); + fmd_hdl_debug(hdl, "info: xport finalized ok\n"); return (0); @@ -1373,8 +1385,8 @@ } if (n < 0) { /* errno assumed set by above call */ + rv = (-errno); etm_xport_stats.xport_os_ioctl_fail.fmds_value.ui64++; - rv = (-errno); } else { rv = (int)op_ctl.oo_val; }