# HG changeset patch # User Timo Sirainen # Date 1511541979 -7200 # Node ID d3d8164a2083d578a466608d9805c1b565987c68 # Parent 5f9ff90666492a12567ec214ff46277a268350c7 director: Improve PING/PONG timeout errors and add new warnings Log a warning if PING-PONG takes over 5 secs in total, or also if the recipient notices that the PING took >= 5 seconds to receive. diff -r 5f9ff9066649 -r d3d8164a2083 src/director/director-connection.c --- a/src/director/director-connection.c Fri Nov 24 18:22:04 2017 +0200 +++ b/src/director/director-connection.c Fri Nov 24 18:46:19 2017 +0200 @@ -67,6 +67,8 @@ #define DIRECTOR_CONNECTION_PING_INTERVAL_MSECS (15*1000) /* How long to wait before sending PING while waiting for SYNC reply */ #define DIRECTOR_CONNECTION_PING_SYNC_INTERVAL_MSECS 1000 +/* Log a warning if PING reply or PONG response takes longer than this */ +#define DIRECTOR_CONNECTION_PINGPONG_WARN_MSECS (5*1000) /* If outgoing director connection exists for less than this many seconds, mark the host as failed so we won't try to reconnect to it immediately */ #define DIRECTOR_SUCCESS_MIN_CONNECT_SECS 40 @@ -106,6 +108,9 @@ size_t peak_bytes_buffered; struct timeval ping_sent_time; + size_t ping_sent_buffer_size; + struct timeval ping_sent_user_cpu; + uoff_t ping_sent_input_offset, ping_sent_output_offset; unsigned int last_ping_msecs; /* for incoming connections the director host isn't known until @@ -1805,16 +1810,84 @@ } } -static bool director_cmd_pong(struct director_connection *conn) +static bool director_cmd_ping(struct director_connection *conn, + const char *const *args) +{ + time_t sent_time; + uintmax_t send_buffer_size; + + if (str_array_length(args) >= 2 && + str_to_time(args[0], &sent_time) == 0 && + str_to_uintmax(args[1], &send_buffer_size) == 0) { + int diff_secs = ioloop_time - sent_time; + if (diff_secs*1000+500 > DIRECTOR_CONNECTION_PINGPONG_WARN_MSECS) { + i_warning("director(%s): PING response took %d secs to receive " + "(send buffer was %ju bytes)", + conn->name, diff_secs, send_buffer_size); + } + } + director_connection_send(conn, + t_strdup_printf("PONG\t%ld\t%zu\n", + (long)ioloop_time, o_stream_get_buffer_used_size(conn->output))); + return TRUE; +} + +static void +director_ping_append_extra(struct director_connection *conn, string_t *str, + time_t pong_sent_time, + uintmax_t pong_send_buffer_size) { + struct rusage usage; + + str_printfa(str, "buffer size at PING was %zu bytes", conn->ping_sent_buffer_size); + if (pong_sent_time != 0) { + str_printfa(str, ", remote sent it %ld secs ago", + (long)(ioloop_time - pong_sent_time)); + } + if (pong_send_buffer_size != (uintmax_t)-1) { + str_printfa(str, ", remote buffer size at PONG was %ju bytes", + pong_send_buffer_size); + } + if (conn->ping_sent_user_cpu.tv_sec != (time_t)-1 && + getrusage(RUSAGE_SELF, &usage) == 0) { + int diff = timeval_diff_msecs(&usage.ru_utime, + &conn->ping_sent_user_cpu); + str_printfa(str, ", %u.%03u CPU secs since PING was sent", + diff/1000, diff%1000); + } + str_printfa(str, ", %"PRIuUOFF_T" bytes input", + conn->input->v_offset - conn->ping_sent_input_offset); + str_printfa(str, ", %"PRIuUOFF_T" bytes output", + conn->output->offset - conn->ping_sent_output_offset); +} + +static bool director_cmd_pong(struct director_connection *conn, + const char *const *args) +{ + time_t sent_time; + uintmax_t send_buffer_size; + if (!conn->ping_waiting) return TRUE; conn->ping_waiting = FALSE; timeout_remove(&conn->to_pong); + if (args[0] == NULL || str_to_time(args[0], &sent_time) < 0) + sent_time = 0; + else if (args[1] == NULL || str_to_uintmax(args[1], &send_buffer_size) < 0) + send_buffer_size = (uintmax_t)-1; + int ping_msecs = timeval_diff_msecs(&ioloop_timeval, &conn->ping_sent_time); - if (ping_msecs >= 0) + if (ping_msecs >= 0) { + if (ping_msecs > DIRECTOR_CONNECTION_PINGPONG_WARN_MSECS) { + string_t *extra = t_str_new(128); + director_ping_append_extra(conn, extra, sent_time, send_buffer_size); + i_warning("director(%s): PONG response took %u.%03u secs (%s)", + conn->name, ping_msecs/1000, ping_msecs%1000, + str_c(extra)); + } conn->last_ping_msecs = ping_msecs; + } if (conn->verifying_left) { conn->verifying_left = FALSE; @@ -1847,12 +1920,10 @@ /* allow also other commands during handshake */ } - if (strcmp(cmd, "PING") == 0) { - director_connection_send(conn, "PONG\n"); - return TRUE; - } + if (strcmp(cmd, "PING") == 0) + return director_cmd_ping(conn, args); if (strcmp(cmd, "PONG") == 0) - return director_cmd_pong(conn); + return director_cmd_pong(conn, args); if (strcmp(cmd, "USER") == 0) return director_cmd_user(conn, args); if (strcmp(cmd, "USER-WEAK") == 0) @@ -2474,6 +2545,8 @@ str_printfa(str, "Ping timed out in %u.%03u secs, disconnecting (", diff/1000, diff%1000); + director_ping_append_extra(conn, str, 0, (uintmax_t)-1); + str_append(str, ", "); director_connection_append_stats(conn, str); if (conn->handshake_received) str_append(str, ", handshaked"); @@ -2486,8 +2559,11 @@ static void director_connection_pong_timeout(struct director_connection *conn) { - i_error("director(%s): PONG reply not received although other " - "input keeps coming, disconnecting", conn->name); + int diff = timeval_diff_msecs(&ioloop_timeval, &conn->ping_sent_time); + + i_error("director(%s): PONG reply not received in %u.%03u secs, " + "although other input keeps coming, disconnecting", conn->name, + diff/1000, diff%1000); director_connection_disconnected(&conn, "Pong timeout"); } @@ -2501,9 +2577,21 @@ director_connection_ping_idle_timeout, conn); conn->to_pong = timeout_add(conn->dir->set->director_ping_max_timeout*1000, director_connection_pong_timeout, conn); - director_connection_send(conn, "PING\n"); conn->ping_waiting = TRUE; conn->ping_sent_time = ioloop_timeval; + conn->ping_sent_buffer_size = o_stream_get_buffer_used_size(conn->output); + conn->ping_sent_input_offset = conn->input->v_offset; + conn->ping_sent_output_offset = conn->output->offset; + + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage) == 0) + conn->ping_sent_user_cpu = usage.ru_utime; + else + conn->ping_sent_user_cpu.tv_sec = (time_t)-1; + /* send it after getting the buffer size */ + director_connection_send(conn, t_strdup_printf( + "PING\t%ld\t%zu\n", (long)ioloop_time, + conn->ping_sent_buffer_size)); } const char *director_connection_get_name(struct director_connection *conn)