changeset 15132:27d3289e1f5c

director: Improved logging related to disconnections.
author Timo Sirainen <tss@iki.fi>
date Wed, 12 Sep 2012 00:20:47 +0300
parents 24e791bbcf69
children 8cec3b4c43ca
files src/director/director-connection.c src/director/director-connection.h src/director/director.c src/director/director.h
diffstat 4 files changed, 109 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/src/director/director-connection.c	Wed Sep 12 00:04:34 2012 +0300
+++ b/src/director/director-connection.c	Wed Sep 12 00:20:47 2012 +0300
@@ -71,6 +71,8 @@
    mark the host as failed so we won't try to reconnect to it immediately */
 #define DIRECTOR_SUCCESS_MIN_CONNECT_SECS 40
 #define DIRECTOR_WAIT_DISCONNECT_SECS 10
+#define DIRECTOR_HANDSHAKE_WARN_SECS 29
+#define DIRECTOR_HANDSHAKE_BYTES_LOG_MIN_SECS (60*30)
 
 #if DIRECTOR_CONNECTION_DONE_TIMEOUT_MSECS <= DIRECTOR_CONNECTION_PING_TIMEOUT_MSECS
 #  error DIRECTOR_CONNECTION_DONE_TIMEOUT_MSECS is too low
@@ -92,6 +94,8 @@
 	/* for incoming connections the director host isn't known until
 	   ME-line is received */
 	struct director_host *host;
+	/* this is set only for wrong connections: */
+	struct director_host *connect_request_to;
 
 	int fd;
 	struct io *io;
@@ -118,7 +122,9 @@
 };
 
 static void director_connection_disconnected(struct director_connection **conn);
-static void director_connection_reconnect(struct director_connection **conn);
+static void director_connection_reconnect(struct director_connection **conn,
+					  const char *reason);
+static void director_connection_log_disconnect(struct director_connection *conn);
 
 static void ATTR_FORMAT(2, 3)
 director_cmd_error(struct director_connection *conn, const char *fmt, ...)
@@ -169,7 +175,9 @@
 
 static void director_connection_wait_timeout(struct director_connection *conn)
 {
-	director_connection_deinit(&conn);
+	director_connection_log_disconnect(conn);
+	director_connection_deinit(&conn,
+		"Timeout waiting for disconnect after CONNECT");
 }
 
 static void director_connection_send_connect(struct director_connection *conn,
@@ -186,6 +194,12 @@
 	(void)o_stream_flush(conn->output);
 	o_stream_uncork(conn->output);
 
+	/* wait for a while for the remote to disconnect, so it will hopefully
+	   see our CONNECT command. we'll also log the warning later to avoid
+	   multiple log lines about it. */
+	conn->connect_request_to = host;
+	director_host_ref(conn->connect_request_to);
+
 	conn->to_disconnect =
 		timeout_add(DIRECTOR_WAIT_DISCONNECT_SECS*1000,
 			    director_connection_wait_timeout, conn);
@@ -220,10 +234,10 @@
 	} else if (dir->left == NULL) {
 		/* no conflicts yet */
 	} else if (dir->left->host == conn->host) {
-		i_info("Dropping existing connection %s "
-		       "in favor of its new connection %s",
-		       dir->left->host->name, conn->host->name);
-		director_connection_deinit(&dir->left);
+		i_warning("Replacing left director connection %s with %s",
+			  dir->left->host->name, conn->host->name);
+		director_connection_deinit(&dir->left, t_strdup_printf(
+			"Replacing with %s", conn->host->name));
 	} else if (dir->left->verifying_left) {
 		/* we're waiting to verify if our current left is still
 		   working. if we don't receive a PONG, the current left
@@ -235,9 +249,6 @@
 					     dir->self_host) < 0) {
 		/* the old connection is the correct one.
 		   refer the client there (FIXME: do we ever get here?) */
-		i_warning("Director connection %s tried to connect to "
-			  "us, should use %s instead",
-			  conn->name, dir->left->host->name);
 		director_connection_send_connect(conn, dir->left->host);
 		return TRUE;
 	} else {
@@ -267,7 +278,8 @@
 			/* either use this or disconnect it */
 			if (!director_connection_assign_left(conn)) {
 				/* we don't want this */
-				director_connection_deinit(&conn);
+				director_connection_deinit(&conn,
+					"Unwanted incoming connection");
 				director_assign_left(dir);
 				break;
 			}
@@ -304,9 +316,10 @@
 			conn->wrong_host = TRUE;
 			return FALSE;
 		}
-		i_info("Replacing right director connection %s with %s",
-		       dir->right->host->name, conn->host->name);
-		director_connection_deinit(&dir->right);
+		i_warning("Replacing right director connection %s with %s",
+			  dir->right->host->name, conn->host->name);
+		director_connection_deinit(&dir->right, t_strdup_printf(
+			"Replacing with %s", conn->host->name));
 	}
 	dir->right = conn;
 	i_free(conn->name);
@@ -410,7 +423,8 @@
 	} else if (dir->left->host == conn->host) {
 		/* b) */
 		i_assert(dir->left != conn);
-		director_connection_deinit(&dir->left);
+		director_connection_deinit(&dir->left,
+			"Replacing with new incoming connection");
 	} else if (director_host_cmp_to_self(conn->host, dir->left->host,
 					     dir->self_host) < 0) {
 		/* c) */
@@ -919,14 +933,19 @@
 static bool director_handshake_cmd_done(struct director_connection *conn)
 {
 	struct director *dir = conn->dir;
-
-	if (dir->debug) {
-		unsigned int secs = time(NULL)-conn->created;
+	unsigned int handshake_secs = time(NULL) - conn->created;
+	string_t *str;
 
-		i_debug("director(%s): Handshake took %u secs, "
-			"bytes in=%"PRIuUOFF_T" out=%"PRIuUOFF_T,
-			conn->name, secs, conn->input->v_offset,
-			conn->output->offset);
+	if (handshake_secs >= DIRECTOR_HANDSHAKE_WARN_SECS || dir->debug) {
+		str = t_str_new(128);
+		str_printfa(str, "director(%s): Handshake took %u secs, "
+			    "bytes in=%"PRIuUOFF_T" out=%"PRIuUOFF_T,
+			    conn->name, handshake_secs, conn->input->v_offset,
+			    conn->output->offset);
+		if (handshake_secs >= DIRECTOR_HANDSHAKE_WARN_SECS)
+			i_warning("%s", str_c(str));
+		else
+			i_debug("%s", str_c(str));
 	}
 
 	/* the host is up now, make sure we can connect to it immediately
@@ -1143,12 +1162,8 @@
 		if (conn->in && conn != dir->left && conn->me_received &&
 		    conn->to_disconnect == NULL &&
 		    director_host_cmp_to_self(dir->left->host, conn->host,
-					      dir->self_host) < 0) {
-			i_warning("Director connection %s tried to connect to "
-				  "us, should use %s instead",
-				  conn->name, dir->left->host->name);
+					      dir->self_host) < 0)
 			director_connection_send_connect(conn, dir->left->host);
-		}
 	}
 }
 
@@ -1220,6 +1235,11 @@
 		return director_connection_sync(conn, args);
 	if (strcmp(cmd, "CONNECT") == 0)
 		return director_cmd_connect(conn, args);
+	if (strcmp(cmd, "QUIT") == 0) {
+		i_warning("Director %s disconnected us with reason: %s",
+			  conn->name, t_strarray_join(args, " "));
+		return FALSE;
+	}
 
 	director_cmd_error(conn, "Unknown command %s", cmd);
 	return FALSE;
@@ -1247,6 +1267,37 @@
 	return ret;
 }
 
+static void
+director_connection_log_disconnect(struct director_connection *conn)
+{
+	unsigned int secs = ioloop_time - conn->created;
+	string_t *str = t_str_new(128);
+
+	i_assert(conn->connected);
+
+	if (conn->connect_request_to != NULL) {
+		i_warning("Director %s tried to connect to us, "
+			  "should use %s instead",
+			  conn->name, conn->connect_request_to->name);
+		return;
+	}
+
+	str_printfa(str, "Director %s disconnected: ", conn->name);
+	str_append(str, "Connection closed");
+	if (errno != 0 && errno != EPIPE)
+		str_printfa(str, ": %m");
+
+	str_printfa(str, " (connected %u secs, "
+		    "in=%"PRIuUOFF_T" out=%"PRIuUOFF_T,
+		    secs, conn->input->v_offset, conn->output->offset);
+
+	if (!conn->me_received)
+		str_append(str, ", handshake ME not received");
+	else if (!conn->handshake_received)
+		str_append(str, ", handshake DONE not received");
+	i_error("%s", str_c(str));
+}
+
 static void director_connection_input(struct director_connection *conn)
 {
 	struct director *dir = conn->dir;
@@ -1258,16 +1309,14 @@
 		return;
 	case -1:
 		/* disconnected */
-		i_error("Director %s disconnected%s", conn->name,
-			conn->handshake_received ? "" :
-			" before handshake finished");
+		director_connection_log_disconnect(conn);
 		director_connection_disconnected(&conn);
 		return;
 	case -2:
 		/* buffer full */
 		director_cmd_error(conn, "Director sent us more than %d bytes",
 				   MAX_INBUF_SIZE);
-		director_connection_reconnect(&conn);
+		director_connection_reconnect(&conn, "Too long input line");
 		return;
 	}
 
@@ -1289,7 +1338,8 @@
 		} T_END;
 
 		if (!ret) {
-			director_connection_reconnect(&conn);
+			director_connection_reconnect(&conn, t_strdup_printf(
+				"Invalid input: %s", line));
 			break;
 		}
 	}
@@ -1474,7 +1524,8 @@
 	return conn;
 }
 
-void director_connection_deinit(struct director_connection **_conn)
+void director_connection_deinit(struct director_connection **_conn,
+				const char *remote_reason)
 {
 	struct director_connection *const *conns, *conn = *_conn;
 	struct director *dir = conn->dir;
@@ -1482,8 +1533,15 @@
 
 	*_conn = NULL;
 
-	if (dir->debug && conn->host != NULL)
-		i_debug("Disconnecting from %s", conn->host->name);
+	if (dir->debug && conn->host != NULL) {
+		i_debug("Disconnecting from %s: %s",
+			conn->host->name, remote_reason);
+	}
+	if (*remote_reason != '\0' &&
+	    conn->minor_version >= DIRECTOR_VERSION_QUIT) {
+		o_stream_send_str(conn->output, t_strdup_printf(
+			"QUIT\t%s\n", remote_reason));
+	}
 
 	conns = array_get(&dir->connections, &count);
 	for (i = 0; i < count; i++) {
@@ -1504,6 +1562,8 @@
 	if (conn->host != NULL)
 		director_host_unref(conn->host);
 
+	if (conn->connect_request_to != NULL)
+		director_host_unref(conn->connect_request_to);
 	if (conn->user_iter != NULL)
 		user_directory_iter_deinit(&conn->user_iter);
 	if (conn->to_disconnect != NULL)
@@ -1542,17 +1602,18 @@
 		conn->host->last_network_failure = ioloop_time;
 	}
 
-	director_connection_deinit(_conn);
+	director_connection_deinit(_conn, "");
 	if (dir->right == NULL)
 		director_connect(dir);
 }
 
-void director_connection_reconnect(struct director_connection **_conn)
+static void director_connection_reconnect(struct director_connection **_conn,
+					  const char *reason)
 {
 	struct director_connection *conn = *_conn;
 	struct director *dir = conn->dir;
 
-	director_connection_deinit(_conn);
+	director_connection_deinit(_conn, reason);
 	if (dir->right == NULL)
 		director_connect(dir);
 }
--- a/src/director/director-connection.h	Wed Sep 12 00:04:34 2012 +0300
+++ b/src/director/director-connection.h	Wed Sep 12 00:20:47 2012 +0300
@@ -10,7 +10,8 @@
 struct director_connection *
 director_connection_init_out(struct director *dir, int fd,
 			     struct director_host *host);
-void director_connection_deinit(struct director_connection **conn);
+void director_connection_deinit(struct director_connection **conn,
+				const char *remote_reason);
 
 void director_connection_send(struct director_connection *conn,
 			      const char *data);
--- a/src/director/director.c	Wed Sep 12 00:04:34 2012 +0300
+++ b/src/director/director.c	Wed Sep 12 00:20:47 2012 +0300
@@ -216,7 +216,10 @@
 	if (dir->left != NULL) {
 		/* since we couldn't connect to it,
 		   it must have failed recently */
-		director_connection_deinit(&dir->left);
+		i_warning("director: Assuming %s is dead, disconnecting",
+			  director_connection_get_name(dir->left));
+		director_connection_deinit(&dir->left,
+					   "This connection is dead?");
 	}
 	dir->ring_min_version = DIRECTOR_VERSION_MINOR;
 	if (!dir->ring_handshaked)
@@ -482,7 +485,7 @@
 		if (director_connection_get_host(conn) != removed_host)
 			i++;
 		else {
-			director_connection_deinit(&conn);
+			director_connection_deinit(&conn, "Removing from ring");
 			conns = array_get(&dir->connections, &count);
 		}
 	}
@@ -845,7 +848,7 @@
 	while (array_count(&dir->connections) > 0) {
 		connp = array_idx(&dir->connections, 0);
 		conn = *connp;
-		director_connection_deinit(&conn);
+		director_connection_deinit(&conn, "Shutting down");
 	}
 
 	user_directory_deinit(&dir->users);
--- a/src/director/director.h	Wed Sep 12 00:04:34 2012 +0300
+++ b/src/director/director.h	Wed Sep 12 00:20:47 2012 +0300
@@ -6,12 +6,14 @@
 
 #define DIRECTOR_VERSION_NAME "director"
 #define DIRECTOR_VERSION_MAJOR 1
-#define DIRECTOR_VERSION_MINOR 2
+#define DIRECTOR_VERSION_MINOR 3
 
 /* weak users supported in protocol v1.1+ */
 #define DIRECTOR_VERSION_WEAK_USERS 1
 /* director removes supported in v1.2+ */
 #define DIRECTOR_VERSION_RING_REMOVE 2
+/* quit reason supported in v1.3+ */
+#define DIRECTOR_VERSION_QUIT 3
 
 /* Minimum time between even attempting to communicate with a director that
    failed due to a protocol error. */