changeset 22673:867c3905ac0b

director: Avoid USER loops when ring latency is over 1 second Do this by adding a timestamp parameter to USER events. This way if it takes over 1 second for the USER event to traverse the ring, it won't get into an infinite loop getting the user updated over and over again.
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Sun, 26 Nov 2017 01:19:35 +0200
parents d811474ef901
children af1a9f282179
files src/director/director-connection.c src/director/director.c src/director/director.h src/director/user-directory.h
diffstat 4 files changed, 42 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/src/director/director-connection.c	Sun Nov 26 01:14:01 2017 +0200
+++ b/src/director/director-connection.c	Sun Nov 26 01:19:35 2017 +0200
@@ -682,12 +682,25 @@
 		user->host->user_count++;
 		ret = TRUE;
 	}
-	if (timestamp == ioloop_time && (time_t)user->timestamp != timestamp) {
+	/* Update user's timestamp if it's higher than the current one. Note
+	   that we'll preserve the original timestamp. This is important when
+	   the director ring is slow and a single USER can traverse through
+	   the ring more than a second. We don't want to get into a loop where
+	   the same USER goes through the ring forever. */
+	if ((time_t)user->timestamp < timestamp) {
+		/* NOTE: This makes the users list somewhat out-of-order.
+		   It's not a big problem - most likely it's only a few seconds
+		   difference. The worst that can happen is that some users
+		   take up memory that should have been freed already. */
+		dir_debug("user refresh: %u refreshed timestamp from %u to %ld",
+			  username_hash, user->timestamp, (long)timestamp);
 		user_directory_refresh(users, user);
+		user->timestamp = timestamp;
 		ret = TRUE;
+	} else {
+		dir_debug("user refresh: %u ignored timestamp %ld (we have %u)",
+			  username_hash, (long)timestamp, user->timestamp);
 	}
-	dir_debug("user refresh: %u refreshed timeout to %ld",
-		  username_hash, (long)user->timestamp);
 
 	if (unset_weak_user) {
 		/* user is no longer weak. handle pending requests for
@@ -765,10 +778,12 @@
 	struct mail_host *host;
 	struct user *user;
 	bool forced;
+	time_t timestamp = ioloop_time;
 
-	if (str_array_length(args) != 2 ||
+	if (str_array_length(args) < 2 ||
 	    str_to_uint(args[0], &username_hash) < 0 ||
-	    net_addr2ip(args[1], &ip) < 0) {
+	    net_addr2ip(args[1], &ip) < 0 ||
+	    (args[2] != NULL && str_to_time(args[2], &timestamp) < 0)) {
 		director_cmd_error(conn, "Invalid parameters");
 		return FALSE;
 	}
@@ -780,7 +795,7 @@
 	}
 
 	if (director_user_refresh(conn, username_hash,
-				  host, ioloop_time, FALSE, &forced, &user)) {
+				  host, timestamp, FALSE, &forced, &user)) {
 		struct director_host *src_host =
 			forced ? conn->dir->self_host : conn->host;
 		i_assert(!user->weak);
--- a/src/director/director.c	Sun Nov 26 01:14:01 2017 +0200
+++ b/src/director/director.c	Sun Nov 26 01:19:35 2017 +0200
@@ -730,11 +730,24 @@
 void director_update_user(struct director *dir, struct director_host *src,
 			  struct user *user)
 {
+	struct director_connection *const *connp;
+
 	i_assert(src != NULL);
-
 	i_assert(!user->weak);
-	director_update_send(dir, src, t_strdup_printf("USER\t%u\t%s\n",
-		user->username_hash, user->host->ip_str));
+
+	array_foreach(&dir->connections, connp) {
+		if (director_connection_get_host(*connp) == src)
+			continue;
+
+		if (director_connection_get_minor_version(*connp) >= DIRECTOR_VERSION_USER_TIMESTAMP) {
+			director_connection_send(*connp, t_strdup_printf(
+				"USER\t%u\t%s\t%u\n", user->username_hash, user->host->ip_str,
+				user->timestamp));
+		} else {
+			director_connection_send(*connp, t_strdup_printf(
+				"USER\t%u\t%s\n", user->username_hash, user->host->ip_str));
+		}
+	}
 }
 
 void director_update_user_weak(struct director *dir, struct director_host *src,
--- a/src/director/director.h	Sun Nov 26 01:14:01 2017 +0200
+++ b/src/director/director.h	Sun Nov 26 01:19:35 2017 +0200
@@ -28,6 +28,8 @@
 #define DIRECTOR_VERSION_USER_KICK_ALT 8
 /* Users are sent as "U" command in handshake */
 #define DIRECTOR_VERSION_HANDSHAKE_U_CMD 9
+/* USER event with timestamp supported */
+#define DIRECTOR_VERSION_USER_TIMESTAMP 9
 
 /* Minimum time between even attempting to communicate with a director that
    failed due to a protocol error. */
--- a/src/director/user-directory.h	Sun Nov 26 01:14:01 2017 +0200
+++ b/src/director/user-directory.h	Sun Nov 26 01:19:35 2017 +0200
@@ -5,7 +5,9 @@
 	((user)->kill_ctx != NULL)
 
 struct user {
-	/* sorted by time */
+	/* Approximately sorted by time (except during handshaking).
+	   The sorting order may be constantly wrong a few seconds here and
+	   there. */
 	struct user *prev, *next;
 
 	/* first 32 bits of MD5(username). collisions are quite unlikely, but