changeset 20979:652b5d0a0365

director: Prevent race conditions by adding USER_KILL_STATE_FLUSHING In theory it's possible that a user is freed during a flush and added back before flush is finished, possibly even being moved again. This check makes sure that we don't finish such move unless we're actually at the correct flushing state. (If there's another flush also running for the user it'll be ignored.) This is also useful for logging purposes.
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Tue, 25 Oct 2016 00:22:20 +0300
parents 7a4fb29a4e52
children 481b70689214
files src/director/director.c src/director/user-directory.c src/director/user-directory.h
diffstat 3 files changed, 18 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/src/director/director.c	Tue Oct 25 00:13:23 2016 +0300
+++ b/src/director/director.c	Tue Oct 25 00:22:20 2016 +0300
@@ -717,9 +717,18 @@
 	struct user *user =
 		user_directory_lookup(ctx->dir->users, ctx->username_hash);
 
-	if (user != NULL)
+	if (user == NULL) {
+		dir_debug("User %u freed while flushing, result=%d",
+			  ctx->username_hash, result);
+	} else if (user->kill_state != USER_KILL_STATE_FLUSHING) {
+		dir_debug("User %u move state changed while flushing, result=%d",
+			  ctx->username_hash, result);
+	} else {
+		dir_debug("Flushing user %u finished, result=%d",
+			  ctx->username_hash, result);
 		director_user_kill_finish_delayed(ctx->dir, user,
 						  result == 1);
+	}
 	if (result == 0) {
 		struct istream *is = iostream_temp_finish(&ctx->reply, (size_t)-1);
 		char *data;
@@ -781,6 +790,8 @@
 	const char *const args[] = {"FLUSH",
 		t_strdup_printf("%u", user->username_hash), NULL};
 
+	user->kill_state = USER_KILL_STATE_FLUSHING;
+
 	if ((program_client_create(ctx->socket_path, args, &set, FALSE,
 				   &ctx->pclient, &error)) != 0) {
 		i_error("%s: Failed to flush user hash %u in host %s: %s",
@@ -857,6 +868,7 @@
 static void
 director_finish_user_kill(struct director *dir, struct user *user, bool self)
 {
+	i_assert(user->kill_state != USER_KILL_STATE_FLUSHING);
 	i_assert(user->kill_state != USER_KILL_STATE_DELAY);
 
 	if (dir->right == NULL) {
@@ -925,6 +937,7 @@
 
 static void director_user_move_timeout(struct user *user)
 {
+	i_assert(user->kill_state != USER_KILL_STATE_FLUSHING);
 	i_assert(user->kill_state != USER_KILL_STATE_DELAY);
 
 	if (log_throttle_accept(user_move_throttle)) {
@@ -1095,6 +1108,7 @@
 		director_finish_user_kill(dir, user, TRUE);
 		break;
 	case USER_KILL_STATE_NONE:
+	case USER_KILL_STATE_FLUSHING:
 	case USER_KILL_STATE_DELAY:
 	case USER_KILL_STATE_KILLING_NOTIFY_RECEIVED:
 		break;
--- a/src/director/user-directory.c	Tue Oct 25 00:13:23 2016 +0300
+++ b/src/director/user-directory.c	Tue Oct 25 00:22:20 2016 +0300
@@ -42,6 +42,7 @@
 	"notify-received",
 	"waiting-for-notify",
 	"waiting-for-everyone",
+	"flushing",
 	"delay",
 };
 
--- a/src/director/user-directory.h	Tue Oct 25 00:13:23 2016 +0300
+++ b/src/director/user-directory.h	Tue Oct 25 00:22:20 2016 +0300
@@ -16,6 +16,8 @@
 	/* We're done killing, but waiting for USER-KILLED-EVERYWHERE
 	   notification until this state gets reset. */
 	USER_KILL_STATE_KILLED_WAITING_FOR_EVERYONE,
+	/* Waiting for the flush socket to finish. */
+	USER_KILL_STATE_FLUSHING,
 	/* Wait for a while for the user connections to actually die. Note that
 	   only at this stage we can be sure that all the directors know about
 	   the user move (although it could be earlier if we added a new