changeset 21466:10103d3a4ad4

lib-http: client: Fixed peer reconnection failure handling. The addressed problem occurs in a very specific situation in which the original successful connection is dropped, yet a new connection fails. It manifests as an assertion failure or panic: Panic: file ioloop-epoll.c: line 189 (io_loop_handler_run_internal): assertion failed: (msecs >= 0) Panic: BUG: No IOs or timeouts set. Not waiting for infinity. The timing is very critical. However, this doesn't mean that the occurrence of this problem is very unlikely; it can happen frequently under high load.
author Stephan Bosch <stephan.bosch@dovecot.fi>
date Thu, 02 Feb 2017 01:36:50 +0100
parents eef5c6cc06f8
children 85fc39ade4e3
files src/lib-http/http-client-queue.c src/lib-http/test-http-client-errors.c
diffstat 2 files changed, 189 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib-http/http-client-queue.c	Thu Feb 02 01:34:35 2017 +0100
+++ b/src/lib-http/http-client-queue.c	Thu Feb 02 01:36:50 2017 +0100
@@ -478,18 +478,6 @@
 	struct http_client_peer *failed_peer;
 	struct http_client_peer *const *peer_idx;
 
-	if (queue->cur_peer != NULL) {
-		/* The peer still has some working connections, which means that
-		   pending requests wait until they're picked up by those connections
-		   or the remaining connections fail as well. In the latter case,
-		   connecting to different peer can resolve the situation, but only
-		   if there is more than one IP. In any other case, the requests will
-		   eventually fail. In the future we could start connections to the next
-		   IP at this point already, but that is no small change. */
-		i_assert(array_count(&queue->pending_peers) == 0);
-		return;
-	}
-
 	http_client_queue_debug(queue,
 		"Failed to set up connection to %s%s: %s "
 		"(%u peers pending, %u requests pending)",
@@ -499,25 +487,44 @@
 		reason, array_count(&queue->pending_peers),
 		array_count(&queue->requests));
 
-	/* we're still doing the initial connections to this hport. if
-		 we're also doing parallel connections with soft timeouts
-		 (pending_peer_count>1), wait for them to finish
-		 first. */
-	failed_peer = NULL;
-	array_foreach(&queue->pending_peers, peer_idx) {
-		if (http_client_peer_addr_cmp(&(*peer_idx)->addr, addr) == 0) {
-			failed_peer = *peer_idx;
-			array_delete(&queue->pending_peers,
-				array_foreach_idx(&queue->pending_peers, peer_idx), 1);
-			break;
+	if (queue->cur_peer != NULL) {
+		if (http_client_peer_is_connected(queue->cur_peer)) {
+			/* The peer still has some working connections, which means that
+				 pending requests wait until they're picked up by those connections
+				 or the remaining connections fail as well. In the latter case,
+				 connecting to different peer can resolve the situation, but only
+				 if there is more than one IP. In any other case, the requests will
+				 eventually fail. In the future we could start connections to the next
+				 IP at this point already, but that is no small change. */
+			i_assert(array_count(&queue->pending_peers) == 0);
+			return;
 		}
-	}
-	i_assert(failed_peer != NULL);
-	if (array_count(&queue->pending_peers) > 0) {
-		http_client_queue_debug(queue,
-			"Waiting for remaining pending peers.");
-		http_client_peer_unlink_queue(failed_peer, queue);
-		return;
+
+		failed_peer = queue->cur_peer;
+		http_client_peer_unlink_queue(queue->cur_peer, queue);
+		queue->cur_peer = NULL;
+
+	} else {
+		/* we're still doing the initial connections to this hport. if
+			 we're also doing parallel connections with soft timeouts
+			 (pending_peer_count>1), wait for them to finish
+			 first. */
+		failed_peer = NULL;
+		array_foreach(&queue->pending_peers, peer_idx) {
+			if (http_client_peer_addr_cmp(&(*peer_idx)->addr, addr) == 0) {
+				failed_peer = *peer_idx;
+				array_delete(&queue->pending_peers,
+					array_foreach_idx(&queue->pending_peers, peer_idx), 1);
+				break;
+			}
+		}
+		i_assert(failed_peer != NULL);
+		if (array_count(&queue->pending_peers) > 0) {
+			http_client_queue_debug(queue,
+				"Waiting for remaining pending peers.");
+			http_client_peer_unlink_queue(failed_peer, queue);
+			return;
+		}
 	}
 
 	/* one of the connections failed. if we're not using soft timeouts,
--- a/src/lib-http/test-http-client-errors.c	Thu Feb 02 01:34:35 2017 +0100
+++ b/src/lib-http/test-http-client-errors.c	Thu Feb 02 01:36:50 2017 +0100
@@ -2453,6 +2453,157 @@
 	test_end();
 }
 
+/*
+ * Reconnect failure
+ */
+
+/* dns */
+
+static void
+test_dns_reconnect_failure_input(struct server_connection *conn)
+{
+	static unsigned int count = 0;
+	const char *line;
+
+	while ((line=i_stream_read_next_line(conn->conn.input)) != NULL) {
+		if (debug)
+			i_debug("DNS REQUEST %u: %s", count, line);
+
+		if (count == 0) {
+			o_stream_nsend_str(conn->conn.output,
+				"0 1\n127.0.0.1\n");
+		} else {
+			o_stream_nsend_str(conn->conn.output,
+				t_strdup_printf("%d\n", EAI_FAIL));
+			if (count > 4) {
+				server_connection_deinit(&conn);
+				return;
+			}
+		}
+		count++;
+	}
+}
+
+static void test_dns_reconnect_failure(void)
+{
+	test_server_input = test_dns_reconnect_failure_input;
+	test_server_run(0);
+}
+/* server */
+
+static void
+test_reconnect_failure_input(struct server_connection *conn)
+{
+	static const char *resp =
+		"HTTP/1.1 200 OK\r\n"
+		"Content-Length: 18\r\n"
+		"\r\n"
+		"Everything is OK\r\n";
+
+	o_stream_nsend_str(conn->conn.output, resp);
+	i_close_fd(&fd_listen);
+	sleep(500);
+}
+
+static void test_server_reconnect_failure(unsigned int index)
+{
+	test_server_input = test_reconnect_failure_input;
+	test_server_run(index);
+}
+
+/* client */
+
+struct _reconnect_failure_ctx {
+	struct timeout *to;
+};
+
+static void
+test_client_reconnect_failure_response2(
+	const struct http_response *resp,
+	struct _reconnect_failure_ctx *ctx)
+{
+	if (debug)
+		i_debug("RESPONSE: %u %s", resp->status, resp->reason);
+
+	test_assert(resp->status == 9002);
+	test_assert(resp->reason != NULL && *resp->reason != '\0');
+
+	io_loop_stop(ioloop);
+	i_free(ctx);
+}
+
+static void
+test_client_reconnect_failure_next(
+	struct _reconnect_failure_ctx *ctx)
+{
+	struct http_client_request *hreq;
+
+	if (debug)
+		i_debug("NEXT REQUEST");
+
+	timeout_remove(&ctx->to);
+
+	hreq = http_client_request(http_client,
+		"GET", "example.com", "/reconnect-failure-2.txt",
+		test_client_reconnect_failure_response2, ctx);
+	http_client_request_set_port(hreq, bind_ports[0]);
+	http_client_request_submit(hreq);
+}
+
+static void
+test_client_reconnect_failure_response1(
+	const struct http_response *resp,
+	struct _reconnect_failure_ctx *ctx)
+{
+	if (debug)
+		i_debug("RESPONSE: %u %s", resp->status, resp->reason);
+
+	test_assert(resp->status == 200);
+	test_assert(resp->reason != NULL && *resp->reason != '\0');
+
+	ctx->to = timeout_add_short(999,
+		test_client_reconnect_failure_next, ctx);
+}
+
+static bool
+test_client_reconnect_failure(const struct http_client_settings *client_set)
+{
+	struct http_client_request *hreq;
+	struct _reconnect_failure_ctx *ctx;
+
+	ctx = i_new(struct _reconnect_failure_ctx, 1);
+	
+	http_client = http_client_init(client_set);
+
+	hreq = http_client_request(http_client,
+		"GET", "example.com", "/reconnect-failure-1.txt",
+		test_client_reconnect_failure_response1, ctx);
+	http_client_request_set_port(hreq, bind_ports[0]);
+	http_client_request_submit(hreq);
+
+	return TRUE;
+}
+
+/* test */
+
+static void test_reconnect_failure(void)
+{
+	struct http_client_settings http_client_set;
+
+	test_client_defaults(&http_client_set);
+	http_client_set.dns_client_socket_path = "./dns-test";
+	http_client_set.dns_ttl_msecs = 2000;
+	http_client_set.max_idle_time_msecs = 1000;
+	http_client_set.max_attempts = 1;
+	http_client_set.request_timeout_msecs = 1000;
+
+	test_begin("reconnect failure");
+	test_run_client_server(&http_client_set,
+		test_client_reconnect_failure,
+		test_server_reconnect_failure, 1,
+		test_dns_reconnect_failure);
+	test_end();
+}
 
 /*
  * All tests
@@ -2482,6 +2633,7 @@
 	test_dns_lookup_failure,
 	test_dns_lookup_ttl,
 	test_peer_reuse_failure,
+	test_reconnect_failure,
 	NULL
 };