Skip to content

Commit 74600b6

Browse files
committed
CDRIVER-2237 re-check servers after network errors
Obeying the Server Discovery and Monitoring Spec, mark a server Unknown after any network besides a timeout, re-check the server before using it again.
1 parent 7d24183 commit 74600b6

File tree

6 files changed

+101
-47
lines changed

6 files changed

+101
-47
lines changed

src/mongoc/mongoc-cluster-private.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,10 @@ void
8282
mongoc_cluster_destroy (mongoc_cluster_t *cluster);
8383

8484
void
85-
mongoc_cluster_disconnect_node (mongoc_cluster_t *cluster, uint32_t id);
85+
mongoc_cluster_disconnect_node (mongoc_cluster_t *cluster,
86+
uint32_t id,
87+
bool invalidate,
88+
const bson_error_t *why);
8689

8790
int32_t
8891
mongoc_cluster_get_max_bson_obj_size (mongoc_cluster_t *cluster);
@@ -101,8 +104,7 @@ _mongoc_cluster_buffer_iovec (mongoc_iovec_t *iov,
101104
char *buffer);
102105

103106
bool
104-
mongoc_cluster_check_interval (mongoc_cluster_t *cluster,
105-
uint32_t server_id);
107+
mongoc_cluster_check_interval (mongoc_cluster_t *cluster, uint32_t server_id);
106108

107109
bool
108110
mongoc_cluster_sendv_to_server (mongoc_cluster_t *cluster,

src/mongoc/mongoc-cluster.c

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ mongoc_cluster_run_command_internal (mongoc_cluster_t *cluster,
409409
cluster->iov.len,
410410
cluster->sockettimeoutms,
411411
error)) {
412-
mongoc_cluster_disconnect_node (cluster, cmd->server_id);
412+
mongoc_cluster_disconnect_node (cluster, cmd->server_id, true, error);
413413

414414
/* add info about the command to writev_full's error message */
415415
_bson_error_message_printf (
@@ -427,11 +427,12 @@ mongoc_cluster_run_command_internal (mongoc_cluster_t *cluster,
427427
reply_header_size,
428428
reply_header_size,
429429
cluster->sockettimeoutms)) {
430-
mongoc_cluster_disconnect_node (cluster, cmd->server_id);
431430
RUN_CMD_ERR (MONGOC_ERROR_STREAM,
432431
MONGOC_ERROR_STREAM_SOCKET,
433432
"socket error or timeout");
434433

434+
mongoc_cluster_disconnect_node (
435+
cluster, cmd->server_id, !mongoc_stream_timed_out (stream), error);
435436
GOTO (done);
436437
}
437438

@@ -1347,7 +1348,9 @@ _mongoc_cluster_auth_node (mongoc_cluster_t *cluster,
13471348
* mongoc_cluster_disconnect_node --
13481349
*
13491350
* Remove a node from the set of nodes. This should be done if
1350-
* a stream in the set is found to be invalid.
1351+
* a stream in the set is found to be invalid. If @invalidate is
1352+
* true, also mark the server Unknown in the topology description,
1353+
* passing the error information from @why as the reason.
13511354
*
13521355
* WARNING: pointers to a disconnected mongoc_cluster_node_t or
13531356
* its stream are now invalid, be careful of dangling pointers.
@@ -1363,9 +1366,13 @@ _mongoc_cluster_auth_node (mongoc_cluster_t *cluster,
13631366
*/
13641367

13651368
void
1366-
mongoc_cluster_disconnect_node (mongoc_cluster_t *cluster, uint32_t server_id)
1369+
mongoc_cluster_disconnect_node (mongoc_cluster_t *cluster,
1370+
uint32_t server_id,
1371+
bool invalidate,
1372+
const bson_error_t *why /* IN */)
13671373
{
13681374
mongoc_topology_t *topology = cluster->client->topology;
1375+
13691376
ENTRY;
13701377

13711378
if (topology->single_threaded) {
@@ -1377,13 +1384,15 @@ mongoc_cluster_disconnect_node (mongoc_cluster_t *cluster, uint32_t server_id)
13771384
/* might never actually have connected */
13781385
if (scanner_node && scanner_node->stream) {
13791386
mongoc_topology_scanner_node_disconnect (scanner_node, true);
1380-
EXIT;
13811387
}
1382-
EXIT;
13831388
} else {
13841389
mongoc_set_rm (cluster->nodes, server_id);
13851390
}
13861391

1392+
if (invalidate) {
1393+
mongoc_topology_invalidate_server (topology, server_id, why);
1394+
}
1395+
13871396
EXIT;
13881397
}
13891398

@@ -1606,10 +1615,9 @@ _mongoc_cluster_stream_for_server (mongoc_cluster_t *cluster,
16061615
* ServerDescription of type Unknown, and fill the ServerDescription's
16071616
* error field with useful information."
16081617
*
1609-
* error was filled by fetch_stream_single/pooled, pass it to invalidate()
1618+
* error was filled by fetch_stream_single/pooled, pass it to disconnect()
16101619
*/
1611-
mongoc_cluster_disconnect_node (cluster, server_id);
1612-
mongoc_topology_invalidate_server (topology, server_id, err_ptr);
1620+
mongoc_cluster_disconnect_node (cluster, server_id, true, err_ptr);
16131621
}
16141622

16151623
RETURN (server_stream);
@@ -1643,18 +1651,23 @@ mongoc_cluster_stream_for_server (mongoc_cluster_t *cluster,
16431651
bson_error_t *error)
16441652
{
16451653
mongoc_server_stream_t *server_stream = NULL;
1654+
bson_error_t err_local;
16461655

16471656
ENTRY;
16481657

16491658
BSON_ASSERT (cluster);
16501659
BSON_ASSERT (server_id);
16511660

1661+
if (!error) {
1662+
error = &err_local;
1663+
}
1664+
16521665
server_stream = _mongoc_cluster_stream_for_server (
16531666
cluster, server_id, reconnect_ok, error);
16541667

16551668
if (!server_stream) {
16561669
/* failed */
1657-
mongoc_cluster_disconnect_node (cluster, server_id);
1670+
mongoc_cluster_disconnect_node (cluster, server_id, true, error);
16581671
}
16591672

16601673
RETURN (server_stream);
@@ -1801,7 +1814,8 @@ mongoc_cluster_fetch_stream_pooled (mongoc_cluster_t *cluster,
18011814
if (timestamp == -1 || cluster_node->timestamp < timestamp) {
18021815
/* topology change or net error during background scan made us remove
18031816
* or replace server description since node's birth. destroy node. */
1804-
mongoc_cluster_disconnect_node (cluster, server_id);
1817+
mongoc_cluster_disconnect_node (
1818+
cluster, server_id, false /* invalidate */, NULL);
18051819
} else {
18061820
return _mongoc_cluster_create_server_stream (
18071821
topology, server_id, cluster_node->stream, error);
@@ -2206,7 +2220,11 @@ mongoc_cluster_check_interval (mongoc_cluster_t *cluster, uint32_t server_id)
22062220

22072221
if (scanner_node->last_used + (1000 * CHECK_CLOSED_DURATION_MSEC) < now) {
22082222
if (mongoc_stream_check_closed (stream)) {
2209-
mongoc_cluster_disconnect_node (cluster, server_id);
2223+
bson_set_error (&error,
2224+
MONGOC_ERROR_STREAM,
2225+
MONGOC_ERROR_STREAM_SOCKET,
2226+
"connection closed");
2227+
mongoc_cluster_disconnect_node (cluster, server_id, true, &error);
22102228
return false;
22112229
}
22122230
}
@@ -2223,9 +2241,7 @@ mongoc_cluster_check_interval (mongoc_cluster_t *cluster, uint32_t server_id)
22232241
bson_destroy (&command);
22242242

22252243
if (!r) {
2226-
mongoc_cluster_disconnect_node (cluster, server_id);
2227-
mongoc_topology_invalidate_server (
2228-
topology, server_id, &error /* IN */);
2244+
mongoc_cluster_disconnect_node (cluster, server_id, true, &error);
22292245
}
22302246
}
22312247

@@ -2420,6 +2436,7 @@ mongoc_cluster_try_recv (mongoc_cluster_t *cluster,
24202436
bson_error_t *error)
24212437
{
24222438
uint32_t server_id;
2439+
bson_error_t err_local;
24232440
int32_t msg_len;
24242441
int32_t max_msg_size;
24252442
off_t pos;
@@ -2435,6 +2452,10 @@ mongoc_cluster_try_recv (mongoc_cluster_t *cluster,
24352452

24362453
TRACE ("Waiting for reply from server_id \"%u\"", server_id);
24372454

2455+
if (!error) {
2456+
error = &err_local;
2457+
}
2458+
24382459
/*
24392460
* Buffer the message length to determine how much more to read.
24402461
*/
@@ -2444,7 +2465,11 @@ mongoc_cluster_try_recv (mongoc_cluster_t *cluster,
24442465
MONGOC_DEBUG (
24452466
"Could not read 4 bytes, stream probably closed or timed out");
24462467
mongoc_counter_protocol_ingress_error_inc ();
2447-
mongoc_cluster_disconnect_node (cluster, server_id);
2468+
mongoc_cluster_disconnect_node (
2469+
cluster,
2470+
server_id,
2471+
!mongoc_stream_timed_out (server_stream->stream),
2472+
error);
24482473
RETURN (false);
24492474
}
24502475

@@ -2459,7 +2484,7 @@ mongoc_cluster_try_recv (mongoc_cluster_t *cluster,
24592484
MONGOC_ERROR_PROTOCOL,
24602485
MONGOC_ERROR_PROTOCOL_INVALID_REPLY,
24612486
"Corrupt or malicious reply received.");
2462-
mongoc_cluster_disconnect_node (cluster, server_id);
2487+
mongoc_cluster_disconnect_node (cluster, server_id, true, error);
24632488
mongoc_counter_protocol_ingress_error_inc ();
24642489
RETURN (false);
24652490
}
@@ -2472,7 +2497,11 @@ mongoc_cluster_try_recv (mongoc_cluster_t *cluster,
24722497
msg_len - 4,
24732498
cluster->sockettimeoutms,
24742499
error)) {
2475-
mongoc_cluster_disconnect_node (cluster, server_id);
2500+
mongoc_cluster_disconnect_node (
2501+
cluster,
2502+
server_id,
2503+
!mongoc_stream_timed_out (server_stream->stream),
2504+
error);
24762505
mongoc_counter_protocol_ingress_error_inc ();
24772506
RETURN (false);
24782507
}
@@ -2485,7 +2514,7 @@ mongoc_cluster_try_recv (mongoc_cluster_t *cluster,
24852514
MONGOC_ERROR_PROTOCOL,
24862515
MONGOC_ERROR_PROTOCOL_INVALID_REPLY,
24872516
"Failed to decode reply from server.");
2488-
mongoc_cluster_disconnect_node (cluster, server_id);
2517+
mongoc_cluster_disconnect_node (cluster, server_id, true, error);
24892518
mongoc_counter_protocol_ingress_error_inc ();
24902519
RETURN (false);
24912520
}

src/mongoc/mongoc-cursor.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ _mongoc_cursor_destroy (mongoc_cursor_t *cursor)
527527
if (!cursor->done) {
528528
/* The only way to stop an exhaust cursor is to kill the connection */
529529
mongoc_cluster_disconnect_node (&cursor->client->cluster,
530-
cursor->server_id);
530+
cursor->server_id, false, NULL);
531531
}
532532
} else if (cursor->rpc.reply.cursor_id) {
533533
bson_strncpy (db, cursor->ns, cursor->dblen + 1);

tests/test-mongoc-client.c

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,8 @@ test_mongoc_client_authenticate_cached (bool pooled)
450450
mongoc_cursor_destroy (cursor);
451451

452452
if (pooled) {
453-
mongoc_cluster_disconnect_node (&client->cluster, 1);
453+
mongoc_cluster_disconnect_node (
454+
&client->cluster, 1, false /* invalidate */, NULL);
454455
} else {
455456
scanner_node =
456457
mongoc_topology_scanner_get_node (client->topology->scanner, 1);
@@ -1969,7 +1970,9 @@ _test_mongoc_client_ssl_opts (bool pooled)
19691970
host = test_framework_get_host ();
19701971
port = test_framework_get_port ();
19711972
uri_str = bson_strdup_printf (
1972-
"mongodb://%s:%d/?serverSelectionTimeoutMS=1000", host, port);
1973+
"mongodb://%s:%d/?serverSelectionTimeoutMS=1000&connectTimeoutMS=1000",
1974+
host,
1975+
port);
19731976

19741977
uri_str_auth = test_framework_add_user_password_from_env (uri_str);
19751978
uri_str_auth_ssl = bson_strdup_printf ("%s&ssl=true", uri_str_auth);
@@ -2518,8 +2521,6 @@ _cmd (mock_server_t *server,
25182521

25192522
if (server_replies) {
25202523
mock_server_replies_simple (request, "{'ok': 1}");
2521-
} else {
2522-
mock_server_hangs_up (request);
25232524
}
25242525

25252526
r = future_get_bool (future);
@@ -2619,6 +2620,7 @@ _test_ssl_reconnect (bool pooled)
26192620
mock_server_run (server);
26202621

26212622
uri = mongoc_uri_copy (mock_server_get_uri (server));
2623+
mongoc_uri_set_option_as_int32 (uri, "socketTimeoutMS", 1000);
26222624

26232625
if (pooled) {
26242626
capture_logs (true);
@@ -2633,11 +2635,11 @@ _test_ssl_reconnect (bool pooled)
26332635
ASSERT_OR_PRINT (_cmd (server, client, true /* server replies */, &error),
26342636
error);
26352637

2636-
/* man-in-the-middle: certificate changed, for example expired*/
2638+
/* man-in-the-middle: certificate changed, for example expired */
26372639
server_opts.pem_file = CERT_EXPIRED;
26382640
mock_server_set_ssl_opts (server, &server_opts);
26392641

2640-
/* server closes connections */
2642+
/* network timeout */
26412643

26422644
ASSERT (!_cmd (server, client, false /* server hangs up */, &error));
26432645
if (pooled) {
@@ -3062,7 +3064,7 @@ _test_null_error_pointer (bool pooled)
30623064
/* disconnect */
30633065
mock_server_destroy (server);
30643066
if (pooled) {
3065-
mongoc_cluster_disconnect_node (&client->cluster, 1);
3067+
mongoc_cluster_disconnect_node (&client->cluster, 1, false, NULL);
30663068
} else {
30673069
mongoc_topology_scanner_node_t *scanner_node;
30683070

tests/test-mongoc-cluster.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ _test_cluster_command_timeout (bool pooled)
231231
future_t *future;
232232
request_t *request;
233233
uint16_t client_port;
234+
mongoc_server_description_t *sd;
234235
bson_t reply;
235236

236237
capture_logs (true);
@@ -261,6 +262,11 @@ _test_cluster_command_timeout (bool pooled)
261262
MONGOC_ERROR_STREAM_SOCKET,
262263
"Failed to send \"foo\" command with database \"db\"");
263264

265+
/* a network timeout does NOT invalidate the server description */
266+
sd = mongoc_topology_server_by_id (client->topology, 1, NULL);
267+
BSON_ASSERT (sd->type != MONGOC_SERVER_UNKNOWN);
268+
mongoc_server_description_destroy (sd);
269+
264270
/* late response */
265271
mock_server_replies_simple (request, "{'ok': 1, 'bar': 1}");
266272
request_destroy (request);
@@ -319,6 +325,7 @@ _test_write_disconnect (bool legacy)
319325
future_t *future;
320326
request_t *request;
321327
mongoc_topology_scanner_node_t *scanner_node;
328+
mongoc_server_description_t *sd;
322329

323330
if (!TestSuite_CheckMockServerAllowed ()) {
324331
return;
@@ -369,6 +376,11 @@ _test_write_disconnect (bool legacy)
369376
1 /* server_id */);
370377
ASSERT (scanner_node && !scanner_node->stream);
371378

379+
/* a hangup DOES invalidate the server description */
380+
sd = mongoc_topology_server_by_id (client->topology, 1, NULL);
381+
BSON_ASSERT (sd->type == MONGOC_SERVER_UNKNOWN);
382+
mongoc_server_description_destroy (sd);
383+
372384
mongoc_collection_destroy (collection);
373385
request_destroy (request);
374386
future_destroy (future);

tests/test-mongoc-topology.c

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,8 @@ _test_server_removed_during_handshake (bool pooled)
858858
mock_server_get_host_and_port (server));
859859

860860
/* pretend to close a connection. does NOT affect server description yet */
861-
mongoc_cluster_disconnect_node (&client->cluster, 1);
861+
mongoc_cluster_disconnect_node (
862+
&client->cluster, 1, false /* invalidate */, NULL);
862863
sd = mongoc_client_get_server_description (client, 1);
863864
/* still primary */
864865
ASSERT_CMPINT ((int) MONGOC_SERVER_RS_PRIMARY, ==, sd->type);
@@ -1408,20 +1409,28 @@ test_topology_install (TestSuite *suite)
14081409
test_framework_skip_if_slow);
14091410
TestSuite_AddMockServerTest (
14101411
suite, "/Topology/add_and_scan_failure", test_add_and_scan_failure);
1411-
TestSuite_AddMockServerTest (
1412-
suite, "/Topology/ismaster_retry/single/hangup", test_ismaster_retry_single_hangup);
1413-
TestSuite_AddMockServerTest (
1414-
suite, "/Topology/ismaster_retry/single/timeout", test_ismaster_retry_single_timeout);
1415-
TestSuite_AddMockServerTest (
1416-
suite, "/Topology/ismaster_retry/single/hangup/fail", test_ismaster_retry_single_hangup_fail);
1417-
TestSuite_AddMockServerTest (
1418-
suite, "/Topology/ismaster_retry/single/timeout/fail", test_ismaster_retry_single_timeout_fail);
1419-
TestSuite_AddMockServerTest (
1420-
suite, "/Topology/ismaster_retry/pooled/hangup", test_ismaster_retry_pooled_hangup);
1421-
TestSuite_AddMockServerTest (
1422-
suite, "/Topology/ismaster_retry/pooled/timeout", test_ismaster_retry_pooled_timeout);
1423-
TestSuite_AddMockServerTest (
1424-
suite, "/Topology/ismaster_retry/pooled/hangup/fail", test_ismaster_retry_pooled_hangup_fail);
1425-
TestSuite_AddMockServerTest (
1426-
suite, "/Topology/ismaster_retry/pooled/timeout/fail", test_ismaster_retry_pooled_timeout_fail);
1412+
TestSuite_AddMockServerTest (suite,
1413+
"/Topology/ismaster_retry/single/hangup",
1414+
test_ismaster_retry_single_hangup);
1415+
TestSuite_AddMockServerTest (suite,
1416+
"/Topology/ismaster_retry/single/timeout",
1417+
test_ismaster_retry_single_timeout);
1418+
TestSuite_AddMockServerTest (suite,
1419+
"/Topology/ismaster_retry/single/hangup/fail",
1420+
test_ismaster_retry_single_hangup_fail);
1421+
TestSuite_AddMockServerTest (suite,
1422+
"/Topology/ismaster_retry/single/timeout/fail",
1423+
test_ismaster_retry_single_timeout_fail);
1424+
TestSuite_AddMockServerTest (suite,
1425+
"/Topology/ismaster_retry/pooled/hangup",
1426+
test_ismaster_retry_pooled_hangup);
1427+
TestSuite_AddMockServerTest (suite,
1428+
"/Topology/ismaster_retry/pooled/timeout",
1429+
test_ismaster_retry_pooled_timeout);
1430+
TestSuite_AddMockServerTest (suite,
1431+
"/Topology/ismaster_retry/pooled/hangup/fail",
1432+
test_ismaster_retry_pooled_hangup_fail);
1433+
TestSuite_AddMockServerTest (suite,
1434+
"/Topology/ismaster_retry/pooled/timeout/fail",
1435+
test_ismaster_retry_pooled_timeout_fail);
14271436
}

0 commit comments

Comments
 (0)