Skip to content

Commit 9b294e0

Browse files
committed
CDRIVER-789 crash on network err from removed node
If the scanner knows about A, a primary, and B, a secondary: * B is removed from the replica set and shut down * The scanner begins, launching async commands to check A and B * A responds first and tells the scanner to remove B * mongoc_topology_reconcile removes B's mongoc_topology_scanner_node_t and destroys it, stream and all * But the mongoc_async_cmd_t for B is still active in the scanner, with the same stream. * B's mongoc_async_cmd_t fails with a network error, and passes the error into the command callback. * The command callback sees the error and destroys the same stream *again*, crashing. The solution is to not destroy removed nodes in mongoc_topology_reconcile, but close their streams, and mark them "retired" for the remainder of the scan. At the end of the scan destroy retired nodes.
1 parent 5b14714 commit 9b294e0

16 files changed

+365
-41
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ function(mongoc_add_example example use_shared)
242242
endfunction()
243243

244244
set(test-libmongoc-sources
245+
${SOURCE_DIR}/tests/debug-stream.c
245246
${SOURCE_DIR}/tests/ha-test.c
246247
${SOURCE_DIR}/tests/json-test.c
247248
${SOURCE_DIR}/tests/mock_server/future.c

src/mongoc/mongoc-async-cmd-private.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ typedef enum
3939
MONGOC_ASYNC_CMD_RECV_LEN,
4040
MONGOC_ASYNC_CMD_RECV_RPC,
4141
MONGOC_ASYNC_CMD_ERROR_STATE,
42+
MONGOC_ASYNC_CMD_CANCELED_STATE,
4243
} mongoc_async_cmd_state_t;
4344

4445
typedef struct _mongoc_async_cmd

src/mongoc/mongoc-async-cmd.c

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,14 @@ mongoc_async_cmd_result_t
4444
_mongoc_async_cmd_phase_recv_len (mongoc_async_cmd_t *cmd);
4545
mongoc_async_cmd_result_t
4646
_mongoc_async_cmd_phase_recv_rpc (mongoc_async_cmd_t *cmd);
47-
mongoc_async_cmd_result_t
48-
_mongoc_async_cmd_phase_error (mongoc_async_cmd_t *cmd);
4947

5048
static const _monogc_async_cmd_phase_t gMongocCMDPhases[] = {
5149
_mongoc_async_cmd_phase_setup,
5250
_mongoc_async_cmd_phase_send,
5351
_mongoc_async_cmd_phase_recv_len,
5452
_mongoc_async_cmd_phase_recv_rpc,
55-
_mongoc_async_cmd_phase_error,
53+
NULL, /* no callback for MONGOC_ASYNC_CMD_ERROR_STATE */
54+
NULL, /* no callback for MONGOC_ASYNC_CMD_CANCELED_STATE */
5655
};
5756

5857
#ifdef MONGOC_ENABLE_SSL
@@ -97,8 +96,14 @@ mongoc_async_cmd_run (mongoc_async_cmd_t *acmd)
9796
{
9897
mongoc_async_cmd_result_t result;
9998
int64_t rtt;
99+
_monogc_async_cmd_phase_t phase_callback;
100100

101-
result = gMongocCMDPhases[acmd->state](acmd);
101+
phase_callback = gMongocCMDPhases[acmd->state];
102+
if (phase_callback) {
103+
result = phase_callback (acmd);
104+
} else {
105+
result = MONGOC_ASYNC_CMD_ERROR;
106+
}
102107

103108
if (result == MONGOC_ASYNC_CMD_IN_PROGRESS) {
104109
return true;
@@ -109,7 +114,7 @@ mongoc_async_cmd_run (mongoc_async_cmd_t *acmd)
109114
if (result == MONGOC_ASYNC_CMD_SUCCESS) {
110115
acmd->cb (result, &acmd->reply, rtt, acmd->data, &acmd->error);
111116
} else {
112-
/* we're either in ERROR or TIMEOUT */
117+
/* we're in ERROR, TIMEOUT, or CANCELED */
113118
acmd->cb (result, NULL, rtt, acmd->data, &acmd->error);
114119
}
115120

@@ -385,9 +390,3 @@ _mongoc_async_cmd_phase_recv_rpc (mongoc_async_cmd_t *acmd)
385390

386391
return MONGOC_ASYNC_CMD_IN_PROGRESS;
387392
}
388-
389-
mongoc_async_cmd_result_t
390-
_mongoc_async_cmd_phase_error (mongoc_async_cmd_t *acmd)
391-
{
392-
return MONGOC_ASYNC_CMD_ERROR;
393-
}

src/mongoc/mongoc-async.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ mongoc_async_run (mongoc_async_t *async,
9292

9393
DL_FOREACH_SAFE (async->cmds, acmd, tmp)
9494
{
95+
/* async commands are sorted by expire_at */
9596
if (now > acmd->expire_at) {
9697
acmd->cb (MONGOC_ASYNC_CMD_TIMEOUT, NULL, (now - acmd->start_time), acmd->data,
9798
&acmd->error);

src/mongoc/mongoc-client-private.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,12 @@ mongoc_client_t *
6969
_mongoc_client_new_from_uri (const mongoc_uri_t *uri,
7070
mongoc_topology_t *topology);
7171

72+
mongoc_stream_t *
73+
mongoc_client_default_stream_initiator (const mongoc_uri_t *uri,
74+
const mongoc_host_list_t *host,
75+
void *user_data,
76+
bson_error_t *error);
77+
7278
mongoc_stream_t *
7379
_mongoc_client_create_stream (mongoc_client_t *client,
7480
const mongoc_host_list_t *host,

src/mongoc/mongoc-client.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ mongoc_client_connect_unix (const mongoc_uri_t *uri,
265265
*--------------------------------------------------------------------------
266266
*/
267267

268-
static mongoc_stream_t *
268+
mongoc_stream_t *
269269
mongoc_client_default_stream_initiator (const mongoc_uri_t *uri,
270270
const mongoc_host_list_t *host,
271271
void *user_data,

src/mongoc/mongoc-cluster.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,6 +1161,8 @@ _mongoc_cluster_add_node (mongoc_cluster_t *cluster,
11611161
RETURN (NULL);
11621162
}
11631163

1164+
BSON_ASSERT (!scanner_node->retired);
1165+
11641166
stream = scanner_node->stream;
11651167
if (!stream) {
11661168
MONGOC_WARNING ("Failed connection to %s", sd->connection_address);
@@ -1251,6 +1253,8 @@ mongoc_cluster_fetch_stream (mongoc_cluster_t *cluster,
12511253
goto FETCH_FAIL;
12521254
}
12531255

1256+
BSON_ASSERT (!scanner_node->retired);
1257+
12541258
stream = scanner_node->stream;
12551259
if (!stream) {
12561260
goto FETCH_FAIL;
@@ -2047,6 +2051,8 @@ _mongoc_cluster_check_interval (mongoc_cluster_t *cluster,
20472051
return false;
20482052
}
20492053

2054+
BSON_ASSERT (!scanner_node->retired);
2055+
20502056
stream = scanner_node->stream;
20512057

20522058
if (!stream) {

src/mongoc/mongoc-topology-scanner-private.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ typedef struct mongoc_topology_scanner_node
5454

5555
struct mongoc_topology_scanner_node *next;
5656
struct mongoc_topology_scanner_node *prev;
57+
58+
bool retired;
5759
} mongoc_topology_scanner_node_t;
5860

5961
typedef struct mongoc_topology_scanner
@@ -84,7 +86,7 @@ mongoc_topology_scanner_new (const mongoc_uri_t *uri,
8486
void
8587
mongoc_topology_scanner_destroy (mongoc_topology_scanner_t *ts);
8688

87-
void
89+
mongoc_topology_scanner_node_t *
8890
mongoc_topology_scanner_add (mongoc_topology_scanner_t *ts,
8991
const mongoc_host_list_t *host,
9092
uint32_t id);
@@ -95,6 +97,9 @@ mongoc_topology_scanner_add_and_scan (mongoc_topology_scanner_t *ts,
9597
uint32_t id,
9698
int64_t timeout_msec);
9799

100+
void
101+
mongoc_topology_scanner_node_retire (mongoc_topology_scanner_node_t *node);
102+
98103
void
99104
mongoc_topology_scanner_node_destroy (mongoc_topology_scanner_node_t *node,
100105
bool failed);
@@ -108,6 +113,9 @@ bool
108113
mongoc_topology_scanner_work (mongoc_topology_scanner_t *ts,
109114
int32_t timeout_msec);
110115

116+
void
117+
mongoc_topology_scanner_reset (mongoc_topology_scanner_t *ts);
118+
111119
mongoc_topology_scanner_node_t *
112120
mongoc_topology_scanner_get_node (mongoc_topology_scanner_t *ts,
113121
uint32_t id);

src/mongoc/mongoc-topology-scanner.c

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@ mongoc_topology_scanner_destroy (mongoc_topology_scanner_t *ts)
9797
bson_free (ts);
9898
}
9999

100-
static mongoc_topology_scanner_node_t *
101-
_add_node (mongoc_topology_scanner_t *ts,
102-
const mongoc_host_list_t *host,
103-
uint32_t id)
100+
mongoc_topology_scanner_node_t *
101+
mongoc_topology_scanner_add (mongoc_topology_scanner_t *ts,
102+
const mongoc_host_list_t *host,
103+
uint32_t id)
104104
{
105105
mongoc_topology_scanner_node_t *node;
106106

@@ -128,15 +128,6 @@ _add_node (mongoc_topology_scanner_t *ts,
128128
return node;
129129
}
130130

131-
void
132-
mongoc_topology_scanner_add (mongoc_topology_scanner_t *ts,
133-
const mongoc_host_list_t *host,
134-
uint32_t id)
135-
{
136-
_add_node (ts, host, id);
137-
}
138-
139-
140131
void
141132
mongoc_topology_scanner_add_and_scan (mongoc_topology_scanner_t *ts,
142133
const mongoc_host_list_t *host,
@@ -147,7 +138,7 @@ mongoc_topology_scanner_add_and_scan (mongoc_topology_scanner_t *ts,
147138

148139
BSON_ASSERT (timeout_msec < INT32_MAX);
149140

150-
node = _add_node (ts, host, id);
141+
node = mongoc_topology_scanner_add (ts, host, id);
151142

152143
if (node && mongoc_topology_scanner_node_setup (node)) {
153144
node->cmd = mongoc_async_cmd (
@@ -158,9 +149,20 @@ mongoc_topology_scanner_add_and_scan (mongoc_topology_scanner_t *ts,
158149
node, (int32_t) timeout_msec);
159150
}
160151

152+
/* if setup fails the node stays in the scanner. destroyed after the scan. */
161153
return;
162154
}
163155

156+
void
157+
mongoc_topology_scanner_node_retire (mongoc_topology_scanner_node_t *node)
158+
{
159+
if (node->cmd) {
160+
node->cmd->state = MONGOC_ASYNC_CMD_CANCELED_STATE;
161+
}
162+
163+
node->retired = true;
164+
}
165+
164166
void
165167
mongoc_topology_scanner_node_destroy (mongoc_topology_scanner_node_t *node, bool failed)
166168
{
@@ -236,13 +238,19 @@ mongoc_topology_scanner_ismaster_handler (mongoc_async_cmd_result_t async_status
236238
bson_error_t *error)
237239
{
238240
mongoc_topology_scanner_node_t *node;
239-
int64_t now = bson_get_monotonic_time ();
241+
int64_t now;
240242

241243
bson_return_if_fail (data);
242244

243245
node = (mongoc_topology_scanner_node_t *)data;
244246
node->cmd = NULL;
245247

248+
if (node->retired) {
249+
return;
250+
}
251+
252+
now = bson_get_monotonic_time ();
253+
246254
/* if no ismaster response, async cmd had an error or timed out */
247255
if (!ismaster_response ||
248256
async_status == MONGOC_ASYNC_CMD_ERROR ||
@@ -396,6 +404,8 @@ mongoc_topology_scanner_node_setup (mongoc_topology_scanner_node_t *node)
396404

397405
if (node->stream) { return true; }
398406

407+
BSON_ASSERT (!node->retired);
408+
399409
if (node->ts->initiator) {
400410
sock_stream = node->ts->initiator (node->ts->uri, &node->host,
401411
node->ts->initiator_context, &error);
@@ -474,6 +484,9 @@ mongoc_topology_scanner_start (mongoc_topology_scanner_t *ts,
474484
/* check node if it last failed before current cooldown period began */
475485
if (node->last_failed < cooldown) {
476486
if (mongoc_topology_scanner_node_setup (node)) {
487+
488+
BSON_ASSERT (!node->cmd);
489+
477490
node->cmd = mongoc_async_cmd (
478491
ts->async, node->stream, ts->setup,
479492
node->host.host, "admin",
@@ -509,14 +522,37 @@ mongoc_topology_scanner_work (mongoc_topology_scanner_t *ts,
509522
r = mongoc_async_run (ts->async, timeout_msec);
510523

511524
if (! r) {
512-
mongoc_host_list_destroy_all (ts->seen);
513-
ts->seen = NULL;
514525
ts->in_progress = false;
515526
}
516527

517528
return r;
518529
}
519530

531+
/*
532+
*--------------------------------------------------------------------------
533+
*
534+
* mongoc_topology_scanner_reset --
535+
*
536+
* Reset "retired" nodes that failed or were removed in the previous
537+
* scan.
538+
*
539+
*--------------------------------------------------------------------------
540+
*/
541+
542+
void
543+
mongoc_topology_scanner_reset (mongoc_topology_scanner_t *ts)
544+
{
545+
mongoc_topology_scanner_node_t *node, *tmp;
546+
547+
DL_FOREACH_SAFE (ts->nodes, node, tmp) {
548+
if (node->retired) {
549+
mongoc_topology_scanner_node_destroy (node, true);
550+
}
551+
}
552+
553+
mongoc_host_list_destroy_all (ts->seen);
554+
ts->seen = NULL;
555+
}
520556

521557
void
522558
mongoc_topology_scanner_set_async_cb (mongoc_topology_scanner_t *ts,

src/mongoc/mongoc-topology.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ mongoc_topology_reconcile (mongoc_topology_t *topology) {
6363
/* Remove removed nodes */
6464
DL_FOREACH_SAFE (scanner->nodes, ele, tmp) {
6565
if (! mongoc_topology_description_server_by_id (description, ele->id)) {
66-
mongoc_topology_scanner_node_destroy (ele, true);
66+
mongoc_topology_scanner_node_retire (ele);
6767
}
6868
}
6969
}
@@ -326,6 +326,8 @@ _mongoc_topology_do_blocking_scan (mongoc_topology_t *topology) {
326326
while (_mongoc_topology_run_scanner (topology,
327327
topology->connect_timeout_msec)) {}
328328

329+
/* "retired" nodes can be checked again in the next scan */
330+
mongoc_topology_scanner_reset (topology->scanner);
329331
topology->last_scan = bson_get_monotonic_time ();
330332
topology->stale = false;
331333
}
@@ -697,6 +699,10 @@ void * _mongoc_topology_run_background (void *data)
697699
topology->connect_timeout_msec)) {}
698700

699701
mongoc_mutex_lock (&topology->mutex);
702+
703+
/* "retired" nodes can be checked again in the next scan */
704+
mongoc_topology_scanner_reset (topology->scanner);
705+
700706
topology->last_scan = bson_get_monotonic_time ();
701707
topology->scanning = false;
702708
mongoc_mutex_unlock (&topology->mutex);

0 commit comments

Comments
 (0)