Skip to content

Commit 52004fb

Browse files
committed
CDRIVER-2638 retry scan on some errs
1 parent 3603ef5 commit 52004fb

15 files changed

+325
-82
lines changed

src/mongoc/mongoc-client.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1556,7 +1556,7 @@ _mongoc_client_retryable_write_command_with_stream (
15561556
* server does not support retryable writes, fall through and allow the
15571557
* original error to be reported. */
15581558
if (!ret && is_retryable && (error->domain == MONGOC_ERROR_STREAM ||
1559-
mongoc_cluster_is_not_master_error (error))) {
1559+
mongoc_cluster_is_not_master_or_recovering_error (error))) {
15601560
bson_error_t ignored_error;
15611561

15621562
/* each write command may be retried at most once */

src/mongoc/mongoc-cluster-private.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ typedef struct _mongoc_cluster_t {
7272
} mongoc_cluster_t;
7373

7474
bool
75-
mongoc_cluster_is_not_master_error (const bson_error_t *error);
75+
mongoc_cluster_is_not_master_or_recovering_error (const bson_error_t *error);
7676

7777
void
7878
mongoc_cluster_init (mongoc_cluster_t *cluster,

src/mongoc/mongoc-cluster.c

Lines changed: 31 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -406,11 +406,18 @@ mongoc_cluster_run_command_opquery (mongoc_cluster_t *cluster,
406406
}
407407

408408

409+
bool
410+
mongoc_cluster_is_not_master_or_recovering_error (const bson_error_t *error)
411+
{
412+
return strstr (error->message, "not master") ||
413+
strstr (error->message, "node is recovering");
414+
}
415+
416+
409417
bool
410418
mongoc_cluster_is_not_master_error (const bson_error_t *error)
411419
{
412-
return !strncmp (error->message, "not master", 10) ||
413-
!strncmp (error->message, "node is recovering", 18);
420+
return strstr (error->message, "not master") != NULL;
414421
}
415422

416423

@@ -419,13 +426,32 @@ handle_not_master_error (mongoc_cluster_t *cluster,
419426
uint32_t server_id,
420427
const bson_error_t *error)
421428
{
422-
if (mongoc_cluster_is_not_master_error (error)) {
429+
mongoc_topology_t *topology = cluster->client->topology;
430+
431+
if (mongoc_cluster_is_not_master_or_recovering_error (error)) {
423432
/* Server Discovery and Monitoring Spec: "When the client sees a 'not
424433
* master' or 'node is recovering' error it MUST replace the server's
425434
* description with a default ServerDescription of type Unknown."
426435
*/
427-
mongoc_topology_invalidate_server (
428-
cluster->client->topology, server_id, error);
436+
mongoc_topology_invalidate_server (topology, server_id, error);
437+
if (topology->single_threaded) {
438+
/* SDAM Spec: "For single-threaded clients, in the case of a 'not
439+
* master' error, the client MUST check the server immediately... For a
440+
* 'node is recovering' error, single-threaded clients MUST NOT check
441+
* the server, as an immediate server check is unlikely to find a
442+
* usable server."
443+
* Instead of an immediate check, mark the topology as stale so the
444+
* next command scans all servers (to find the new primary). */
445+
if (mongoc_cluster_is_not_master_error (error)) {
446+
cluster->client->topology->stale = true;
447+
}
448+
} else {
449+
/* SDAM Spec: "Multi-threaded and asynchronous clients MUST request an
450+
* immediate check of the server."
451+
* Instead of requesting a check of the one server, request a scan
452+
* to all servers (to find the new primary). */
453+
_mongoc_topology_request_scan (topology);
454+
}
429455
}
430456
}
431457

src/mongoc/mongoc-collection.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3048,7 +3048,7 @@ mongoc_collection_find_and_modify_with_opts (
30483048
* server does not support retryable writes, fall through and allow the
30493049
* original error to be reported. */
30503050
if (!ret && is_retryable && (error->domain == MONGOC_ERROR_STREAM ||
3051-
mongoc_cluster_is_not_master_error (error))) {
3051+
mongoc_cluster_is_not_master_or_recovering_error (error))) {
30523052
bson_error_t ignored_error;
30533053

30543054
/* each write command may be retried at most once */

src/mongoc/mongoc-topology-private.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -151,5 +151,6 @@ void
151151
_mongoc_topology_do_blocking_scan (mongoc_topology_t *topology,
152152
bson_error_t *error);
153153
bson_t* _mongoc_topology_get_ismaster (mongoc_topology_t* topology);
154-
154+
void
155+
_mongoc_topology_request_scan (mongoc_topology_t *topology);
155156
#endif

src/mongoc/mongoc-topology-scanner-private.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,6 @@ typedef struct mongoc_topology_scanner {
8888
mongoc_topology_scanner_setup_err_cb_t setup_err_cb;
8989
mongoc_topology_scanner_cb_t cb;
9090
void *cb_data;
91-
bool in_progress;
9291
const mongoc_uri_t *uri;
9392
mongoc_async_cmd_setup_t setup;
9493
mongoc_stream_initiator_t initiator;
@@ -193,6 +192,10 @@ mongoc_topology_scanner_set_ssl_opts (mongoc_topology_scanner_t *ts,
193192
mongoc_ssl_opt_t *opts);
194193
#endif
195194

195+
bool
196+
mongoc_topology_scanner_node_in_cooldown (mongoc_topology_scanner_node_t *node,
197+
int64_t when);
198+
196199
/* for testing. */
197200
mongoc_stream_t *
198201
_mongoc_topology_scanner_tcp_initiate (mongoc_async_cmd_t *acmd);

src/mongoc/mongoc-topology-scanner.c

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -814,7 +814,7 @@ mongoc_topology_scanner_node_setup (mongoc_topology_scanner_node_t *node,
814814
*
815815
*--------------------------------------------------------------------------
816816
*/
817-
static bool
817+
bool
818818
mongoc_topology_scanner_node_in_cooldown (mongoc_topology_scanner_node_t *node,
819819
int64_t when)
820820
{
@@ -860,7 +860,9 @@ mongoc_topology_scanner_in_cooldown (mongoc_topology_scanner_t *ts,
860860
*
861861
* Initializes the scanner and begins a full topology check. This
862862
* should be called once before calling mongoc_topology_scanner_work()
863-
* repeatedly to complete the scan.
863+
* to complete the scan.
864+
*
865+
* The topology mutex must be held by the caller.
864866
*
865867
* If "obey_cooldown" is true, this is a single-threaded blocking scan
866868
* that must obey the Server Discovery And Monitoring Spec's cooldownMS:
@@ -887,10 +889,6 @@ mongoc_topology_scanner_start (mongoc_topology_scanner_t *ts,
887889

888890
BSON_ASSERT (ts);
889891

890-
if (ts->in_progress) {
891-
return;
892-
}
893-
894892
_delete_retired_nodes (ts);
895893

896894
now = bson_get_monotonic_time ();

src/mongoc/mongoc-topology.c

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,6 @@
3333
static void
3434
_mongoc_topology_background_thread_stop (mongoc_topology_t *topology);
3535

36-
static void
37-
_mongoc_topology_request_scan (mongoc_topology_t *topology);
38-
3936
static bool
4037
_mongoc_topology_reconcile_add_nodes (mongoc_server_description_t *sd,
4138
mongoc_topology_t *topology)
@@ -888,7 +885,7 @@ _mongoc_topology_host_by_id (mongoc_topology_t *topology,
888885
*--------------------------------------------------------------------------
889886
*/
890887

891-
static void
888+
void
892889
_mongoc_topology_request_scan (mongoc_topology_t *topology)
893890
{
894891
topology->scan_requested = true;
@@ -1456,8 +1453,10 @@ _mongoc_topology_end_sessions_cmd (mongoc_topology_t *topology, bson_t *cmd)
14561453

14571454
/* Locks topology->mutex and retrieves (possibly constructing) the handshake
14581455
* on the topology scanner. */
1459-
bson_t* _mongoc_topology_get_ismaster (mongoc_topology_t* topology) {
1460-
bson_t* cmd;
1456+
bson_t *
1457+
_mongoc_topology_get_ismaster (mongoc_topology_t *topology)
1458+
{
1459+
bson_t *cmd;
14611460
mongoc_mutex_lock (&topology->mutex);
14621461
cmd = _mongoc_topology_scanner_get_ismaster (topology->scanner);
14631462
mongoc_mutex_unlock (&topology->mutex);

src/mongoc/mongoc-write-command.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -593,7 +593,7 @@ _mongoc_write_opmsg (mongoc_write_command_t *command,
593593
* and allow the original error to be reported. */
594594
if (!ret && is_retryable &&
595595
(error->domain == MONGOC_ERROR_STREAM ||
596-
mongoc_cluster_is_not_master_error (error))) {
596+
mongoc_cluster_is_not_master_or_recovering_error (error))) {
597597
bson_error_t ignored_error;
598598

599599
/* each write command may be retried at most once */

tests/TestSuite.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -575,6 +575,7 @@ test_error (const char *format, ...) BSON_GNUC_PRINTF (1, 2);
575575
BSON_FUNC); \
576576
abort (); \
577577
} \
578+
_mongoc_usleep (10 * 1000); \
578579
} \
579580
} while (0)
580581

0 commit comments

Comments
 (0)