
Commit 87e9a59

RUST-1842 Update prose tests for mongos deprioritization during retryable ops (#1397)
1 parent 7bfe7c1 commit 87e9a59

File tree: 3 files changed (+105, -32 lines)

  src/test/spec/retryable_reads.rs
  src/test/spec/retryable_writes.rs
  src/test/util/event.rs


src/test/spec/retryable_reads.rs

Lines changed: 48 additions & 15 deletions
```diff
@@ -1,4 +1,4 @@
-use std::{future::IntoFuture, time::Duration};
+use std::{future::IntoFuture, sync::Arc, time::Duration};
 
 use crate::bson::doc;
 
@@ -8,6 +8,7 @@ use crate::{
         cmap::{CmapEvent, ConnectionCheckoutFailedReason},
         command::CommandEvent,
     },
+    options::SelectionCriteria,
     runtime::{self, AsyncJoinHandle},
     test::{
         block_connection_supported,
@@ -174,23 +175,40 @@ async fn retry_read_different_mongos() {
     client_options.hosts.drain(2..);
     client_options.retry_reads = Some(true);
 
-    let mut guards = vec![];
-    for ix in [0, 1] {
-        let mut opts = client_options.clone();
-        opts.hosts.remove(ix);
-        opts.direct_connection = Some(true);
-        let client = Client::for_test().options(opts).await;
+    let hosts = client_options.hosts.clone();
+    let client = Client::for_test()
+        .options(client_options)
+        .monitor_events()
+        .await;
 
+    // NOTE: This test uses a single client to set failpoints on each mongos and run the find
+    // operation. This avoids flakiness caused by a race between server discovery and server
+    // selection.
+
+    // When a client is first created, it initializes its view of the topology with all configured
+    // mongos addresses, but marks each as Unknown until it completes the server discovery process
+    // by sending and receiving "hello" messages Unknown servers are not eligible for server
+    // selection.
+
+    // Previously, we created a new client for each call to `enable_fail_point` and for the find
+    // operation. Each new client restarted the discovery process, and sometimes had not yet marked
+    // both mongos servers as usable, leading to test failures when the retry logic couldn't find a
+    // second eligible server.
+
+    // By reusing a single client, each `enable_fail_point` call forces discovery to complete for
+    // the corresponding mongos. As a result, when the find operation runs, the client has a
+    // fully discovered topology and can reliably select between both servers.
+    let mut guards = Vec::new();
+    for address in hosts {
+        let address = address.clone();
         let fail_point = FailPoint::fail_command(&["find"], FailPointMode::Times(1))
             .error_code(6)
-            .close_connection(true);
+            .selection_criteria(SelectionCriteria::Predicate(Arc::new(move |info| {
+                info.description.address == address
+            })));
         guards.push(client.enable_fail_point(fail_point).await.unwrap());
     }
 
-    let client = Client::for_test()
-        .options(client_options)
-        .monitor_events()
-        .await;
     let result = client
         .database("test")
         .collection::<crate::bson::Document>("retry_read_different_mongos")
@@ -211,6 +229,14 @@ async fn retry_read_different_mongos() {
         "unexpected events: {:#?}",
         events,
     );
+    let first_failed = events[1].as_command_failed().unwrap();
+    let first_address = &first_failed.connection.address;
+    let second_failed = events[3].as_command_failed().unwrap();
+    let second_address = &second_failed.connection.address;
+    assert_ne!(
+        first_address, second_address,
+        "Failed commands did not occur on two different mongos instances"
+    );
 
     drop(guards); // enforce lifetime
 }
@@ -235,12 +261,11 @@ async fn retry_read_same_mongos() {
         client_options.direct_connection = Some(true);
         let client = Client::for_test().options(client_options).await;
 
-        let fail_point = FailPoint::fail_command(&["find"], FailPointMode::Times(1))
-            .error_code(6)
-            .close_connection(true);
+        let fail_point = FailPoint::fail_command(&["find"], FailPointMode::Times(1)).error_code(6);
         client.enable_fail_point(fail_point).await.unwrap()
     };
 
+    client_options.direct_connection = Some(false);
     let client = Client::for_test()
         .options(client_options)
         .monitor_events()
@@ -265,6 +290,14 @@ async fn retry_read_same_mongos() {
         "unexpected events: {:#?}",
         events,
     );
+    let first_failed = events[1].as_command_failed().unwrap();
+    let first_address = &first_failed.connection.address;
+    let second_failed = events[3].as_command_succeeded().unwrap();
+    let second_address = &second_failed.connection.address;
+    assert_eq!(
+        first_address, second_address,
+        "Failed command and retry did not occur on the same mongos instance",
+    );
 
     drop(fp_guard); // enforce lifetime
 }
```
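
The core of this change is visible in the loop above: rather than building a direct-connection client per mongos and forcing a connection close, each failpoint is scoped to one server through a selection predicate. Below is a condensed sketch of that pattern; it reuses the crate-internal test helpers shown in the diff (`Client::for_test`, `FailPoint`, `FailPointMode`, `enable_fail_point`) together with the driver's `SelectionCriteria::Predicate`, and is an illustration rather than a standalone test.

```rust
// Sketch only: relies on the crate-internal test helpers used in the diff above.
use std::sync::Arc;

use crate::options::SelectionCriteria;

// `client_options.hosts` holds the two mongos addresses kept by `drain(2..)`;
// `client` is the single event-monitoring test client created earlier.
let hosts = client_options.hosts.clone();
let mut guards = Vec::new();
for address in hosts {
    // Own a copy of the address so the 'static predicate closure can capture it.
    let address = address.clone();
    let fail_point = FailPoint::fail_command(&["find"], FailPointMode::Times(1))
        .error_code(6)
        // Restrict the failpoint command to the one mongos matching this address.
        .selection_criteria(SelectionCriteria::Predicate(Arc::new(move |info| {
            info.description.address == address
        })));
    // Hold the guard so the failpoint is disabled again when the test finishes.
    guards.push(client.enable_fail_point(fail_point).await.unwrap());
}
```

Because each `enable_fail_point` call must itself select the targeted mongos, it doubles as a wait for that server's discovery to complete, which is what removes the race described in the comment block.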

src/test/spec/retryable_writes.rs

Lines changed: 49 additions & 16 deletions
```diff
@@ -1,6 +1,6 @@
 use std::{sync::Arc, time::Duration};
 
-use crate::bson::Bson;
+use crate::{bson::Bson, options::SelectionCriteria};
 use tokio::sync::Mutex;
 
 use crate::{
@@ -317,27 +317,44 @@ async fn retry_write_different_mongos() {
         );
         return;
     }
+
+    // NOTE: This test uses a single client to set failpoints on each mongos and run the insert
+    // operation. This avoids flakiness caused by a race between server discovery and server
+    // selection.
+
+    // When a client is first created, it initializes its view of the topology with all configured
+    // mongos addresses, but marks each as Unknown until it completes the server discovery process
+    // by sending and receiving "hello" messages Unknown servers are not eligible for server
+    // selection.
+
+    // Previously, we created a new client for each call to `enable_fail_point` and for the insert
+    // operation. Each new client restarted the discovery process, and sometimes had not yet marked
+    // both mongos servers as usable, leading to test failures when the retry logic couldn't insert
+    // a second eligible server.
+
+    // By reusing a single client, each `enable_fail_point` call forces discovery to complete for
+    // the corresponding mongos. As a result, when the insert operation runs, the client has a
+    // fully discovered topology and can reliably select between both servers.
     client_options.hosts.drain(2..);
     client_options.retry_writes = Some(true);
+    let hosts = client_options.hosts.clone();
+    let client = Client::for_test()
+        .options(client_options)
+        .monitor_events()
+        .await;
 
-    let mut guards = vec![];
-    for ix in [0, 1] {
-        let mut opts = client_options.clone();
-        opts.hosts.remove(ix);
-        opts.direct_connection = Some(true);
-        let client = Client::for_test().options(opts).await;
-
+    let mut guards = Vec::new();
+    for address in hosts {
+        let address = address.clone();
         let fail_point = FailPoint::fail_command(&["insert"], FailPointMode::Times(1))
             .error_code(6)
-            .error_labels(vec![RETRYABLE_WRITE_ERROR])
-            .close_connection(true);
+            .error_labels([RETRYABLE_WRITE_ERROR])
+            .selection_criteria(SelectionCriteria::Predicate(Arc::new(move |info| {
+                info.description.address == address
+            })));
         guards.push(client.enable_fail_point(fail_point).await.unwrap());
     }
 
-    let client = Client::for_test()
-        .options(client_options)
-        .monitor_events()
-        .await;
     let result = client
         .database("test")
         .collection::<crate::bson::Document>("retry_write_different_mongos")
@@ -358,6 +375,14 @@ async fn retry_write_different_mongos() {
         "unexpected events: {:#?}",
         events,
     );
+    let first_failed = events[1].as_command_failed().unwrap();
+    let first_address = &first_failed.connection.address;
+    let second_failed = events[3].as_command_failed().unwrap();
+    let second_address = &second_failed.connection.address;
+    assert_ne!(
+        first_address, second_address,
+        "Failed commands did not occur on two different mongos instances"
+    );
 
     drop(guards); // enforce lifetime
 }
@@ -384,11 +409,11 @@ async fn retry_write_same_mongos() {
 
         let fail_point = FailPoint::fail_command(&["insert"], FailPointMode::Times(1))
            .error_code(6)
-            .error_labels(vec![RETRYABLE_WRITE_ERROR])
-            .close_connection(true);
+            .error_labels(vec![RETRYABLE_WRITE_ERROR]);
         client.enable_fail_point(fail_point).await.unwrap()
     };
 
+    client_options.direct_connection = Some(false);
     let client = Client::for_test()
         .options(client_options)
         .monitor_events()
@@ -413,6 +438,14 @@ async fn retry_write_same_mongos() {
         "unexpected events: {:#?}",
         events,
     );
+    let first_failed = events[1].as_command_failed().unwrap();
+    let first_address = &first_failed.connection.address;
+    let second_failed = events[3].as_command_succeeded().unwrap();
+    let second_address = &second_failed.connection.address;
+    assert_eq!(
+        first_address, second_address,
+        "Failed commands did not occur on the same mongos instance",
+    );
 
     drop(fp_guard); // enforce lifetime
 }
```
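
Both write tests now verify which mongos served each attempt. The diff does not show how `events` is collected, so the sketch below assumes it is the ordered list of command events captured by the monitoring client, observed as started, failed, started, then failed (different-mongos test) or succeeded (same-mongos test); the indices 1 and 3 in the diff reflect that assumed ordering.

```rust
// Sketch of the verification step, under the event-ordering assumption above.
let first_attempt = events[1].as_command_failed().unwrap();
let retry_attempt = events[3].as_command_failed().unwrap();

// retry_write_different_mongos: the retried insert must land on the other mongos.
assert_ne!(
    &first_attempt.connection.address,
    &retry_attempt.connection.address,
    "Failed commands did not occur on two different mongos instances"
);

// retry_write_same_mongos instead unwraps events[3] with as_command_succeeded()
// and asserts that the two addresses are equal.
```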

src/test/util/event.rs

Lines changed: 8 additions & 1 deletion
```diff
@@ -9,7 +9,7 @@ use crate::{
     bson::doc,
     event::{
         cmap::CmapEvent,
-        command::{CommandEvent, CommandSucceededEvent},
+        command::{CommandEvent, CommandFailedEvent, CommandSucceededEvent},
         sdam::SdamEvent,
     },
     test::get_client_options,
@@ -101,6 +101,13 @@ impl CommandEvent {
             _ => None,
         }
     }
+
+    pub(crate) fn as_command_failed(&self) -> Option<&CommandFailedEvent> {
+        match self {
+            CommandEvent::Failed(e) => Some(e),
+            _ => None,
+        }
+    }
 }
 
 #[derive(Clone, Debug)]
```
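
The new `as_command_failed` accessor mirrors the existing `as_command_succeeded` and lets tests pull the `CommandFailedEvent`, and through it the server address, out of a captured event stream. A small hypothetical usage sketch, assuming an `events: Vec<CommandEvent>` gathered by a monitoring test client and the `connection.address` field used in the tests above:

```rust
// Hypothetical usage; `events` and its origin are assumptions, not part of this commit.
let failed_addresses: Vec<_> = events
    .iter()
    // Keep only CommandFailedEvent entries via the new accessor.
    .filter_map(CommandEvent::as_command_failed)
    .map(|failed| failed.connection.address.clone())
    .collect();

// When the retry is deprioritized away from the failing mongos, the two failed
// attempts should report two distinct addresses.
assert_eq!(failed_addresses.len(), 2);
assert_ne!(failed_addresses[0], failed_addresses[1]);
```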
