Skip to content

Commit b046456

Browse files
lwshangclaude
andcommitted
test: fix flaky test_route_with_one_healthy_and_one_unhealthy_seed
Replace fixed sleep durations with retry loops that actively poll for expected routing state. This eliminates timing-related flakiness in CI environments where task scheduling is less predictable. Changes: - Add wait_for_routing_to_domains helper that polls until expected domains are available - Update test to use retry loops with timeouts instead of fixed sleeps - Remove timing assumptions that failed in CI due to scheduler variance Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 3473511 commit b046456

File tree

2 files changed

+52
-9
lines changed

2 files changed

+52
-9
lines changed

ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,8 @@ mod tests {
389389
},
390390
node::Node,
391391
test_utils::{
392-
assert_routed_domains, route_n_times, NodeHealthCheckerMock, NodesFetcherMock,
392+
assert_routed_domains, route_n_times, wait_for_routing_to_domains,
393+
NodeHealthCheckerMock, NodesFetcherMock,
393394
},
394395
},
395396
RouteProvider, RoutesStats,
@@ -726,18 +727,24 @@ mod tests {
726727
.with_check_period(check_interval)
727728
.build();
728729
route_provider.start().await;
729-
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
730730
let route_provider = Arc::new(route_provider);
731731

732-
// Test 1: calls to route() return only a healthy seed ic0.app.
733-
let routed_domains = route_n_times(3, Arc::clone(&route_provider));
734-
assert_routed_domains(routed_domains, vec![node_1.domain()]);
732+
// Test 1: Wait for routing to stabilize and verify only the healthy seed ic0.app is returned.
733+
wait_for_routing_to_domains(
734+
Arc::clone(&route_provider),
735+
vec![node_1.domain()],
736+
Duration::from_secs(3),
737+
)
738+
.await;
735739

736-
// Test 2: calls to route() return two healthy seeds, as the unhealthy seed becomes healthy.
740+
// Test 2: Make the unhealthy seed healthy and wait for routing to reflect both healthy seeds.
737741
checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]);
738-
tokio::time::sleep(2 * check_interval).await;
739-
let routed_domains = route_n_times(6, Arc::clone(&route_provider));
740-
assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()]);
742+
wait_for_routing_to_domains(
743+
Arc::clone(&route_provider),
744+
vec![node_1.domain(), node_2.domain()],
745+
Duration::from_secs(5),
746+
)
747+
.await;
741748
}
742749

743750
#[tokio::test]

ic-agent/src/agent/route_provider/dynamic_routing/test_utils.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,42 @@ where
5858
}
5959
}
6060

61+
/// Polls the route provider until the expected domains are available or timeout is reached.
62+
/// This is more reliable than fixed sleeps for async state updates in tests.
63+
pub(super) async fn wait_for_routing_to_domains(
64+
route_provider: Arc<impl RouteProvider + ?Sized>,
65+
expected_domains: Vec<&str>,
66+
timeout: Duration,
67+
) {
68+
let start = std::time::Instant::now();
69+
let poll_interval = Duration::from_millis(50);
70+
let sample_size = expected_domains.len() * 2; // Sample multiple times to account for probabilistic routing
71+
72+
loop {
73+
if start.elapsed() >= timeout {
74+
panic!(
75+
"Timeout waiting for routing to expected domains: {:?}. Elapsed: {:?}",
76+
expected_domains,
77+
start.elapsed()
78+
);
79+
}
80+
81+
let routed_domains = route_n_times(sample_size, Arc::clone(&route_provider));
82+
let unique_domains: HashSet<String> = routed_domains.into_iter().collect();
83+
84+
// Check if all expected domains are present
85+
let expected_set: HashSet<&str> = expected_domains.iter().copied().collect();
86+
let actual_set: HashSet<&str> = unique_domains.iter().map(|s| s.as_str()).collect();
87+
88+
if expected_set == actual_set {
89+
// Success - all expected domains are now being routed to
90+
return;
91+
}
92+
93+
crate::util::sleep(poll_interval).await;
94+
}
95+
}
96+
6197
#[derive(Debug)]
6298
pub(super) struct NodesFetcherMock {
6399
// A set of nodes, existing in the topology.

0 commit comments

Comments
 (0)