Commit 88430b9

chore: restart node with slowest batch processing time in systest (#8219)
This PR updates the system test `//rs/tests/message_routing:rejoin_test_long_rounds` to restart the node with the slowest batch processing time among the nodes required for consensus. This way, the restarted node cannot catch up with the subnet without additional measures (to be implemented in the future).
1 parent 54346da commit 88430b9
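
The selection rule can be made concrete with a small sketch. Assuming the nodes are sorted from fastest to slowest average batch-processing time and, as in the diff's comment below, `f = n / 3` (integer division) faults are tolerated, the nodes at indices `0` through `n - (f + 1)` are required for consensus; the test restarts the one at index `n - (f + 1)`. The helper below is illustrative only, not part of the PR:

```rust
/// Illustrative helper (not in the PR): 0-based index of the node to
/// restart, given `n` nodes sorted from fastest to slowest average
/// batch processing time. With `f = n / 3` tolerated faults, the
/// fastest `n - f` nodes are required for consensus, and the slowest
/// of them sits at index `n - (f + 1)`.
fn rejoin_node_index(n: usize) -> usize {
    n - (n / 3 + 1)
}

fn main() {
    // Example from the diff's comment: for n = 13, f = 4, index 8.
    assert_eq!(rejoin_node_index(13), 8);
}
```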

File tree

2 files changed: +94 -26 lines changed

rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs

Lines changed: 92 additions & 20 deletions
@@ -36,6 +36,8 @@ const LATEST_CERTIFIED_HEIGHT: &str = "state_manager_latest_certified_height";
 const LAST_MANIFEST_HEIGHT: &str = "state_manager_last_computed_manifest_height";
 const REPLICATED_STATE_PURGE_HEIGHT_DISK: &str = "replicated_state_purge_height_disk";
 
+const METRIC_PROCESS_BATCH_PHASE_DURATION: &str = "mr_process_batch_phase_duration_seconds";
+
 const GIB: u64 = 1 << 30;
 
 pub async fn rejoin_test(
@@ -263,29 +265,27 @@ async fn deploy_busy_canister(agent: &Agent, effective_canister_id: PrincipalId,
         .expect("Failed to set up a busy canister.");
 }
 
-pub async fn rejoin_test_long_rounds(
-    env: TestEnv,
+async fn deploy_canisters_for_long_rounds(
+    logger: &slog::Logger,
+    nodes: Vec<IcNodeSnapshot>,
     num_canisters: usize,
-    dkg_interval: u64,
-    rejoin_node: IcNodeSnapshot,
-    agent_node: IcNodeSnapshot,
 ) {
-    let logger = env.logger();
-    let agent = agent_node.build_default_agent_async().await;
+    let init_node = nodes[0].clone();
+    let agent = init_node.build_default_agent_async().await;
     let ic00 = ManagementCanister::create(&agent);
 
     let num_seed_canisters = 4;
     info!(
         logger,
         "Deploying {} seed canisters on a node {} ...",
         num_seed_canisters,
-        agent_node.get_public_url()
+        init_node.get_public_url()
     );
     let mut create_seed_canisters_futs = vec![];
     for _ in 0..num_seed_canisters {
         create_seed_canisters_futs.push(deploy_seed_canister(
             &ic00,
-            agent_node.effective_canister_id(),
+            init_node.effective_canister_id(),
         ));
     }
     let seed_canisters = join_all(create_seed_canisters_futs).await;
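
For context, the deploy helpers above follow a build-then-await pattern: all deployment futures are collected first and then driven concurrently with `join_all`. A minimal sketch of that pattern, using a stand-in async function instead of the test's `deploy_seed_canister` and assuming the `futures` and `tokio` crates are available:

```rust
use futures::future::join_all;

// Stand-in for deploy_seed_canister: any async unit of work.
async fn deploy(id: usize) -> usize {
    id
}

#[tokio::main]
async fn main() {
    let mut futs = vec![];
    for i in 0..4 {
        // Futures are lazy: nothing runs until they are awaited below.
        futs.push(deploy(i));
    }
    // join_all polls all futures concurrently and preserves their order.
    let results = join_all(futs).await;
    assert_eq!(results, vec![0, 1, 2, 3]);
}
```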
@@ -312,24 +312,62 @@ pub async fn rejoin_test_long_rounds(
     }
 
     // We deploy 8 "busy" canisters: this way,
-    // there are 2 canisters per each of the 4 scheduler cores
+    // there are 2 canisters per each of the 4 scheduler threads
     // and thus every thread executes 2 x 1.8B = 3.6B instructions.
     let num_busy_canisters = 8;
     info!(
         logger,
         "Deploying {} busy canisters on a node {} ...",
         num_busy_canisters,
-        agent_node.get_public_url()
+        init_node.get_public_url()
     );
     let mut create_busy_canisters_futs = vec![];
     for _ in 0..num_busy_canisters {
         create_busy_canisters_futs.push(deploy_busy_canister(
             &agent,
-            agent_node.effective_canister_id(),
-            &logger,
+            init_node.effective_canister_id(),
+            logger,
         ));
     }
     join_all(create_busy_canisters_futs).await;
+}
+
+pub async fn rejoin_test_long_rounds(
+    env: TestEnv,
+    nodes: Vec<IcNodeSnapshot>,
+    num_canisters: usize,
+    dkg_interval: u64,
+) {
+    let logger = env.logger();
+    deploy_canisters_for_long_rounds(&logger, nodes.clone(), num_canisters).await;
+
+    // Sort nodes by their average duration to process a batch.
+    let mut average_process_batch_durations = vec![];
+    for node in &nodes {
+        let duration = average_process_batch_duration(&logger, node.clone()).await;
+        average_process_batch_durations.push(duration);
+    }
+    let mut paired: Vec<_> = average_process_batch_durations
+        .into_iter()
+        .zip(nodes.into_iter())
+        .collect();
+    paired.sort_by(|(k1, _), (k2, _)| k1.total_cmp(k2));
+    let sorted_nodes: Vec<_> = paired.into_iter().map(|(_, v)| v).collect();
+
+    // The fastest node will be the reference node used to check
+    // the latest certified height of the subnet.
+    let reference_node = sorted_nodes[0].clone();
+
+    // The restarted node will be the slowest node
+    // required for consensus in terms of batch processing time:
+    // this way, the restarted node cannot catch up with the subnet
+    // without additional measures (to be implemented in the future).
+    // E.g., for `n = 13`, we have `f = 4` and the nodes at indices
+    // `0`, `1`, ..., `n - (f + 1)` are required for consensus,
+    // i.e., we restart the node at (0-based) index
+    // `n - (f + 1) = n - (n / 3 + 1)`.
+    let n = sorted_nodes.len();
+    let rejoin_node = sorted_nodes[n - (n / 3 + 1)].clone();
 
     info!(
         logger,
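
One detail worth noting in the sorting step above: `f64` implements only `PartialOrd` (because of `NaN`), so `sort_by` cannot use a plain `<` comparison; the diff uses `f64::total_cmp`, which imposes a total order on all floats. A standalone sketch with made-up durations and hypothetical node names:

```rust
fn main() {
    // Hypothetical (average batch duration in seconds, node name) pairs.
    let mut paired = vec![(0.42_f64, "node-a"), (0.17, "node-b"), (0.93, "node-c")];
    // total_cmp is needed because f64 is not Ord (NaN has no ordering).
    paired.sort_by(|(d1, _), (d2, _)| d1.total_cmp(d2));
    assert_eq!(paired.first().unwrap().1, "node-b"); // fastest node
    assert_eq!(paired.last().unwrap().1, "node-c"); // slowest node
}
```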
@@ -345,15 +383,15 @@ pub async fn rejoin_test_long_rounds(
     // This way, the restarted node starts from that CUP
     // and we can assert it to catch up until the next CUP.
     info!(logger, "Waiting for a CUP ...");
-    let agent_node_status = agent_node
+    let reference_node_status = reference_node
         .status_async()
         .await
-        .expect("Failed to get status of agent_node");
-    let latest_certified_height = agent_node_status
+        .expect("Failed to get status of reference_node");
+    let latest_certified_height = reference_node_status
         .certified_height
-        .expect("Failed to get certified height of agent_node")
+        .expect("Failed to get certified height of reference_node")
         .get();
-    wait_for_cup(&logger, latest_certified_height, agent_node.clone()).await;
+    wait_for_cup(&logger, latest_certified_height, reference_node.clone()).await;
 
     info!(logger, "Start the killed node again ...");
     rejoin_node.vm().start();
@@ -362,7 +400,7 @@ pub async fn rejoin_test_long_rounds(
     let last_cup_height = wait_for_cup(
         &logger,
         latest_certified_height + dkg_interval + 1,
-        agent_node.clone(),
+        reference_node.clone(),
     )
     .await;
 
@@ -449,6 +487,37 @@ pub async fn assert_state_sync_has_happened(
     panic!("Couldn't verify that a state sync has happened after {NUM_RETRIES} attempts.");
 }
 
+async fn average_process_batch_duration(log: &slog::Logger, node: IcNodeSnapshot) -> f64 {
+    let label_sum = format!("{METRIC_PROCESS_BATCH_PHASE_DURATION}_sum");
+    let label_count = format!("{METRIC_PROCESS_BATCH_PHASE_DURATION}_count");
+    let metrics = fetch_metrics::<f64>(log, node.clone(), vec![&label_sum, &label_count]).await;
+    let sums: Vec<_> = metrics
+        .iter()
+        .filter_map(|(k, v)| {
+            if k.starts_with(&label_sum) {
+                Some(v)
+            } else {
+                None
+            }
+        })
+        .collect();
+    let counts: Vec<_> = metrics
+        .iter()
+        .filter_map(|(k, v)| {
+            if k.starts_with(&label_count) {
+                Some(v)
+            } else {
+                None
+            }
+        })
+        .collect();
+    assert_eq!(sums.len(), counts.len());
+    sums.iter()
+        .zip(counts.iter())
+        .map(|(x, y)| x[0] / y[0])
+        .sum()
+}
+
 pub async fn fetch_metrics<T>(
     log: &slog::Logger,
     node: IcNodeSnapshot,
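
The averaging in `average_process_batch_duration` reads like a Prometheus-style histogram computation: assuming `mr_process_batch_phase_duration_seconds` is recorded per batch-processing phase, each phase's mean is its `_sum` divided by its `_count`, and summing the per-phase means estimates the total time to process one batch. A self-contained sketch of that arithmetic with invented numbers:

```rust
fn main() {
    // Hypothetical (_sum seconds, _count) pairs for three processing
    // phases, all observed over the same 40 batches.
    let phases = [(12.0_f64, 40.0_f64), (3.0, 40.0), (9.0, 40.0)];
    // Mean duration per phase is sum / count; the per-phase means add
    // up to the average end-to-end batch processing time.
    let avg_batch_duration: f64 = phases.iter().map(|(sum, count)| sum / count).sum();
    assert!((avg_batch_duration - 0.6).abs() < 1e-9); // (12 + 3 + 9) / 40
}
```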
@@ -468,7 +537,10 @@
         let metrics_result = metrics.fetch::<T>().await;
         match metrics_result {
             Ok(result) => {
-                if labels.iter().all(|&label| result.contains_key(label)) {
+                if labels
+                    .iter()
+                    .all(|&label| result.iter().any(|(k, _)| k.starts_with(label)))
+                {
                     info!(log, "Metrics successfully scraped {:?}.", result);
                     return result;
                 } else {
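
The `fetch_metrics` change swaps an exact-key lookup for a prefix match. The likely reason: a metric with labels is scraped under a key that carries its label set, so the bare metric name never matches exactly. A sketch of the difference, with a hypothetical `phase` label set:

```rust
use std::collections::BTreeMap;

fn main() {
    // A scraped result keyed by metric name plus label set
    // (the `phase` label here is hypothetical).
    let mut result: BTreeMap<String, Vec<f64>> = BTreeMap::new();
    result.insert(
        "mr_process_batch_phase_duration_seconds_sum{phase=\"induct\"}".into(),
        vec![12.0],
    );
    let label = "mr_process_batch_phase_duration_seconds_sum";
    // An exact lookup misses the labeled series; a prefix scan finds it.
    assert!(!result.contains_key(label));
    assert!(result.iter().any(|(k, _)| k.starts_with(label)));
}
```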

rs/tests/message_routing/rejoin_test_long_rounds.rs

Lines changed: 2 additions & 6 deletions
@@ -7,7 +7,7 @@ Runbook::
 . pick a random node and install 4 "seed" canisters through it (the state sync test canister is used as "seed")
 . create 100,000 canisters via the "seed" canisters (in parallel)
 . deploy 8 "busy" canisters (universal canister with heartbeats executing 1.8B instructions)
-. pick another random node and kill that node
+. pick the slowest node required for consensus in terms of batch processing time and kill that node
 . wait for the subnet producing a CUP
 . start the killed node
@@ -118,15 +118,11 @@ async fn test_async(env: TestEnv, config: Config) {
     let topology_snapshot = env.topology_snapshot();
     let (app_subnet, _) = get_app_subnet_and_node(&topology_snapshot);
 
-    let mut nodes = app_subnet.nodes();
-    let rejoin_node = nodes.next().unwrap();
-    let agent_node = nodes.next().unwrap();
     rejoin_test_long_rounds(
         env,
+        app_subnet.nodes().collect(),
         config.num_canisters,
         DKG_INTERVAL,
-        rejoin_node.clone(),
-        agent_node.clone(),
     )
     .await;
 }
