Skip to content

Commit d74e176

Browse files
daniel-thom and claude
authored
Add startup jitter to slurm job runners for thundering herd mitigation (#224)
* Add startup jitter to slurm job runners for thundering herd mitigation When many Slurm allocations start simultaneously (e.g., 1000 nodes), all torc-slurm-job-runner processes would contact the server at the same instant, causing connection timeouts and SQLite lock contention. Add --startup-delay-seconds flag to torc-slurm-job-runner that causes each runner to sleep a deterministic random duration (hashed from hostname, job ID, node ID, task PID) before its first API call. The delay window is computed automatically by schedule_slurm_nodes based on total runner count (scaling from 0s for 1 runner up to 60s for 100+ runners), accounting for start_one_worker_per_node. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cd2d1bb commit d74e176

File tree

9 files changed

+261
-0
lines changed

9 files changed

+261
-0
lines changed

docs/src/specialized/hpc/hpc-deployment.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,59 @@ curl -s "$TORC_API_URL/workflows" | head
204204
exit
205205
```
206206

207+
## Large-Scale Deployments
208+
209+
### Startup Jitter (Thundering Herd Mitigation)
210+
211+
When many Slurm allocations start simultaneously — for example, 1000 single-node jobs scheduled at
212+
once — all `torc-slurm-job-runner` processes may contact the server at the same instant. This
213+
"thundering herd" can overwhelm the server with concurrent requests, causing connection timeouts and
214+
SQLite lock contention.
215+
216+
Torc mitigates this automatically. When `torc slurm schedule-nodes` generates sbatch scripts, it
217+
calculates a startup delay window based on the total number of runners that will start:
218+
219+
| Total runners | Max startup delay |
220+
| ------------- | ----------------- |
221+
| 1 | 0 s (disabled) |
222+
| 2–10 | 2–10 s |
223+
| 11–100 | 10–60 s |
224+
| > 100 | 60 s |
225+
226+
Each runner picks a deterministic delay within this window (hashed from its hostname, Slurm job ID,
227+
node ID, and task PID), then sleeps before making its first API call. This spreads the initial burst
228+
of requests across the delay window.
229+
230+
The delay is passed to `torc-slurm-job-runner` via the `--startup-delay-seconds` flag in the
231+
generated sbatch script. You can override it manually if needed:
232+
233+
```bash
234+
# In a custom sbatch script: set a 120-second jitter window
235+
torc-slurm-job-runner $URL $WORKFLOW_ID $OUTPUT --startup-delay-seconds 120
236+
```
237+
238+
When `start_one_worker_per_node` is enabled, the total runner count includes all nodes across all
239+
allocations (e.g., 10 allocations × 4 nodes = 40 runners), so the delay window scales appropriately.
240+
241+
To disable staggered startup, set `staggered_start: false` in `execution_config`:
242+
243+
```yaml
244+
execution_config:
245+
staggered_start: false
246+
```
247+
248+
### Server Tuning for Large Workflows
249+
250+
For workflows with many concurrent compute nodes, consider increasing the server thread count to
251+
expand the database connection pool:
252+
253+
```bash
254+
# Default is 1 thread (4 connections). For 100+ nodes, increase:
255+
torc-server run --threads 8 --database /scratch/$USER/torc.db --host $HOST --port $PORT
256+
```
257+
258+
The connection pool size is `max(threads, 2) + 2`, so `--threads 8` gives 10 connections.
259+
207260
## Troubleshooting
208261

209262
### "Connection refused" from compute nodes

src/bin/torc-slurm-job-runner.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ mod unix_main {
8787
/// Log level: error, warn, info, debug, trace
8888
#[arg(long)]
8989
log_level: Option<String>,
90+
91+
/// Maximum startup delay in seconds for thundering herd mitigation.
92+
/// Each runner sleeps a deterministic jitter in [0, N) seconds before
93+
/// contacting the server, spreading load when many nodes start at once.
94+
#[arg(long, default_value = "0")]
95+
startup_delay_seconds: u64,
9096
}
9197

9298
fn workflow_has_multi_node_jobs(
@@ -244,6 +250,27 @@ mod unix_main {
244250
config.basic_auth = Some((username, Some(password.clone())));
245251
}
246252

253+
// Stagger startup to avoid thundering herd when many compute nodes start
254+
// simultaneously. The delay window is set by the caller (sbatch script)
255+
// based on the number of concurrent allocations.
256+
if args.startup_delay_seconds > 0 {
257+
let jitter = {
258+
use std::collections::hash_map::DefaultHasher;
259+
use std::hash::{Hash, Hasher};
260+
let mut hasher = DefaultHasher::new();
261+
hostname.hash(&mut hasher);
262+
job_id.hash(&mut hasher);
263+
node_id.hash(&mut hasher);
264+
task_pid.hash(&mut hasher);
265+
hasher.finish() % args.startup_delay_seconds
266+
};
267+
info!(
268+
"Startup jitter: sleeping {} seconds (window={})",
269+
jitter, args.startup_delay_seconds
270+
);
271+
thread::sleep(std::time::Duration::from_secs(jitter));
272+
}
273+
247274
// First, ping the server to ensure we can connect
248275
match utils::send_with_retries(
249276
&config,

src/client/commands/slurm.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,31 @@ pub fn schedule_slurm_nodes(
15591559

15601560
std::fs::create_dir_all(output)?;
15611561

1562+
// Compute startup jitter window for thundering herd mitigation.
1563+
// When many allocations start simultaneously, each runner sleeps a deterministic
1564+
// jitter in [0, startup_delay_seconds) before contacting the server.
1565+
let startup_delay_seconds = if execution_config.staggered_start() {
1566+
let nodes_per_alloc: i32 = config_map
1567+
.get("nodes")
1568+
.and_then(|v| v.parse().ok())
1569+
.unwrap_or(1);
1570+
let total_runners = if start_one_worker_per_node {
1571+
num_hpc_jobs * nodes_per_alloc
1572+
} else {
1573+
num_hpc_jobs
1574+
};
1575+
let delay = compute_startup_delay(total_runners.max(0) as u32);
1576+
if delay > 0 {
1577+
info!(
1578+
"Startup jitter: {} runners, delay window {} seconds",
1579+
total_runners, delay
1580+
);
1581+
}
1582+
delay
1583+
} else {
1584+
0
1585+
};
1586+
15621587
for job_num in 1..num_hpc_jobs + 1 {
15631588
let job_name = format!(
15641589
"{}wf{}_{}_{}",
@@ -1584,6 +1609,7 @@ pub fn schedule_slurm_nodes(
15841609
start_one_worker_per_node,
15851610
tls_ca_cert,
15861611
tls_insecure,
1612+
startup_delay_seconds,
15871613
) {
15881614
error!("Error creating submission script: {}", e);
15891615
return Err(e.into());
@@ -1647,6 +1673,19 @@ pub fn schedule_slurm_nodes(
16471673
Ok(())
16481674
}
16491675

1676+
/// Compute the startup delay window in seconds based on the total number of runners.
1677+
///
1678+
/// Returns 0 for a single runner, scales linearly from 2–10s for 2–10 runners,
1679+
/// 10–60s for 11–100 runners, and caps at 60s for 100+ runners.
1680+
pub fn compute_startup_delay(total_runners: u32) -> u64 {
1681+
match total_runners {
1682+
0..=1 => 0,
1683+
2..=10 => total_runners as u64,
1684+
11..=100 => 10 + ((total_runners - 10) as u64 * 50 / 90), // linear 10..60
1685+
_ => 60,
1686+
}
1687+
}
1688+
16501689
/// Create a ComputeNodesResources instance by reading information from the Slurm environment
16511690
///
16521691
/// # Arguments

src/client/hpc/hpc_interface.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ pub trait HpcInterface: Send + Sync {
5757
/// * `start_one_worker_per_node` - Whether to launch one worker per node via srun
5858
/// * `tls_ca_cert` - Optional path to a PEM-encoded CA certificate
5959
/// * `tls_insecure` - Whether to skip certificate verification
60+
/// * `startup_delay_seconds` - Maximum startup jitter in seconds (0 to disable)
6061
#[allow(clippy::too_many_arguments)]
6162
fn create_submission_script(
6263
&self,
@@ -71,6 +72,7 @@ pub trait HpcInterface: Send + Sync {
7172
start_one_worker_per_node: bool,
7273
tls_ca_cert: Option<&str>,
7374
tls_insecure: bool,
75+
startup_delay_seconds: u64,
7476
) -> Result<()>;
7577

7678
/// Get the current HPC job ID from environment variables

src/client/hpc/hpc_manager.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ impl HpcManager {
137137
keep_submission_script: bool,
138138
tls_ca_cert: Option<&str>,
139139
tls_insecure: bool,
140+
startup_delay_seconds: u64,
140141
) -> Result<String> {
141142
let filename = directory.join(format!("{}.sh", name));
142143

@@ -152,6 +153,7 @@ impl HpcManager {
152153
start_one_worker_per_node,
153154
tls_ca_cert,
154155
tls_insecure,
156+
startup_delay_seconds,
155157
)?;
156158

157159
trace!("Created submission script {:?}", filename);

src/client/hpc/slurm_interface.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ impl HpcInterface for SlurmInterface {
215215
start_one_worker_per_node: bool,
216216
tls_ca_cert: Option<&str>,
217217
tls_insecure: bool,
218+
startup_delay_seconds: u64,
218219
) -> Result<()> {
219220
let mut script = format!(
220221
"#!/bin/bash\n\
@@ -273,6 +274,13 @@ impl HpcInterface for SlurmInterface {
273274
command.push_str(" --tls-insecure");
274275
}
275276

277+
if startup_delay_seconds > 0 {
278+
command.push_str(&format!(
279+
" --startup-delay-seconds {}",
280+
startup_delay_seconds
281+
));
282+
}
283+
276284
// Unset conflicting Slurm memory variables.
277285
// These can be inherited from a parent allocation and conflict with --mem.
278286
// We only unset SLURM_MEM_PER_CPU and SLURM_MEM_PER_GPU since those conflict

src/client/workflow_spec.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,13 @@ pub struct ExecutionConfig {
753753
#[serde(skip_serializing_if = "Option::is_none")]
754754
pub enable_cpu_bind: Option<bool>,
755755

756+
// ========== HPC scheduling settings ==========
757+
/// Enable staggered startup for Slurm job runners to mitigate thundering herd.
758+
/// When true (default), each runner sleeps a deterministic jitter before
759+
/// contacting the server, spreading load when many nodes start at once.
760+
#[serde(skip_serializing_if = "Option::is_none")]
761+
pub staggered_start: Option<bool>,
762+
756763
// ========== Stdio settings ==========
757764
/// Workflow-level default for stdout/stderr capture.
758765
#[serde(skip_serializing_if = "Option::is_none")]
@@ -835,6 +842,11 @@ impl ExecutionConfig {
835842
self.oom_exit_code.unwrap_or(Self::DEFAULT_OOM_EXIT_CODE)
836843
}
837844

845+
/// Whether staggered startup is enabled for Slurm job runners.
846+
pub fn staggered_start(&self) -> bool {
847+
self.staggered_start.unwrap_or(true)
848+
}
849+
838850
/// Resolve the effective `StdioConfig` for a job, checking per-job overrides first.
839851
pub fn stdio_for_job(&self, job_name: &str) -> StdioConfig {
840852
if let Some(ref overrides) = self.job_stdio_overrides
@@ -936,6 +948,7 @@ impl ExecutionConfig {
936948
sigkill_headroom_seconds: None,
937949
timeout_exit_code: None,
938950
oom_exit_code: None,
951+
staggered_start: None,
939952
stdio: None,
940953
job_stdio_overrides: None,
941954
}
@@ -5833,6 +5846,7 @@ jobs:
58335846
oom_exit_code: Some(201),
58345847
srun_termination_signal: None,
58355848
enable_cpu_bind: None,
5849+
staggered_start: None,
58365850
stdio: None,
58375851
job_stdio_overrides: None,
58385852
};

tests/test_execution_config.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ fn test_execution_config_yaml_roundtrip() {
424424
oom_exit_code: Some(137),
425425
srun_termination_signal: None,
426426
enable_cpu_bind: None,
427+
staggered_start: None,
427428
stdio: None,
428429
job_stdio_overrides: None,
429430
};
@@ -446,6 +447,7 @@ fn test_execution_config_json_roundtrip() {
446447
oom_exit_code: None,
447448
srun_termination_signal: Some("TERM@90".to_string()),
448449
enable_cpu_bind: Some(true),
450+
staggered_start: None,
449451
stdio: None,
450452
job_stdio_overrides: None,
451453
};

0 commit comments

Comments
 (0)