Skip to content

Commit d74e176

Browse files
daniel-thom and claude
authored
Add startup jitter to slurm job runners for thundering herd mitigation (#224)
* Add startup jitter to slurm job runners for thundering herd mitigation When many Slurm allocations start simultaneously (e.g., 1000 nodes), all torc-slurm-job-runner processes would contact the server at the same instant, causing connection timeouts and SQLite lock contention. Add --startup-delay-seconds flag to torc-slurm-job-runner that causes each runner to sleep a deterministic random duration (hashed from hostname, job ID, node ID, task PID) before its first API call. The delay window is computed automatically by schedule_slurm_nodes based on total runner count (scaling from 0s for 1 runner up to 60s for 100+ runners), accounting for start_one_worker_per_node. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cd2d1bb commit d74e176

File tree

9 files changed

+261
-0
lines changed

9 files changed

+261
-0
lines changed

docs/src/specialized/hpc/hpc-deployment.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,59 @@ curl -s "$TORC_API_URL/workflows" | head
204204
exit
205205
```
206206

207+
## Large-Scale Deployments
208+
209+
### Startup Jitter (Thundering Herd Mitigation)
210+
211+
When many Slurm allocations start simultaneously — for example, 1000 single-node jobs scheduled at
212+
once — all `torc-slurm-job-runner` processes may contact the server at the same instant. This
213+
"thundering herd" can overwhelm the server with concurrent requests, causing connection timeouts and
214+
SQLite lock contention.
215+
216+
Torc mitigates this automatically. When `torc slurm schedule-nodes` generates sbatch scripts, it
217+
calculates a startup delay window based on the total number of runners that will start:
218+
219+
| Total runners | Max startup delay |
220+
| ------------- | ----------------- |
221+
| 1 | 0 s (disabled) |
222+
| 2–10 | 2–10 s |
223+
| 11–100 | 10–60 s |
224+
| > 100 | 60 s |
225+
226+
Each runner picks a deterministic delay within this window (hashed from its hostname, Slurm job ID,
227+
node ID, and task PID), then sleeps before making its first API call. This spreads the initial burst
228+
of requests across the delay window.
229+
230+
The delay is passed to `torc-slurm-job-runner` via the `--startup-delay-seconds` flag in the
231+
generated sbatch script. You can override it manually if needed:
232+
233+
```bash
234+
# In a custom sbatch script: set a 120-second jitter window
235+
torc-slurm-job-runner $URL $WORKFLOW_ID $OUTPUT --startup-delay-seconds 120
236+
```
237+
238+
When `start_one_worker_per_node` is enabled, the total runner count includes all nodes across all
239+
allocations (e.g., 10 allocations × 4 nodes = 40 runners), so the delay window scales appropriately.
240+
241+
To disable staggered startup, set `staggered_start: false` in `execution_config`:
242+
243+
```yaml
244+
execution_config:
245+
staggered_start: false
246+
```
247+
248+
### Server Tuning for Large Workflows
249+
250+
For workflows with many concurrent compute nodes, consider increasing the server thread count to
251+
expand the database connection pool:
252+
253+
```bash
254+
# Default is 1 thread (4 connections). For 100+ nodes, increase:
255+
torc-server run --threads 8 --database /scratch/$USER/torc.db --host $HOST --port $PORT
256+
```
257+
258+
The connection pool size is `max(threads, 2) + 2`, so `--threads 8` gives 10 connections.
259+
207260
## Troubleshooting
208261

209262
### "Connection refused" from compute nodes

src/bin/torc-slurm-job-runner.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ mod unix_main {
8787
/// Log level: error, warn, info, debug, trace
8888
#[arg(long)]
8989
log_level: Option<String>,
90+
91+
/// Maximum startup delay in seconds for thundering herd mitigation.
92+
/// Each runner sleeps a deterministic jitter in [0, N) seconds before
93+
/// contacting the server, spreading load when many nodes start at once.
94+
#[arg(long, default_value = "0")]
95+
startup_delay_seconds: u64,
9096
}
9197

9298
fn workflow_has_multi_node_jobs(
@@ -244,6 +250,27 @@ mod unix_main {
244250
config.basic_auth = Some((username, Some(password.clone())));
245251
}
246252

253+
// Stagger startup to avoid thundering herd when many compute nodes start
254+
// simultaneously. The delay window is set by the caller (sbatch script)
255+
// based on the number of concurrent allocations.
256+
if args.startup_delay_seconds > 0 {
257+
let jitter = {
258+
use std::collections::hash_map::DefaultHasher;
259+
use std::hash::{Hash, Hasher};
260+
let mut hasher = DefaultHasher::new();
261+
hostname.hash(&mut hasher);
262+
job_id.hash(&mut hasher);
263+
node_id.hash(&mut hasher);
264+
task_pid.hash(&mut hasher);
265+
hasher.finish() % args.startup_delay_seconds
266+
};
267+
info!(
268+
"Startup jitter: sleeping {} seconds (window={})",
269+
jitter, args.startup_delay_seconds
270+
);
271+
thread::sleep(std::time::Duration::from_secs(jitter));
272+
}
273+
247274
// First, ping the server to ensure we can connect
248275
match utils::send_with_retries(
249276
&config,

src/client/commands/slurm.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,31 @@ pub fn schedule_slurm_nodes(
15591559

15601560
std::fs::create_dir_all(output)?;
15611561

1562+
// Compute startup jitter window for thundering herd mitigation.
1563+
// When many allocations start simultaneously, each runner sleeps a deterministic
1564+
// jitter in [0, startup_delay_seconds) before contacting the server.
1565+
let startup_delay_seconds = if execution_config.staggered_start() {
1566+
let nodes_per_alloc: i32 = config_map
1567+
.get("nodes")
1568+
.and_then(|v| v.parse().ok())
1569+
.unwrap_or(1);
1570+
let total_runners = if start_one_worker_per_node {
1571+
num_hpc_jobs * nodes_per_alloc
1572+
} else {
1573+
num_hpc_jobs
1574+
};
1575+
let delay = compute_startup_delay(total_runners.max(0) as u32);
1576+
if delay > 0 {
1577+
info!(
1578+
"Startup jitter: {} runners, delay window {} seconds",
1579+
total_runners, delay
1580+
);
1581+
}
1582+
delay
1583+
} else {
1584+
0
1585+
};
1586+
15621587
for job_num in 1..num_hpc_jobs + 1 {
15631588
let job_name = format!(
15641589
"{}wf{}_{}_{}",
@@ -1584,6 +1609,7 @@ pub fn schedule_slurm_nodes(
15841609
start_one_worker_per_node,
15851610
tls_ca_cert,
15861611
tls_insecure,
1612+
startup_delay_seconds,
15871613
) {
15881614
error!("Error creating submission script: {}", e);
15891615
return Err(e.into());
@@ -1647,6 +1673,19 @@ pub fn schedule_slurm_nodes(
16471673
Ok(())
16481674
}
16491675

1676+
/// Compute the startup delay window in seconds based on the total number of runners.
1677+
///
1678+
/// Returns 0 for a single runner, scales linearly from 2–10s for 2–10 runners,
1679+
/// 10–60s for 11–100 runners, and caps at 60s for 100+ runners.
1680+
pub fn compute_startup_delay(total_runners: u32) -> u64 {
1681+
match total_runners {
1682+
0..=1 => 0,
1683+
2..=10 => total_runners as u64,
1684+
11..=100 => 10 + ((total_runners - 10) as u64 * 50 / 90), // linear 10..60
1685+
_ => 60,
1686+
}
1687+
}
1688+
16501689
/// Create a ComputeNodesResources instance by reading information from the Slurm environment
16511690
///
16521691
/// # Arguments

src/client/hpc/hpc_interface.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ pub trait HpcInterface: Send + Sync {
5757
/// * `start_one_worker_per_node` - Whether to launch one worker per node via srun
5858
/// * `tls_ca_cert` - Optional path to a PEM-encoded CA certificate
5959
/// * `tls_insecure` - Whether to skip certificate verification
60+
/// * `startup_delay_seconds` - Maximum startup jitter in seconds (0 to disable)
6061
#[allow(clippy::too_many_arguments)]
6162
fn create_submission_script(
6263
&self,
@@ -71,6 +72,7 @@ pub trait HpcInterface: Send + Sync {
7172
start_one_worker_per_node: bool,
7273
tls_ca_cert: Option<&str>,
7374
tls_insecure: bool,
75+
startup_delay_seconds: u64,
7476
) -> Result<()>;
7577

7678
/// Get the current HPC job ID from environment variables

src/client/hpc/hpc_manager.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ impl HpcManager {
137137
keep_submission_script: bool,
138138
tls_ca_cert: Option<&str>,
139139
tls_insecure: bool,
140+
startup_delay_seconds: u64,
140141
) -> Result<String> {
141142
let filename = directory.join(format!("{}.sh", name));
142143

@@ -152,6 +153,7 @@ impl HpcManager {
152153
start_one_worker_per_node,
153154
tls_ca_cert,
154155
tls_insecure,
156+
startup_delay_seconds,
155157
)?;
156158

157159
trace!("Created submission script {:?}", filename);

src/client/hpc/slurm_interface.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ impl HpcInterface for SlurmInterface {
215215
start_one_worker_per_node: bool,
216216
tls_ca_cert: Option<&str>,
217217
tls_insecure: bool,
218+
startup_delay_seconds: u64,
218219
) -> Result<()> {
219220
let mut script = format!(
220221
"#!/bin/bash\n\
@@ -273,6 +274,13 @@ impl HpcInterface for SlurmInterface {
273274
command.push_str(" --tls-insecure");
274275
}
275276

277+
if startup_delay_seconds > 0 {
278+
command.push_str(&format!(
279+
" --startup-delay-seconds {}",
280+
startup_delay_seconds
281+
));
282+
}
283+
276284
// Unset conflicting Slurm memory variables.
277285
// These can be inherited from a parent allocation and conflict with --mem.
278286
// We only unset SLURM_MEM_PER_CPU and SLURM_MEM_PER_GPU since those conflict

src/client/workflow_spec.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,13 @@ pub struct ExecutionConfig {
753753
#[serde(skip_serializing_if = "Option::is_none")]
754754
pub enable_cpu_bind: Option<bool>,
755755

756+
// ========== HPC scheduling settings ==========
757+
/// Enable staggered startup for Slurm job runners to mitigate thundering herd.
758+
/// When true (default), each runner sleeps a deterministic jitter before
759+
/// contacting the server, spreading load when many nodes start at once.
760+
#[serde(skip_serializing_if = "Option::is_none")]
761+
pub staggered_start: Option<bool>,
762+
756763
// ========== Stdio settings ==========
757764
/// Workflow-level default for stdout/stderr capture.
758765
#[serde(skip_serializing_if = "Option::is_none")]
@@ -835,6 +842,11 @@ impl ExecutionConfig {
835842
self.oom_exit_code.unwrap_or(Self::DEFAULT_OOM_EXIT_CODE)
836843
}
837844

845+
/// Whether staggered startup is enabled for Slurm job runners.
846+
pub fn staggered_start(&self) -> bool {
847+
self.staggered_start.unwrap_or(true)
848+
}
849+
838850
/// Resolve the effective `StdioConfig` for a job, checking per-job overrides first.
839851
pub fn stdio_for_job(&self, job_name: &str) -> StdioConfig {
840852
if let Some(ref overrides) = self.job_stdio_overrides
@@ -936,6 +948,7 @@ impl ExecutionConfig {
936948
sigkill_headroom_seconds: None,
937949
timeout_exit_code: None,
938950
oom_exit_code: None,
951+
staggered_start: None,
939952
stdio: None,
940953
job_stdio_overrides: None,
941954
}
@@ -5833,6 +5846,7 @@ jobs:
58335846
oom_exit_code: Some(201),
58345847
srun_termination_signal: None,
58355848
enable_cpu_bind: None,
5849+
staggered_start: None,
58365850
stdio: None,
58375851
job_stdio_overrides: None,
58385852
};

tests/test_execution_config.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ fn test_execution_config_yaml_roundtrip() {
424424
oom_exit_code: Some(137),
425425
srun_termination_signal: None,
426426
enable_cpu_bind: None,
427+
staggered_start: None,
427428
stdio: None,
428429
job_stdio_overrides: None,
429430
};
@@ -446,6 +447,7 @@ fn test_execution_config_json_roundtrip() {
446447
oom_exit_code: None,
447448
srun_termination_signal: Some("TERM@90".to_string()),
448449
enable_cpu_bind: Some(true),
450+
staggered_start: None,
449451
stdio: None,
450452
job_stdio_overrides: None,
451453
};

0 commit comments

Comments
 (0)