Commit 2207904

daniel-thom and claude authored
Support multi-node allocations in direct mode (#215)
* Support multi-node allocations in direct mode

Remove start_one_worker_per_node from Slurm example (requires direct mode) and fix rustfmt
formatting issues.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f7bd0f1 commit 2207904

File tree

17 files changed: +394 additions, −69 deletions

docs/src/core/reference/workflow-spec.md

Lines changed: 13 additions & 12 deletions
@@ -256,18 +256,19 @@ slurm_defaults:
 
 Defines conditional actions triggered by workflow or job state changes.
 
-| Name | Type | Default | Description |
-| ------------------- | -------- | ---------- | --------------------------------------------------------------------------------------------------------- |
-| `trigger_type` | string | _required_ | When to trigger: `"on_workflow_start"`, `"on_workflow_complete"`, `"on_jobs_ready"`, `"on_jobs_complete"` |
-| `action_type` | string | _required_ | What to do: `"run_commands"`, `"schedule_nodes"` |
-| `jobs` | [string] | none | For job triggers: exact job names to match |
-| `job_name_regexes` | [string] | none | For job triggers: regex patterns to match job names |
-| `commands` | [string] | none | For `run_commands`: commands to execute |
-| `scheduler` | string | none | For `schedule_nodes`: scheduler name |
-| `scheduler_type` | string | none | For `schedule_nodes`: scheduler type (`"slurm"`, `"local"`) |
-| `num_allocations` | integer | none | For `schedule_nodes`: number of node allocations |
-| `max_parallel_jobs` | integer | none | For `schedule_nodes`: maximum parallel jobs |
-| `persistent` | boolean | false | Whether the action persists and can be claimed by multiple workers |
+| Name | Type | Default | Description |
+| --------------------------- | -------- | ---------- | --------------------------------------------------------------------------------------------------------- |
+| `trigger_type` | string | _required_ | When to trigger: `"on_workflow_start"`, `"on_workflow_complete"`, `"on_jobs_ready"`, `"on_jobs_complete"` |
+| `action_type` | string | _required_ | What to do: `"run_commands"`, `"schedule_nodes"` |
+| `jobs` | [string] | none | For job triggers: exact job names to match |
+| `job_name_regexes` | [string] | none | For job triggers: regex patterns to match job names |
+| `commands` | [string] | none | For `run_commands`: commands to execute |
+| `scheduler` | string | none | For `schedule_nodes`: scheduler name |
+| `scheduler_type` | string | none | For `schedule_nodes`: scheduler type (`"slurm"`, `"local"`) |
+| `num_allocations` | integer | none | For `schedule_nodes`: number of node allocations |
+| `start_one_worker_per_node` | boolean | false | For `schedule_nodes`: launch one worker per node (direct mode only) |
+| `max_parallel_jobs` | integer | none | For `schedule_nodes`: maximum parallel jobs |
+| `persistent` | boolean | false | Whether the action persists and can be claimed by multiple workers |
 
 ## ResourceMonitorConfig

docs/src/specialized/design/workflow-actions.md

Lines changed: 3 additions & 0 deletions
@@ -256,6 +256,9 @@ Dynamically allocate compute resources from a Slurm scheduler.
 
 - `scheduler` (required) - Name of Slurm scheduler configuration (must exist in `slurm_schedulers`)
 - `scheduler_type` (required) - Must be "slurm"
 - `num_allocations` (required) - Number of Slurm allocation requests to submit
+- `start_one_worker_per_node` (optional, default: false) - Launch one worker per allocated node via
+  `srun --ntasks-per-node=1`. Use this for direct-mode workflows with single-node jobs sharing a
+  multi-node allocation. Not compatible with `execution_config.mode: slurm`.
 
 **Use cases**:
docs/src/specialized/hpc/multi-node-jobs.md

Lines changed: 60 additions & 8 deletions
@@ -18,13 +18,21 @@ nodes are in the allocation.
 
 **Use when**: You have many independent jobs that each fit on one node, and you want them to run in
 parallel across multiple nodes for throughput.
 
-**How it works**: Torc requests a multi-node Slurm allocation (e.g., 4 nodes). One worker manages
-the allocation and places each single-node job onto one node via `srun --nodes=1`. Single-node jobs
-may share a node as long as CPU, memory, and GPU limits allow. With N nodes, Torc can spread work
-across the allocation for throughput.
+**How it works**: Torc requests a multi-node Slurm allocation (e.g., 4 nodes). The behavior depends
+on the execution mode:
 
-**Example**: 100 independent analysis jobs, each needing 8 CPUs and 32 GB, across a 4-node
-allocation:
+- **Slurm mode** (default): A single worker manages the allocation and places each single-node job
+  onto a node via `srun --nodes=1`. Slurm handles resource isolation and node placement.
+- **Direct mode**: Jobs are executed directly without `srun` wrapping. To distribute work across
+  nodes, set `start_one_worker_per_node: true` on the `schedule_nodes` action. This launches one
+  worker per node via `srun --ntasks-per-node=1`, and each worker executes jobs directly on its
+  node.
+
+Single-node jobs may share a node as long as CPU, memory, and GPU limits allow. With N nodes, Torc
+can spread work across the allocation for throughput.
+
+**Example (Slurm mode)**: 100 independent analysis jobs, each needing 8 CPUs and 32 GB, across a
+4-node allocation:
 
 ```yaml
 name: parallel_analysis

@@ -58,9 +66,52 @@ actions:
     num_allocations: 1
 ```
 
+**Example (Direct mode)**: The same workload using direct execution with one worker per node:
+
+```yaml
+name: parallel_analysis_direct
+description: Run 20 analysis tasks across 2 nodes via direct execution
+
+execution_config:
+  mode: direct
+
+resource_requirements:
+  - name: analysis
+    num_cpus: 5
+    num_nodes: 1
+    memory: 2g
+    runtime: PT3M
+
+jobs:
+  - name: analyze_{i}
+    command: python analyze.py --chunk {i}
+    resource_requirements: analysis
+    scheduler: multi_node
+    parameters:
+      i: "1:20"
+
+slurm_schedulers:
+  - name: multi_node
+    account: myproject
+    nodes: 2
+    walltime: "00:10:00"
+
+actions:
+  - trigger_type: on_workflow_start
+    action_type: schedule_nodes
+    scheduler: multi_node
+    scheduler_type: slurm
+    start_one_worker_per_node: true
+    num_allocations: 1
+```
 
 Each node has 8 CPUs and 32 GB available per job. If a node has 64 CPUs total, it can run up to 8
 jobs concurrently (64 / 8 = 8). Across 4 nodes, that means up to 32 jobs running at once.
 
+> **Note:** `start_one_worker_per_node` is only supported with `execution_config.mode: direct`. It
+> is not compatible with slurm execution mode, where Torc uses a single worker with `srun`-based
+> node placement.
 
 ### Pattern 2: True Multi-Node Jobs (MPI, Distributed Training)
 
 **Use when**: A single job needs to span multiple nodes — for example, MPI applications, distributed

@@ -136,8 +187,9 @@ underlying Slurm allocations. There are two approaches, each with trade-offs.
 
 ### One multi-node allocation
 
-Request all nodes in a single `sbatch` job (e.g., `nodes: 4`). Torc runs one worker per node and
-distributes jobs across them.
+Request all nodes in a single `sbatch` job (e.g., `nodes: 4`). In slurm mode, a single worker
+distributes jobs across nodes via `srun`. In direct mode with `start_one_worker_per_node`, Torc runs
+one worker per node and each worker executes jobs locally.
 
 **Advantages:**

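The throughput arithmetic in the doc hunk above (64 / 8 = 8 jobs per node, 32 across 4 nodes) can be checked with a small helper; the function is ours for illustration, not part of Torc:

```rust
/// Concurrent single-node-job capacity when jobs share an allocation:
/// each node runs floor(cpus_per_node / cpus_per_job) jobs at once,
/// and capacity scales linearly with the number of nodes.
fn concurrent_job_capacity(nodes: u32, cpus_per_node: u32, cpus_per_job: u32) -> u32 {
    nodes * (cpus_per_node / cpus_per_job)
}
```

For the documented example, `concurrent_job_capacity(4, 64, 8)` gives 32, matching the text.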
examples/kdl/workflow_actions_simple_slurm.kdl

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ action {
     scheduler "process_scheduler"
     scheduler_type "slurm"
     num_allocations 2
-    start_one_worker_per_node #true
 }
 
 // Allocate resources for finalization stage
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+# Test: Multi-Node Direct Execution
+#
+# 2-node allocation with direct execution mode (no srun wrapping for jobs).
+# The head worker spawns one torc-slurm-job-runner per node via
+# srun --ntasks-per-node=1, and each per-node worker executes jobs directly.
+# Tests that single-node jobs are distributed across nodes in direct mode.
+
+name: multi_node_direct
+description: 2-node allocation — 20 jobs x 5 CPUs via direct execution
+project: slurm-tests
+
+execution_config:
+  mode: direct
+
+resource_monitor:
+  enabled: true
+  granularity: time_series
+  sample_interval_seconds: 2
+
+resource_requirements:
+  - name: work_resources
+    num_cpus: 5
+    num_nodes: 1
+    memory: 2g
+    runtime: PT3M
+
+jobs:
+  - name: work_{i}
+    command: bash -c 'echo "Running on $(hostname)"; stress-ng --cpu 5 --timeout 30 --metrics-brief'
+    resource_requirements: work_resources
+    scheduler: two_node_scheduler
+    parameters:
+      i: 1:20
+
+slurm_schedulers:
+  - name: two_node_scheduler
+    account: PLACEHOLDER_ACCOUNT
+    partition: PLACEHOLDER_PARTITION
+    nodes: 2
+    walltime: "00:10:00"
+
+actions:
+  - trigger_type: "on_workflow_start"
+    action_type: "schedule_nodes"
+    scheduler: "two_node_scheduler"
+    scheduler_type: "slurm"
+    num_allocations: 1
+    start_one_worker_per_node: true

src/bin/torc-slurm-job-runner.rs

Lines changed: 7 additions & 7 deletions
@@ -235,7 +235,7 @@ mod unix_main {
             insecure: args.tls_insecure,
         };
         let mut config = Configuration::with_tls(tls);
-        config.base_path = args.url;
+        config.base_path = args.url.clone();
 
         // Set up authentication if password is provided
         if let Some(ref password) = args.password {

@@ -292,12 +292,6 @@ mod unix_main {
         let scheduled_compute_node =
             get_scheduled_compute_node(&config, args.workflow_id, &slurm_interface);
 
-        if slurm_interface.is_head_node()
-            && let Some(ref node) = scheduled_compute_node
-        {
-            set_scheduled_compute_node_status(&config, node, "active");
-        }
-
         let scheduler_id = scheduled_compute_node.as_ref().map(|node| node.id);
         let scheduler_config_id = scheduled_compute_node
             .as_ref()

@@ -332,6 +326,12 @@
             args.wait_for_healthy_database_minutes,
         );
 
+        if slurm_interface.is_head_node()
+            && let Some(ref node) = scheduled_compute_node
+        {
+            set_scheduled_compute_node_status(&config, node, "active");
+        }
+
         let node_tracker = if num_nodes > 1 && !has_multi_node_jobs {
             match slurm_interface.list_active_nodes(&job_id) {
                 Ok(node_names) => {

src/client/async_cli_command.rs

Lines changed: 0 additions & 12 deletions
@@ -33,7 +33,6 @@ use crate::models::{JobModel, JobStatus, ResourceRequirementsModel, ResultModel,
 use chrono::{DateTime, Utc};
 use log::{self, debug, error, info, warn};
 use std::fs::File;
-use std::io::BufWriter;
 use std::path::Path;
 use std::process::{Child, Command, Stdio};

@@ -160,8 +159,6 @@ pub struct AsyncCliCommand {
     return_code: Option<i64>,
     pub is_complete: bool,
     status: JobStatus,
-    stdout_fp: Option<BufWriter<File>>,
-    stderr_fp: Option<BufWriter<File>>,
 }

@@ -185,8 +182,6 @@ impl AsyncCliCommand {
             return_code: None,
             is_complete: false,
             status,
-            stdout_fp: None,
-            stderr_fp: None,
         }
     }

@@ -231,11 +226,6 @@ impl AsyncCliCommand {
         let stderr_path =
             get_job_stderr_path(output_dir, workflow_id, self.job_id, run_id, attempt_id);
 
-        let stdout_file = File::create(&stdout_path)?;
-        let stderr_file = File::create(&stderr_path)?;
-        self.stdout_fp = Some(BufWriter::new(stdout_file));
-        self.stderr_fp = Some(BufWriter::new(stderr_file));
-
         let command_str = if let Some(ref invocation_script) = self.job.invocation_script {
             format!("{} {}", invocation_script, self.job.command)
         } else {

@@ -680,8 +670,6 @@ impl AsyncCliCommand {
             (self.completion_time.unwrap() - self.start_time).num_milliseconds() as f64 / 1000.0;
         self.status = status;
         self.return_code = Some(return_code);
-        self.stdout_fp = None;
-        self.stderr_fp = None;
         self.handle = None;
 
         // Collect Slurm accounting stats via sacct when running inside an allocation.

src/client/commands/slurm.rs

Lines changed: 25 additions & 2 deletions
@@ -57,7 +57,9 @@ use crate::client::hpc::hpc_interface::HpcInterface;
 use crate::client::utils;
 use crate::client::workflow_graph::WorkflowGraph;
 use crate::client::workflow_manager::WorkflowManager;
-use crate::client::workflow_spec::{ResourceRequirementsSpec, SlurmDefaultsSpec, WorkflowSpec};
+use crate::client::workflow_spec::{
+    ExecutionConfig, ExecutionMode, ResourceRequirementsSpec, SlurmDefaultsSpec, WorkflowSpec,
+};
 use crate::config::TorcConfig;
 use crate::models;
 use tabled::Tabled;

@@ -389,6 +391,10 @@ EXAMPLES:
         /// Workflow ID
         #[arg()]
         workflow_id: Option<i64>,
+        /// Start one worker per allocated node.
+        /// Use this for direct-mode single-node jobs sharing a multi-node allocation.
+        #[arg(long, default_value = "false")]
+        start_one_worker_per_node: bool,
         /// Job prefix for the Slurm job names
         #[arg(short, long, default_value = "")]
         job_prefix: String,

@@ -1178,6 +1184,7 @@ pub fn handle_slurm_commands(config: &Configuration, command: &SlurmCommands, fo
         }
         SlurmCommands::ScheduleNodes {
             workflow_id,
+            start_one_worker_per_node,
             job_prefix,
             keep_submission_scripts,
             max_parallel_jobs,

@@ -1256,6 +1263,7 @@ pub fn handle_slurm_commands(config: &Configuration, command: &SlurmCommands, fo
                 wf_id,
                 sched_config_id,
                 *num_hpc_jobs,
+                *start_one_worker_per_node,
                 job_prefix,
                 output,
                 effective_poll_interval,

@@ -1433,6 +1441,7 @@ pub fn schedule_slurm_nodes(
     workflow_id: i64,
     scheduler_config_id: i64,
     num_hpc_jobs: i32,
+    start_one_worker_per_node: bool,
     job_prefix: &str,
     output: &str,
     poll_interval: i32,

@@ -1461,6 +1470,12 @@ pub fn schedule_slurm_nodes(
             return Err(format!("Failed to get workflow: {}", e).into());
         }
     };
+    let execution_config = ExecutionConfig::from_workflow_model(&workflow);
+    if start_one_worker_per_node && execution_config.mode != ExecutionMode::Direct {
+        return Err(
+            "start_one_worker_per_node requires execution_config.mode to be 'direct'".into(),
+        );
+    }
 
     let slurm_interface = match crate::client::hpc::slurm_interface::SlurmInterface::new() {
         Ok(interface) => interface,

@@ -1539,6 +1554,7 @@ pub fn schedule_slurm_nodes(
         max_parallel_jobs,
         Path::new(&script_path),
         &config_map,
+        start_one_worker_per_node,
         tls_ca_cert,
         tls_insecure,
     ) {

@@ -1633,7 +1649,13 @@ pub fn create_node_resources(
     };
 
     let num_gpus = interface.get_num_gpus() as i64;
-    let num_nodes = interface.get_num_nodes() as i64;
+    // When running as a subtask (one worker per node), each worker manages
+    // only its own node regardless of the total allocation size.
+    let num_nodes = if is_subtask {
+        1
+    } else {
+        interface.get_num_nodes() as i64
+    };
 
     // Return per-node resource values. The job runner is responsible for
     // multiplying by num_nodes to compute total allocation capacity.

@@ -3914,6 +3936,7 @@ fn handle_regenerate(
         workflow_id,
         scheduler_info.id,
         scheduler_info.num_allocations as i32,
+        false, // start_one_worker_per_node
        "",
         output_dir.to_str().unwrap_or("torc_output"),
         effective_poll_interval,

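The direct-mode guard added to `schedule_slurm_nodes` can be distilled into a standalone sketch. The real code uses the crate's `ExecutionConfig`/`ExecutionMode` types imported in the hunk above; the miniature enum and function here are assumptions made for self-containment:

```rust
// Minimal stand-in for the crate's ExecutionMode (sketch only).
#[derive(Debug, PartialEq)]
enum ExecutionMode {
    Direct,
    Slurm,
}

/// Reject `start_one_worker_per_node` outside direct mode, mirroring the
/// validation this commit adds before submitting Slurm allocations.
fn validate_worker_per_node(
    start_one_worker_per_node: bool,
    mode: ExecutionMode,
) -> Result<(), String> {
    if start_one_worker_per_node && mode != ExecutionMode::Direct {
        return Err(
            "start_one_worker_per_node requires execution_config.mode to be 'direct'".to_string(),
        );
    }
    Ok(())
}
```

Failing fast here, before any `sbatch` submission, avoids burning an allocation on a configuration that cannot distribute work.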
src/client/hpc/hpc_interface.rs

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@ pub trait HpcInterface: Send + Sync {
     /// * `max_parallel_jobs` - Optional maximum number of parallel jobs
     /// * `filename` - Path where the submission script should be written
     /// * `config` - Configuration parameters for the HPC scheduler
+    /// * `start_one_worker_per_node` - Whether to launch one worker per node via srun
     /// * `tls_ca_cert` - Optional path to a PEM-encoded CA certificate
     /// * `tls_insecure` - Whether to skip certificate verification
     #[allow(clippy::too_many_arguments)]

@@ -67,6 +68,7 @@ pub trait HpcInterface: Send + Sync {
         max_parallel_jobs: Option<i32>,
         filename: &Path,
         config: &HashMap<String, String>,
+        start_one_worker_per_node: bool,
         tls_ca_cert: Option<&str>,
         tls_insecure: bool,
     ) -> Result<()>;

src/client/hpc/hpc_manager.rs

Lines changed: 2 additions & 0 deletions
@@ -133,6 +133,7 @@ impl HpcManager {
         workflow_id: i64,
         poll_interval: i32,
         max_parallel_jobs: Option<i32>,
+        start_one_worker_per_node: bool,
         keep_submission_script: bool,
         tls_ca_cert: Option<&str>,
         tls_insecure: bool,

@@ -148,6 +149,7 @@ impl HpcManager {
             max_parallel_jobs,
             &filename,
             &self.config,
+            start_one_worker_per_node,
             tls_ca_cert,
             tls_insecure,
         )?;

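The `is_subtask` branch added to `create_node_resources` reduces a per-node worker's view to a single node. A hedged standalone sketch (the helper name is ours, not the crate's):

```rust
/// Nodes a worker should manage: a per-node subtask worker sees only its
/// own node, while the head worker sees the whole allocation. Mirrors the
/// `is_subtask` branch this commit adds to create_node_resources.
fn managed_node_count(is_subtask: bool, allocation_nodes: i64) -> i64 {
    if is_subtask { 1 } else { allocation_nodes }
}
```

This keeps per-node resource accounting correct when `start_one_worker_per_node` fans workers out: each worker multiplies per-node capacity by its own node count, not the allocation's.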