@@ -57,7 +57,9 @@ use crate::client::hpc::hpc_interface::HpcInterface;
5757use crate :: client:: utils;
5858use crate :: client:: workflow_graph:: WorkflowGraph ;
5959use crate :: client:: workflow_manager:: WorkflowManager ;
60- use crate :: client:: workflow_spec:: { ResourceRequirementsSpec , SlurmDefaultsSpec , WorkflowSpec } ;
60+ use crate :: client:: workflow_spec:: {
61+ ExecutionConfig , ExecutionMode , ResourceRequirementsSpec , SlurmDefaultsSpec , WorkflowSpec ,
62+ } ;
6163use crate :: config:: TorcConfig ;
6264use crate :: models;
6365use tabled:: Tabled ;
@@ -389,6 +391,10 @@ EXAMPLES:
389391 /// Workflow ID
390392 #[ arg( ) ]
391393 workflow_id : Option < i64 > ,
394+ /// Start one worker per allocated node.
395+ /// Use this for direct-mode single-node jobs sharing a multi-node allocation.
396+ #[ arg( long, default_value = "false" ) ]
397+ start_one_worker_per_node : bool ,
392398 /// Job prefix for the Slurm job names
393399 #[ arg( short, long, default_value = "" ) ]
394400 job_prefix : String ,
@@ -1178,6 +1184,7 @@ pub fn handle_slurm_commands(config: &Configuration, command: &SlurmCommands, fo
11781184 }
11791185 SlurmCommands :: ScheduleNodes {
11801186 workflow_id,
1187+ start_one_worker_per_node,
11811188 job_prefix,
11821189 keep_submission_scripts,
11831190 max_parallel_jobs,
@@ -1256,6 +1263,7 @@ pub fn handle_slurm_commands(config: &Configuration, command: &SlurmCommands, fo
12561263 wf_id,
12571264 sched_config_id,
12581265 * num_hpc_jobs,
1266+ * start_one_worker_per_node,
12591267 job_prefix,
12601268 output,
12611269 effective_poll_interval,
@@ -1433,6 +1441,7 @@ pub fn schedule_slurm_nodes(
14331441 workflow_id : i64 ,
14341442 scheduler_config_id : i64 ,
14351443 num_hpc_jobs : i32 ,
1444+ start_one_worker_per_node : bool ,
14361445 job_prefix : & str ,
14371446 output : & str ,
14381447 poll_interval : i32 ,
@@ -1461,6 +1470,12 @@ pub fn schedule_slurm_nodes(
14611470 return Err ( format ! ( "Failed to get workflow: {}" , e) . into ( ) ) ;
14621471 }
14631472 } ;
1473+ let execution_config = ExecutionConfig :: from_workflow_model ( & workflow) ;
1474+ if start_one_worker_per_node && execution_config. mode != ExecutionMode :: Direct {
1475+ return Err (
1476+ "start_one_worker_per_node requires execution_config.mode to be 'direct'" . into ( ) ,
1477+ ) ;
1478+ }
14641479
14651480 let slurm_interface = match crate :: client:: hpc:: slurm_interface:: SlurmInterface :: new ( ) {
14661481 Ok ( interface) => interface,
@@ -1539,6 +1554,7 @@ pub fn schedule_slurm_nodes(
15391554 max_parallel_jobs,
15401555 Path :: new ( & script_path) ,
15411556 & config_map,
1557+ start_one_worker_per_node,
15421558 tls_ca_cert,
15431559 tls_insecure,
15441560 ) {
@@ -1633,7 +1649,13 @@ pub fn create_node_resources(
16331649 } ;
16341650
16351651 let num_gpus = interface. get_num_gpus ( ) as i64 ;
1636- let num_nodes = interface. get_num_nodes ( ) as i64 ;
1652+ // When running as a subtask (one worker per node), each worker manages
1653+ // only its own node regardless of the total allocation size.
1654+ let num_nodes = if is_subtask {
1655+ 1
1656+ } else {
1657+ interface. get_num_nodes ( ) as i64
1658+ } ;
16371659
16381660 // Return per-node resource values. The job runner is responsible for
16391661 // multiplying by num_nodes to compute total allocation capacity.
@@ -3914,6 +3936,7 @@ fn handle_regenerate(
39143936 workflow_id,
39153937 scheduler_info. id ,
39163938 scheduler_info. num_allocations as i32 ,
3939+ false , // start_one_worker_per_node
39173940 "" ,
39183941 output_dir. to_str ( ) . unwrap_or ( "torc_output" ) ,
39193942 effective_poll_interval,