Skip to content

Commit cd63e8a

Browse files
committed
Address PR comments
1 parent b8edd97 commit cd63e8a

File tree

8 files changed

+89
-27
lines changed

8 files changed

+89
-27
lines changed

api/openapi.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7187,6 +7187,7 @@ components:
71877187
compute_node_min_time_for_new_jobs_seconds: 300
71887188
worker_startup_script: worker_startup_script
71897189
worker_completion_script: worker_completion_script
7190+
slurm_config: null
71907191
execution_config: '{"mode":"auto","limit_resources":true,"srun_termination_signal":"TERM@120"}'
71917192
properties:
71927193
id:
@@ -7263,6 +7264,15 @@ components:
72637264
type: boolean
72647265
status_id:
72657266
type: integer
7267+
slurm_config:
7268+
deprecated: true
7269+
description: "Deprecated: Use execution_config instead. JSON-encoded\
7270+
\ blob of Slurm configuration options for the workflow. May include\
7271+
\ fields such as limit_resources, use_srun, srun_termination_signal,\
7272+
\ and enable_cpu_bind. Stored as a JSON string."
7273+
type:
7274+
- string
7275+
- "null"
72667276
execution_config:
72677277
description: "JSON-encoded execution configuration controlling mode\
72687278
\ (direct/slurm/auto) and related settings such as limit_resources,\

julia_client/Torc/src/api/models/model_WorkflowModel.jl

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
use_pending_failed=false,
2424
enable_ro_crate=false,
2525
status_id=nothing,
26+
slurm_config=nothing,
2627
execution_config=nothing,
2728
)
2829
@@ -44,6 +45,7 @@
4445
- use_pending_failed::Bool : Use PendingFailed status for failed jobs (enables AI-assisted recovery)
4546
- enable_ro_crate::Bool : When true, automatically create RO-Crate entities for workflow files. Input files get entities during initialization; output files get entities on job completion.
4647
- status_id::Int64
48+
- slurm_config::String : Deprecated: Use execution_config instead. JSON-encoded blob of Slurm configuration options for the workflow. May include fields such as limit_resources, use_srun, srun_termination_signal, and enable_cpu_bind. Stored as a JSON string.
4749
- execution_config::String : JSON-encoded execution configuration controlling mode (direct/slurm/auto) and related settings such as limit_resources, termination_signal, sigterm_lead_seconds, sigkill_headroom_seconds, timeout_exit_code, oom_exit_code, srun_termination_signal, and enable_cpu_bind. The server stores this without interpretation; only the client deserializes it.
4850
"""
4951
Base.@kwdef mutable struct WorkflowModel <: OpenAPI.APIModel
@@ -65,16 +67,17 @@ Base.@kwdef mutable struct WorkflowModel <: OpenAPI.APIModel
6567
use_pending_failed::Union{Nothing, Bool} = false
6668
enable_ro_crate::Union{Nothing, Bool} = false
6769
status_id::Union{Nothing, Int64} = nothing
70+
slurm_config::Union{Nothing, String} = nothing
6871
execution_config::Union{Nothing, String} = nothing
6972

70-
function WorkflowModel(id, name, user, description, timestamp, project, metadata, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, enable_ro_crate, status_id, execution_config, )
71-
o = new(id, name, user, description, timestamp, project, metadata, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, enable_ro_crate, status_id, execution_config, )
73+
function WorkflowModel(id, name, user, description, timestamp, project, metadata, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, enable_ro_crate, status_id, slurm_config, execution_config, )
74+
o = new(id, name, user, description, timestamp, project, metadata, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, enable_ro_crate, status_id, slurm_config, execution_config, )
7275
OpenAPI.validate_properties(o)
7376
return o
7477
end
7578
end # type WorkflowModel
7679

77-
const _property_types_WorkflowModel = Dict{Symbol,String}(Symbol("id")=>"Int64", Symbol("name")=>"String", Symbol("user")=>"String", Symbol("description")=>"String", Symbol("timestamp")=>"String", Symbol("project")=>"String", Symbol("metadata")=>"String", Symbol("compute_node_expiration_buffer_seconds")=>"Int64", Symbol("compute_node_wait_for_new_jobs_seconds")=>"Int64", Symbol("compute_node_ignore_workflow_completion")=>"Bool", Symbol("compute_node_wait_for_healthy_database_minutes")=>"Int64", Symbol("compute_node_min_time_for_new_jobs_seconds")=>"Int64", Symbol("jobs_sort_method")=>"JobsSortMethod", Symbol("resource_monitor_config")=>"String", Symbol("slurm_defaults")=>"String", Symbol("use_pending_failed")=>"Bool", Symbol("enable_ro_crate")=>"Bool", Symbol("status_id")=>"Int64", Symbol("execution_config")=>"String", )
80+
const _property_types_WorkflowModel = Dict{Symbol,String}(Symbol("id")=>"Int64", Symbol("name")=>"String", Symbol("user")=>"String", Symbol("description")=>"String", Symbol("timestamp")=>"String", Symbol("project")=>"String", Symbol("metadata")=>"String", Symbol("compute_node_expiration_buffer_seconds")=>"Int64", Symbol("compute_node_wait_for_new_jobs_seconds")=>"Int64", Symbol("compute_node_ignore_workflow_completion")=>"Bool", Symbol("compute_node_wait_for_healthy_database_minutes")=>"Int64", Symbol("compute_node_min_time_for_new_jobs_seconds")=>"Int64", Symbol("jobs_sort_method")=>"JobsSortMethod", Symbol("resource_monitor_config")=>"String", Symbol("slurm_defaults")=>"String", Symbol("use_pending_failed")=>"Bool", Symbol("enable_ro_crate")=>"Bool", Symbol("status_id")=>"Int64", Symbol("slurm_config")=>"String", Symbol("execution_config")=>"String", )
7881
OpenAPI.property_type(::Type{ WorkflowModel }, name::Symbol) = Union{Nothing,eval(Base.Meta.parse(_property_types_WorkflowModel[name]))}
7982

8083
function OpenAPI.check_required(o::WorkflowModel)
@@ -102,6 +105,7 @@ function OpenAPI.validate_properties(o::WorkflowModel)
102105
OpenAPI.validate_property(WorkflowModel, Symbol("use_pending_failed"), o.use_pending_failed)
103106
OpenAPI.validate_property(WorkflowModel, Symbol("enable_ro_crate"), o.enable_ro_crate)
104107
OpenAPI.validate_property(WorkflowModel, Symbol("status_id"), o.status_id)
108+
OpenAPI.validate_property(WorkflowModel, Symbol("slurm_config"), o.slurm_config)
105109
OpenAPI.validate_property(WorkflowModel, Symbol("execution_config"), o.execution_config)
106110
end
107111

@@ -125,4 +129,5 @@ function OpenAPI.validate_property(::Type{ WorkflowModel }, name::Symbol, val)
125129

126130

127131

132+
128133
end

julia_client/julia_client/docs/WorkflowModel.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Name | Type | Description | Notes
2222
**use_pending_failed** | **Bool** | Use PendingFailed status for failed jobs (enables AI-assisted recovery) | [optional] [default to false]
2323
**enable_ro_crate** | **Bool** | When true, automatically create RO-Crate entities for workflow files. Input files get entities during initialization; output files get entities on job completion. | [optional] [default to false]
2424
**status_id** | **Int64** | | [optional] [default to nothing]
25+
**slurm_config** | **String** | Deprecated: Use execution_config instead. JSON-encoded blob of Slurm configuration options for the workflow. May include fields such as limit_resources, use_srun, srun_termination_signal, and enable_cpu_bind. Stored as a JSON string. | [optional] [default to nothing]
2526
**execution_config** | **String** | JSON-encoded execution configuration controlling mode (direct/slurm/auto) and related settings such as limit_resources, termination_signal, sigterm_lead_seconds, sigkill_headroom_seconds, timeout_exit_code, oom_exit_code, srun_termination_signal, and enable_cpu_bind. The server stores this without interpretation; only the client deserializes it. | [optional] [default to nothing]
2627

2728

python_client/src/torc/openapi_client/models/workflow_model.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,9 @@ class WorkflowModel(BaseModel):
4545
use_pending_failed: Optional[StrictBool] = Field(default=False, description="Use PendingFailed status for failed jobs (enables AI-assisted recovery)")
4646
enable_ro_crate: Optional[StrictBool] = Field(default=False, description="When true, automatically create RO-Crate entities for workflow files. Input files get entities during initialization; output files get entities on job completion.")
4747
status_id: Optional[StrictInt] = None
48+
slurm_config: Optional[StrictStr] = Field(default=None, description="Deprecated: Use execution_config instead. JSON-encoded blob of Slurm configuration options for the workflow. May include fields such as limit_resources, use_srun, srun_termination_signal, and enable_cpu_bind. Stored as a JSON string.")
4849
execution_config: Optional[StrictStr] = Field(default=None, description="JSON-encoded execution configuration controlling mode (direct/slurm/auto) and related settings such as limit_resources, termination_signal, sigterm_lead_seconds, sigkill_headroom_seconds, timeout_exit_code, oom_exit_code, srun_termination_signal, and enable_cpu_bind. The server stores this without interpretation; only the client deserializes it.")
49-
__properties: ClassVar[List[str]] = ["id", "name", "user", "description", "timestamp", "project", "metadata", "compute_node_expiration_buffer_seconds", "compute_node_wait_for_new_jobs_seconds", "compute_node_ignore_workflow_completion", "compute_node_wait_for_healthy_database_minutes", "compute_node_min_time_for_new_jobs_seconds", "jobs_sort_method", "resource_monitor_config", "slurm_defaults", "use_pending_failed", "enable_ro_crate", "status_id", "execution_config"]
50+
__properties: ClassVar[List[str]] = ["id", "name", "user", "description", "timestamp", "project", "metadata", "compute_node_expiration_buffer_seconds", "compute_node_wait_for_new_jobs_seconds", "compute_node_ignore_workflow_completion", "compute_node_wait_for_healthy_database_minutes", "compute_node_min_time_for_new_jobs_seconds", "jobs_sort_method", "resource_monitor_config", "slurm_defaults", "use_pending_failed", "enable_ro_crate", "status_id", "slurm_config", "execution_config"]
5051

5152
model_config = ConfigDict(
5253
populate_by_name=True,
@@ -87,6 +88,11 @@ def to_dict(self) -> Dict[str, Any]:
8788
exclude=excluded_fields,
8889
exclude_none=True,
8990
)
91+
# set to None if slurm_config (nullable) is None
92+
# and model_fields_set contains the field
93+
if self.slurm_config is None and "slurm_config" in self.model_fields_set:
94+
_dict['slurm_config'] = None
95+
9096
# set to None if execution_config (nullable) is None
9197
# and model_fields_set contains the field
9298
if self.execution_config is None and "execution_config" in self.model_fields_set:
@@ -122,6 +128,7 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
122128
"use_pending_failed": obj.get("use_pending_failed") if obj.get("use_pending_failed") is not None else False,
123129
"enable_ro_crate": obj.get("enable_ro_crate") if obj.get("enable_ro_crate") is not None else False,
124130
"status_id": obj.get("status_id"),
131+
"slurm_config": obj.get("slurm_config"),
125132
"execution_config": obj.get("execution_config")
126133
})
127134
return _obj

src/client/async_cli_command.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,15 @@ fn build_srun_command(params: &SrunParams) -> Result<Command, String> {
117117
// Floor of 1 minute because --time=0 means unlimited in Slurm.
118118
if let Some(end) = params.end_time {
119119
let remaining_secs = (end - Utc::now()).num_seconds();
120-
if remaining_secs <= params.sigkill_headroom_seconds {
120+
let usable_secs = remaining_secs - params.sigkill_headroom_seconds;
121+
if usable_secs < 60 {
121122
return Err(format!(
122123
"Refusing to launch srun step for job {}: only {}s remaining \
123-
(need at least {}s sigkill headroom)",
124-
params.job_id, remaining_secs, params.sigkill_headroom_seconds
124+
({}s usable after {}s sigkill headroom, need at least 60s)",
125+
params.job_id, remaining_secs, usable_secs, params.sigkill_headroom_seconds
125126
));
126127
}
127-
let headroom_minutes = (params.sigkill_headroom_seconds + 59) / 60;
128-
let remaining_minutes = ((remaining_secs / 60) - headroom_minutes).max(1);
128+
let remaining_minutes = usable_secs / 60;
129129
srun.arg(format!("--time={}", remaining_minutes));
130130
}
131131

@@ -343,6 +343,7 @@ impl AsyncCliCommand {
343343
resource_requirements
344344
.and_then(|rr| memory_string_to_bytes(&rr.memory).ok())
345345
.map(|b| b as u64)
346+
.filter(|&b| b > 0)
346347
} else {
347348
None
348349
};

src/server/api/workflows.rs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ impl WorkflowsApiImpl {
285285
,w.project
286286
,w.metadata
287287
,w.status_id
288+
,w.slurm_config
288289
,w.execution_config
289290
FROM workflow w
290291
INNER JOIN workflow_status ws ON w.status_id = ws.id
@@ -311,6 +312,7 @@ impl WorkflowsApiImpl {
311312
,project
312313
,metadata
313314
,status_id
315+
,slurm_config
314316
,execution_config
315317
FROM workflow
316318
"
@@ -474,7 +476,10 @@ impl WorkflowsApiImpl {
474476
project: record.get("project"),
475477
metadata: record.get("metadata"),
476478
status_id: Some(record.get("status_id")),
477-
slurm_config: None,
479+
slurm_config: record
480+
.try_get::<Option<String>, _>("slurm_config")
481+
.ok()
482+
.flatten(),
478483
execution_config: record
479484
.try_get::<Option<String>, _>("execution_config")
480485
.ok()
@@ -650,9 +655,10 @@ where
650655
project,
651656
metadata,
652657
status_id,
658+
slurm_config,
653659
execution_config
654660
)
655-
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
661+
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
656662
RETURNING rowid
657663
"#,
658664
body.name,
@@ -672,6 +678,7 @@ where
672678
body.project,
673679
body.metadata,
674680
status_result[0].id,
681+
body.slurm_config,
675682
body.execution_config,
676683
)
677684
.fetch_all(&mut *tx)
@@ -947,7 +954,7 @@ where
947954
project: row.project,
948955
metadata: row.metadata,
949956
status_id: Some(row.status_id),
950-
slurm_config: None,
957+
slurm_config: row.slurm_config,
951958
execution_config: row.execution_config,
952959
},
953960
)),
@@ -1256,8 +1263,9 @@ where
12561263
enable_ro_crate = COALESCE($10, enable_ro_crate),
12571264
project = COALESCE($11, project),
12581265
metadata = COALESCE($12, metadata),
1259-
execution_config = COALESCE($13, execution_config)
1260-
WHERE id = $14
1266+
slurm_config = COALESCE($13, slurm_config),
1267+
execution_config = COALESCE($14, execution_config)
1268+
WHERE id = $15
12611269
"#,
12621270
body.name,
12631271
body.description,
@@ -1271,6 +1279,7 @@ where
12711279
enable_ro_crate_int,
12721280
body.project,
12731281
body.metadata,
1282+
body.slurm_config,
12741283
body.execution_config,
12751284
id
12761285
)

tests/test_execution_config.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ fn test_effective_mode_slurm() {
200200
#[test]
201201
#[serial]
202202
fn test_effective_mode_auto_without_slurm_env() {
203+
// Save original value
204+
let original = std::env::var("SLURM_JOB_ID").ok();
203205
// Ensure SLURM_JOB_ID is not set
204206
// SAFETY: Using serial_test to prevent concurrent access to env vars
205207
unsafe {
@@ -212,11 +214,20 @@ fn test_effective_mode_auto_without_slurm_env() {
212214
};
213215
assert_eq!(config.effective_mode(), ExecutionMode::Direct);
214216
assert!(!config.use_srun());
217+
218+
// Restore original value
219+
if let Some(val) = original {
220+
unsafe {
221+
std::env::set_var("SLURM_JOB_ID", val);
222+
}
223+
}
215224
}
216225

217226
#[test]
218227
#[serial]
219228
fn test_effective_mode_auto_with_slurm_env() {
229+
// Save original value
230+
let original = std::env::var("SLURM_JOB_ID").ok();
220231
// Set SLURM_JOB_ID temporarily
221232
// SAFETY: Using serial_test to prevent concurrent access to env vars
222233
unsafe {
@@ -230,9 +241,13 @@ fn test_effective_mode_auto_with_slurm_env() {
230241
assert_eq!(config.effective_mode(), ExecutionMode::Slurm);
231242
assert!(config.use_srun());
232243

233-
// Clean up
244+
// Restore original value
234245
unsafe {
235-
std::env::remove_var("SLURM_JOB_ID");
246+
if let Some(val) = original {
247+
std::env::set_var("SLURM_JOB_ID", val);
248+
} else {
249+
std::env::remove_var("SLURM_JOB_ID");
250+
}
236251
}
237252
}
238253

tests/test_srun_args.rs

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -433,32 +433,46 @@ fn test_srun_with_end_time() {
433433

434434
#[test]
435435
#[serial]
436-
fn test_srun_with_end_time_minimum_one_minute() {
436+
fn test_srun_with_end_time_insufficient_time_rejected() {
437437
let temp_dir = TempDir::new().unwrap();
438-
let args_log = setup_srun_env(&temp_dir);
438+
let _args_log = setup_srun_env(&temp_dir);
439439
let rr = make_rr("compute", 2, "4g", 1);
440440

441-
// Set end_time 10 seconds from now (should clamp to minimum 1 minute)
441+
// Set end_time 10 seconds from now — with 60s default headroom, usable time
442+
// is negative, so the launch should be refused.
442443
let end_time = chrono::Utc::now() + chrono::Duration::seconds(10);
443444

444-
let args = run_and_capture_srun_args(
445-
&temp_dir,
446-
&args_log,
445+
let job = make_job(1, "echo hello");
446+
let mut cmd = AsyncCliCommand::new(job);
447+
448+
let result = cmd.start(
449+
temp_dir.path(),
450+
1,
451+
1,
452+
1,
453+
None,
454+
"http://localhost:8080/torc-service/v1",
447455
Some(&rr),
448456
true,
449457
ExecutionMode::Slurm,
450458
false,
451459
Some(end_time),
452460
None,
453-
)
454-
.expect("srun should have been invoked");
461+
60,
462+
None,
463+
);
455464

456465
cleanup_srun_env();
457466

458467
assert!(
459-
args.contains("--time=1"),
460-
"Should clamp to --time=1 for near-expired end_time: {}",
461-
args
468+
result.is_err(),
469+
"Should refuse to launch srun step when insufficient time remains"
470+
);
471+
let err_msg = result.unwrap_err().to_string();
472+
assert!(
473+
err_msg.contains("Refusing to launch"),
474+
"Error should mention refusing to launch: {}",
475+
err_msg
462476
);
463477
}
464478

0 commit comments

Comments
 (0)