Skip to content

Commit f368fdc

Browse files
committed
actually fix slurm message at exit
- add brief sleep after srun_proc exit to allow the exit to be recorded by slurm daemons
1 parent 40a57d3 commit f368fdc

File tree

1 file changed

+29
-6
lines changed

1 file changed

+29
-6
lines changed

src/slurmmanager.jl

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,44 @@
22
ClusterManager for a Slurm allocation
33
44
Represents the resources available within a slurm allocation created by salloc/sbatch.
5-
The environment variables `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object.
5+
The environment variables `SLURM_JOB_ID` or `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object.
66
"""
77
mutable struct SlurmManager <: ClusterManager
    # Numeric Slurm job id, read from `SLURM_JOB_ID` (preferred) or `SLURM_JOBID`.
    jobid::Int
    # Number of tasks in the allocation, read from `SLURM_NTASKS`.
    ntasks::Int
    # Stored from the `verbose` keyword; presumably toggles extra output — confirm in `launch`.
    verbose::Bool
    # Stored from the `launch_timeout` keyword; presumably seconds — confirm in `launch`.
    launch_timeout::Float64
    # Seconds to sleep after `srun_proc` has exited, giving the slurm daemons time to
    # record the exit before the manager is finalized.
    srun_post_exit_sleep::Float64
    # Handle of the `srun` process. Initialised to `nothing` by the constructor and
    # assigned later outside this block, hence deliberately left untyped.
    srun_proc

    """
        SlurmManager(; verbose=false, launch_timeout=60.0, srun_post_exit_sleep=0.01)

    Construct a manager from the environment of the current slurm allocation.

    The job id is taken from `SLURM_JOB_ID`, falling back to `SLURM_JOBID`; the task
    count is taken from `SLURM_NTASKS`.

    # Throws
    - `ErrorException` if neither job-id variable is defined, or if `SLURM_NTASKS`
      is not defined.
    - `ArgumentError` (from `parse`) if a variable does not hold a valid integer.
    """
    function SlurmManager(; verbose=false, launch_timeout=60.0, srun_post_exit_sleep=0.01)
        jobid_str = if haskey(ENV, "SLURM_JOB_ID")
            ENV["SLURM_JOB_ID"]
        elseif haskey(ENV, "SLURM_JOBID")
            ENV["SLURM_JOBID"]
        else
            error("""
                SlurmManager must be constructed inside a slurm allocation environment.
                SLURM_JOB_ID or SLURM_JOBID must be defined.
                """)
        end

        ntasks_str = if haskey(ENV, "SLURM_NTASKS")
            ENV["SLURM_NTASKS"]
        else
            # `error` (not a bare `throw` of a String) so callers receive a real Exception.
            error("""
                SlurmManager must be constructed inside a slurm environment with a specified number of tasks.
                SLURM_NTASKS must be defined.
                """)
        end

        # Parse the values selected above rather than re-reading ENV, so the
        # SLURM_JOB_ID / SLURM_JOBID fallback is actually honoured.
        jobid = parse(Int, jobid_str)
        ntasks = parse(Int, ntasks_str)

        return new(jobid, ntasks, verbose, launch_timeout, srun_post_exit_sleep, nothing)
    end
end
2545

@@ -64,6 +84,9 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
6484
# avoids "Job step aborted: Waiting up to 32 seconds for job step to finish" message
6585
finalizer(manager) do manager
6686
wait(manager.srun_proc)
87+
# need to sleep briefly here to make sure that srun exit is recorded by slurm daemons
88+
# TODO find a way to wait on the condition directly instead of just sleeping
89+
sleep(manager.srun_post_exit_sleep)
6790
end
6891

6992
catch e

0 commit comments

Comments
 (0)