Skip to content

Commit 40a57d3

Browse files
committed
Fix Slurm message at exit
- Adds a finalizer that waits on srun_proc, ensuring that the worker job step terminates before the manager process exits
1 parent f6e84a2 commit 40a57d3

File tree

1 file changed

+12
-7
lines changed

1 file changed

+12
-7
lines changed

src/slurmmanager.jl

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@ ClusterManager for a Slurm allocation
44
Represents the resources available within a slurm allocation created by salloc/sbatch.
55
The environment variables `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object.
66
"""
7-
struct SlurmManager <: ClusterManager
7+
mutable struct SlurmManager <: ClusterManager
88
jobid::Int
99
ntasks::Int
1010
verbose::Bool
1111
launch_timeout::Float64
12+
srun_proc::IO
1213

1314
function SlurmManager(;verbose=false, launch_timeout=60.0)
1415
if !("SLURM_JOBID" in keys(ENV) && "SLURM_NTASKS" in keys(ENV))
@@ -29,21 +30,19 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
2930
exeflags = params[:exeflags]
3031

3132
srun_cmd = `srun -D $exehome $exename $exeflags --worker=$(cluster_cookie())`
32-
srun_proc = open(srun_cmd)
33+
manager.srun_proc = open(srun_cmd)
3334

3435
t = @async for i in 1:manager.ntasks
3536
manager.verbose && println("connecting to worker $i out of $(manager.ntasks)")
3637

37-
line = readline(srun_proc)
38+
line = readline(manager.srun_proc)
3839
m = match(r".*:(\d*)#(.*)", line)
3940
m === nothing && error("could not parse $line")
4041

4142
config = WorkerConfig()
4243
config.port = parse(Int, m[1])
4344
config.host = strip(m[2])
4445

45-
# Keep a reference to the proc, so it's properly closed once the last worker exits.
46-
config.userdata = srun_proc
4746
push!(instances_arr, config)
4847
notify(c)
4948
end
@@ -56,11 +55,17 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
5655
wait(t)
5756

5857
# redirect output
59-
@async while !eof(srun_proc)
60-
line = readline(srun_proc)
58+
@async while !eof(manager.srun_proc)
59+
line = readline(manager.srun_proc)
6160
println(line)
6261
end
6362

63+
# Wait for srun_proc to exit before the main program does, so that Slurm does not complain;
64+
# avoids "Job step aborted: Waiting up to 32 seconds for job step to finish" message
65+
finalizer(manager) do manager
66+
wait(manager.srun_proc)
67+
end
68+
6469
catch e
6570
println("Error launching Slurm job:")
6671
rethrow(e)

0 commit comments

Comments
 (0)