@@ -4,11 +4,12 @@ ClusterManager for a Slurm allocation
4
4
Represents the resources available within a Slurm allocation created by `salloc`/`sbatch`.
5
5
The environment variables `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object.
6
6
"""
7
- struct SlurmManager <: ClusterManager
7
+ mutable struct SlurmManager <: ClusterManager
8
8
jobid:: Int
9
9
ntasks:: Int
10
10
verbose:: Bool
11
11
launch_timeout:: Float64
12
+ srun_proc:: IO
12
13
13
14
function SlurmManager (;verbose= false , launch_timeout= 60.0 )
14
15
if ! (" SLURM_JOBID" in keys (ENV ) && " SLURM_NTASKS" in keys (ENV ))
@@ -29,21 +30,19 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
29
30
exeflags = params[:exeflags ]
30
31
31
32
srun_cmd = ` srun -D $exehome $exename $exeflags --worker=$(cluster_cookie ()) `
32
- srun_proc = open (srun_cmd)
33
+ manager . srun_proc = open (srun_cmd)
33
34
34
35
t = @async for i in 1 : manager. ntasks
35
36
manager. verbose && println (" connecting to worker $i out of $(manager. ntasks) " )
36
37
37
- line = readline (srun_proc)
38
+ line = readline (manager . srun_proc)
38
39
m = match (r" .*:(\d *)#(.*)" , line)
39
40
m === nothing && error (" could not parse $line " )
40
41
41
42
config = WorkerConfig ()
42
43
config. port = parse (Int, m[1 ])
43
44
config. host = strip (m[2 ])
44
45
45
- # Keep a reference to the proc, so it's properly closed once the last worker exits.
46
- config. userdata = srun_proc
47
46
push! (instances_arr, config)
48
47
notify (c)
49
48
end
@@ -56,11 +55,17 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
56
55
wait (t)
57
56
58
57
# redirect output
59
- @async while ! eof (srun_proc)
60
- line = readline (srun_proc)
58
+ @async while ! eof (manager . srun_proc)
59
+ line = readline (manager . srun_proc)
61
60
println (line)
62
61
end
63
62
63
+ # wait to make sure that srun_proc exits before main program to avoid slurm complaining
64
+ # avoids "Job step aborted: Waiting up to 32 seconds for job step to finish" message
65
+ finalizer (manager) do manager
66
+ wait (manager. srun_proc)
67
+ end
68
+
64
69
catch e
65
70
println (" Error launching Slurm job:" )
66
71
rethrow (e)
0 commit comments