|
2 | 2 | ClusterManager for a Slurm allocation
|
3 | 3 |
|
4 | 4 | Represents the resources available within a slurm allocation created by salloc/sbatch.
|
5 |
| -The environment variables `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object. |
| 5 | +Either `SLURM_JOB_ID` or `SLURM_JOBID`, together with `SLURM_NTASKS`, must be defined in the environment to construct this object. |
6 | 6 | """
|
mutable struct SlurmManager <: ClusterManager
    # Slurm job id, parsed from `SLURM_JOB_ID` (preferred) or `SLURM_JOBID`.
    jobid::Int
    # Number of tasks, parsed from `SLURM_NTASKS`.
    ntasks::Int
    verbose::Bool
    launch_timeout::Float64
    # Seconds to sleep after `srun` has exited; read by the finalizer that
    # `launch` registers on the manager.
    srun_post_exit_sleep::Float64
    # Handle of the `srun` process. Initialised to `nothing` by the constructor
    # and left untyped so that it can be assigned later.
    srun_proc

    # SlurmManager(; verbose=false, launch_timeout=60.0, srun_post_exit_sleep=0.01)
    #
    # Build a manager from the environment of the current Slurm allocation.
    #
    # Throws an `ErrorException` when neither `SLURM_JOB_ID` nor `SLURM_JOBID` is
    # defined, or when `SLURM_NTASKS` is not defined. Throws an `ArgumentError`
    # (from `parse`) when one of those variables does not hold an integer.
    function SlurmManager(;verbose=false, launch_timeout=60.0, srun_post_exit_sleep=0.01)
        # `SLURM_JOB_ID` takes precedence; `SLURM_JOBID` is accepted as a fallback.
        jobid_str =
            if haskey(ENV, "SLURM_JOB_ID")
                ENV["SLURM_JOB_ID"]
            elseif haskey(ENV, "SLURM_JOBID")
                ENV["SLURM_JOBID"]
            else
                error(
                    "SlurmManager must be constructed inside a slurm allocation environment.\n" *
                    "SLURM_JOB_ID or SLURM_JOBID must be defined.\n"
                )
            end

        ntasks_str =
            if haskey(ENV, "SLURM_NTASKS")
                ENV["SLURM_NTASKS"]
            else
                # Raise a proper exception (not a bare String), matching the branch above.
                error(
                    "SlurmManager must be constructed inside a slurm environment with a specified number of tasks.\n" *
                    "SLURM_NTASKS must be defined.\n"
                )
            end

        # Parse the values resolved above rather than re-reading ENV, so that a job
        # which only defines `SLURM_JOB_ID` is handled correctly.
        jobid = parse(Int, jobid_str)
        ntasks = parse(Int, ntasks_str)

        return new(jobid, ntasks, verbose, launch_timeout, srun_post_exit_sleep, nothing)
    end
end
|
25 | 45 |
|
@@ -64,6 +84,9 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
|
64 | 84 | # avoids "Job step aborted: Waiting up to 32 seconds for job step to finish" message
|
65 | 85 | finalizer(manager) do manager
|
66 | 86 | wait(manager.srun_proc)
|
| 87 | + # need to sleep briefly here to make sure that srun exit is recorded by slurm daemons |
| 88 | + # TODO find a way to wait on the condition directly instead of just sleeping |
| 89 | + sleep(manager.srun_post_exit_sleep) |
67 | 90 | end
|
68 | 91 |
|
69 | 92 | catch e
|
|
0 commit comments