You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Wait for a set of jobs to complete. If a job fails, it will be rerun up to `reruns` times.
17
17
18
-
This function monitors the status of multiple jobs and handles failures by rerunning the failed jobs up to the specified number of `reruns`. It logs errors and job completion status, ensuring all jobs are completed before proceeding.
18
+
In addition to scheduler status, a job is only considered successful when the model writes its
19
+
`completed` checkpoint file. This makes restarts robust when jobs exit early due to time limits
20
+
and are requeued by the scheduler.
19
21
20
22
Arguments:
21
23
- `jobids`: Vector of job IDs.
@@ -39,7 +41,7 @@ function wait_for_jobs(
39
41
model_run_func;
40
42
verbose,
41
43
hpc_kwargs,
42
-
reruns =1,
44
+
reruns =0,
43
45
)
44
46
rerun_job_count =zeros(length(jobids))
45
47
completed_jobs =Set{Int}()
@@ -68,8 +70,15 @@ function wait_for_jobs(
68
70
push!(completed_jobs, m)
69
71
end
70
72
elseif job_success(jobid)
71
-
@info "Ensemble member $m complete"
72
-
push!(completed_jobs, m)
73
+
# Only mark success if the model has written its completion checkpoint
74
+
if model_completed(output_dir, iter, m)
75
+
@info "Ensemble member $m complete"
76
+
push!(completed_jobs, m)
77
+
else
78
+
# The scheduler may report COMPLETED for a batch script that requeued itself.
79
+
# Wait until the completion file exists.
80
+
@debug "Job $jobid completed but checkpoint not found yet for member $m"
81
+
end
73
82
end
74
83
end
75
84
sleep(5)
@@ -183,9 +192,26 @@ function generate_sbatch_directives(hpc_kwargs)
183
192
@assert haskey(hpc_kwargs, :time) "Slurm kwargs must include key :time"
0 commit comments