Skip to content

Commit 0ed69c5

Browse files
authored
fix preempt and add walltime (#954)
1 parent f83b75a commit 0ed69c5

File tree

4 files changed

+14
-14
lines changed

4 files changed

+14
-14
lines changed

.github/workflows/frontier/submit-bench.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ sbatch <<EOT
3232
#SBATCH -A CFD154 # charge account
3333
#SBATCH -N 1 # Number of nodes required
3434
$sbatch_device_opts
35-
#SBATCH -t 01:59:00 # Duration of the job (Ex: 15 mins)
35+
#SBATCH -t 02:59:00 # Duration of the job (Ex: 15 mins)
3636
#SBATCH -o$job_slug.out # Combined output and error messages file
3737
#SBATCH -p extended # Extended partition for shorter queues
3838
#SBATCH -W # Do not exit until the submitted job terminates.

.github/workflows/frontier/submit.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ sbatch <<EOT
3333
#SBATCH -A CFD154 # charge account
3434
#SBATCH -N 1 # Number of nodes required
3535
$sbatch_device_opts
36-
#SBATCH -t 01:59:00 # Duration of the job (Ex: 15 mins)
36+
#SBATCH -t 02:59:00 # Duration of the job (Ex: 15 mins)
3737
#SBATCH -o$job_slug.out # Combined output and error messages file
3838
#SBATCH -p extended # Extended partition for shorter queues
3939
#SBATCH -W # Do not exit until the submitted job terminates.

.github/workflows/phoenix/submit-bench.sh

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,7 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}'
6969
EOT
7070
)
7171

72-
echo "🚀 Submitted SLURM job $JOBID"
72+
echo "Submitted: SLURM job $JOBID"
7373

7474
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
7575
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
@@ -86,22 +86,22 @@ while :; do
8686

8787
# If it’s one of SLURM’s terminal states, break immediately
8888
case "$STATE" in
89-
COMPLETED|FAILED|CANCELLED|TIMEOUT)
90-
echo " SLURM job $JOBID reached terminal state: $STATE"
89+
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
90+
echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
9191
break
9292
;;
9393
"")
94-
echo " SLURM job $JOBID no longer in queue; assuming finished"
94+
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
9595
break
9696
;;
9797
*)
98-
echo " SLURM job $JOBID state: $STATE"
98+
echo "Waiting: SLURM job $JOBID state: $STATE"
9999
sleep 10
100100
;;
101101
esac
102102
done
103103

104104
# Now retrieve the exit code and exit with it
105105
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
106-
echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
106+
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
107107
exit "$EXIT_CODE"

.github/workflows/phoenix/submit.sh

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}'
6262
EOT
6363
)
6464

65-
echo "🚀 Submitted SLURM job $JOBID"
65+
echo "Submitted: SLURM job $JOBID"
6666

6767
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
6868
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
@@ -79,22 +79,22 @@ while :; do
7979

8080
# If it’s one of SLURM’s terminal states, break immediately
8181
case "$STATE" in
82-
COMPLETED|FAILED|CANCELLED|TIMEOUT)
83-
echo " SLURM job $JOBID reached terminal state: $STATE"
82+
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
83+
echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
8484
break
8585
;;
8686
"")
87-
echo " SLURM job $JOBID no longer in queue; assuming finished"
87+
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
8888
break
8989
;;
9090
*)
91-
echo " SLURM job $JOBID state: $STATE"
91+
echo "Waiting: SLURM job $JOBID state: $STATE"
9292
sleep 10
9393
;;
9494
esac
9595
done
9696

9797
# Now retrieve the exit code and exit with it
9898
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
99-
echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
99+
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
100100
exit "$EXIT_CODE"

0 commit comments

Comments
 (0)