Skip to content

Commit 5361530

Browse files
authored
Update submit-bench.sh
1 parent dcb0e34 commit 5361530

File tree

1 file changed

+76
-39
lines changed

1 file changed

+76
-39
lines changed
Lines changed: 76 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,101 @@
1-
#!/bin/bash
2-
3-
set -e
1+
#!/usr/bin/env bash
2+
set -euo pipefail
43

54
usage() {
65
# Both arguments are mandatory (the script requires exactly two), so show them
# as <required> rather than [optional]; usage errors belong on stderr.
echo "Usage: $0 <script.sh> <cpu|gpu>" >&2
6+
exit 1
77
}
88

9-
if [ ! -z "$1" ]; then
10-
sbatch_script_contents=`cat $1`
11-
else
12-
usage
13-
exit 1
14-
fi
9+
[[ $# -eq 2 ]] || usage
1510

16-
sbatch_cpu_opts="\
11+
sbatch_script="$1"
12+
device="$2"
13+
14+
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
15+
16+
# read the body of the user script
17+
sbatch_body=$(<"$sbatch_script")
18+
19+
# common SBATCH directives
20+
sbatch_common_opts="\
21+
#SBATCH -J shb-$job_slug # job name
22+
#SBATCH --account=gts-sbryngelson3 # account
23+
#SBATCH -N1 # nodes
24+
#SBATCH -t 02:00:00 # walltime
25+
#SBATCH -q embers # QOS
26+
#SBATCH -o $job_slug.out # stdout+stderr
27+
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
28+
"
29+
30+
# CPU vs GPU overrides
31+
if [[ "$device" == "cpu" ]]; then
32+
sbatch_device_opts="\
1733
#SBATCH -p cpu-small # partition
1834
#SBATCH --ntasks-per-node=24 # Number of cores per node required
1935
#SBATCH --mem-per-cpu=2G # Memory per core\
2036
"
21-
22-
sbatch_gpu_opts="\
37+
elif [[ "$device" == "gpu" ]]; then
38+
sbatch_device_opts="\
2339
#SBATCH -CL40S
2440
#SBATCH --ntasks-per-node=4 # Number of cores per node required
2541
#SBATCH -G2\
2642
"
27-
28-
if [ "$2" = "cpu" ]; then
29-
sbatch_device_opts="$sbatch_cpu_opts"
30-
elif [ "$2" = "gpu" ]; then
31-
sbatch_device_opts="$sbatch_gpu_opts"
3243
else
33-
usage
34-
exit 1
44+
usage
3545
fi
3646

37-
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
47+
# submit and capture the JobID
48+
JOBID=$(sbatch <<-EOT | awk '{print $4}'
49+
#!/usr/bin/env bash
50+
${sbatch_common_opts}
51+
${sbatch_device_opts}
3852
39-
sbatch <<EOT
40-
#!/bin/bash
41-
#SBATCH -Jshb-$job_slug # Job name
42-
#SBATCH --account=gts-sbryngelson3 # charge account
43-
#SBATCH -N1 # Number of nodes required
44-
$sbatch_device_opts
45-
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
46-
#SBATCH -q embers # QOS Name
47-
#SBATCH -o$job_slug.out # Combined output and error messages file
48-
#SBATCH -W # Do not exit until the submitted job terminates.
53+
set -e -x
4954
50-
set -e
51-
set -x
55+
cd "\$SLURM_SUBMIT_DIR"
56+
echo "Running in \$(pwd):"
5257
53-
cd "\$SLURM_SUBMIT_DIR"
54-
echo "Running in $(pwd):"
58+
# load your modules & env
59+
. ./mfc.sh load -c p -m $device
5560
56-
job_slug="$job_slug"
57-
job_device="$2"
61+
# user script contents
62+
${sbatch_body}
63+
EOT
64+
)
5865

59-
. ./mfc.sh load -c p -m $2
66+
echo "🚀 Submitted SLURM job $JOBID"
6067

61-
$sbatch_script_contents
68+
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
69+
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
6270

63-
EOT
71+
# ────────── Poll until SLURM job finishes ──────────
72+
while :; do
73+
# Try sacct first
74+
# Keep only the first word of the first row, so "CANCELLED by <uid>" becomes
# "CANCELLED". awk drains all of sacct's output (no SIGPIPE, unlike head), and
# `|| true` stops a failing sacct from aborting the wrapper under
# `set -euo pipefail`, so the squeue fallback below still gets its turn.
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | awk 'NR == 1 { print $1 }' || true)
75+
76+
# Fallback to squeue if sacct is empty
77+
if [[ -z "$STATE" ]]; then
78+
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
79+
fi
80+
81+
# If it’s one of SLURM’s terminal states, break immediately
82+
case "$STATE" in
83+
# Every state the job cannot leave on its own. sacct prints cancellations as
# "CANCELLED by <uid>", hence the trailing glob.
COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
84+
echo "✅ SLURM job $JOBID reached terminal state: $STATE"
85+
break
86+
;;
87+
"")
88+
echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
89+
break
90+
;;
91+
*)
92+
echo "⏳ SLURM job $JOBID state: $STATE"
93+
sleep 10
94+
;;
95+
esac
96+
done
6497

98+
# Now retrieve the exit code and exit with it
99+
#######################################
# Print the exit status this wrapper should propagate for a Slurm job.
# sacct reports ExitCode as "<code>:<signal>". Only the first row is used.
# Arguments: $1 - Slurm job id
# Outputs:   one integer on stdout:
#              <code>        if the job script exited non-zero
#              128+<signal>  if the job was terminated by a signal
#              1             if the job did not reach COMPLETED, or sacct
#                            returned no record at all
#              0             otherwise
#######################################
job_exit_code() {
  sacct -j "$1" --noheader --parsable2 --format=State,ExitCode |
    awk -F'|' '
      NR == 1 {
        seen = 1
        split($2, ec, ":")
        code = ec[1] + 0
        sig = ec[2] + 0
        if (code != 0) print code
        else if (sig != 0) print 128 + sig
        else if ($1 != "COMPLETED") print 1
        else print 0
      }
      END { if (!seen) print 1 }
    '
}
# `|| true`: a failing sacct must not abort under `set -e`; awk has already
# printed the fallback status in that case.
EXIT_CODE=$(job_exit_code "$JOBID" || true)
100+
echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
101+
exit "$EXIT_CODE"

0 commit comments

Comments
 (0)