Skip to content

Commit 725380b

Browse files
authored
Update submit.sh
1 parent c686945 commit 725380b

File tree

1 file changed

+62
-43
lines changed

1 file changed

+62
-43
lines changed
Lines changed: 62 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,83 @@
1-
#!/bin/bash
2-
3-
set -e
1+
#!/usr/bin/env bash
2+
set -euo pipefail
43

54
usage() {
65
echo "Usage: $0 [script.sh] [cpu|gpu]"
6+
exit 1
77
}
88

9-
if [ ! -z "$1" ]; then
10-
sbatch_script_contents=`cat $1`
11-
else
12-
usage
13-
exit 1
14-
fi
9+
[[ $# -eq 2 ]] || usage
10+
11+
sbatch_script="$1"
12+
device="$2"
13+
14+
# read the body of the user script
15+
sbatch_body=$(<"$sbatch_script")
1516

16-
sbatch_cpu_opts="\
17-
#SBATCH -p cpu-small # partition
18-
#SBATCH --ntasks-per-node=24 # Number of cores per node required
19-
#SBATCH --mem-per-cpu=2G # Memory per core\
17+
# common SBATCH directives
18+
sbatch_common_opts="\
19+
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
20+
#SBATCH --account=gts-sbryngelson3 # account
21+
#SBATCH -N1 # nodes
22+
#SBATCH -t 03:00:00 # walltime
23+
#SBATCH -q embers # QOS
24+
#SBATCH -o ${sbatch_script%%.sh}-$device.%j.out # stdout+stderr
25+
#SBATCH --mem-per-cpu=2G # memory per core (same for cpu and gpu jobs)
2026
"
2127

22-
sbatch_gpu_opts="\
28+
# CPU vs GPU overrides
29+
if [[ "$device" == "cpu" ]]; then
30+
sbatch_device_opts="\
31+
#SBATCH -p cpu-small
32+
#SBATCH --ntasks-per-node=24
33+
"
34+
elif [[ "$device" == "gpu" ]]; then
35+
sbatch_device_opts="\
2336
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
24-
#SBATCH --ntasks-per-node=4 # Number of cores per node required
25-
#SBATCH -G2\
37+
#SBATCH --ntasks-per-node=4
38+
#SBATCH -G2
2639
"
27-
28-
if [ "$2" = "cpu" ]; then
29-
sbatch_device_opts="$sbatch_cpu_opts"
30-
elif [ "$2" = "gpu" ]; then
31-
sbatch_device_opts="$sbatch_gpu_opts"
3240
else
33-
usage
34-
exit 1
41+
usage
3542
fi
3643

37-
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
44+
# submit and capture the JobID
45+
JOBID=$(sbatch <<-EOT | awk '{print $4}'
46+
#!/usr/bin/env bash
47+
${sbatch_common_opts}
48+
${sbatch_device_opts}
3849
39-
sbatch <<EOT
40-
#!/bin/bash
41-
#SBATCH -Jshb-$job_slug # Job name
42-
#SBATCH --account=gts-sbryngelson3 # charge account
43-
#SBATCH -N1 # Number of nodes required
44-
$sbatch_device_opts
45-
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
46-
#SBATCH -q embers # QOS Name
47-
#SBATCH -o$job_slug.out # Combined output and error messages file
48-
#SBATCH -W # Do not exit until the submitted job terminates.
50+
set -e -x
4951
50-
set -e
51-
set -x
52+
cd "\$SLURM_SUBMIT_DIR"
53+
echo "Running in \$(pwd):"
5254
53-
cd "\$SLURM_SUBMIT_DIR"
54-
echo "Running in $(pwd):"
55+
# load your modules & env
56+
. ./mfc.sh load -c p -m $device
5557
56-
job_slug="$job_slug"
57-
job_device="$2"
58+
# user script contents
59+
${sbatch_body}
60+
EOT
61+
)
5862

59-
. ./mfc.sh load -c p -m $2
63+
echo "🚀 Submitted SLURM job $JOBID"
6064

61-
$sbatch_script_contents
65+
# EXIT trap: runs on every exit of this wrapper (not only when it is killed/canceled) and scancels the SLURM job; scancel errors are ignored
66+
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
6267

63-
EOT
68+
# poll until job finishes
69+
while true; do
70+
STATE=$(sacct -j "$JOBID" --noheader --format=State | head -1 | cut -d' ' -f1)
71+
echo "⏳ SLURM job $JOBID state: $STATE"
72+
case "$STATE" in
73+
COMPLETED|FAILED|CANCELLED|TIMEOUT) break ;;
74+
*) sleep 10 ;;
75+
esac
76+
done
77+
78+
# show final report
79+
sacct -j "$JOBID" --format=JobID,State,ExitCode,Elapsed,MaxRSS
6480

81+
# exit with the job's exit code: sacct prints ExitCode as <exitcode>:<signal>, so keep the part left of the colon
82+
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
83+
exit "$EXIT_CODE"

0 commit comments

Comments
 (0)