#!/bin/bash
#
# Wrap a user script in a SLURM batch job and submit it with sbatch.
#
# Usage: $0 script.sh cpu|gpu
#
#   script.sh  File whose contents are appended verbatim to the generated
#              batch script.
#   cpu|gpu    Selects the partition / task / GPU directives and is passed
#              on to "mfc.sh load -m" inside the job.
#
# The generated job carries "#SBATCH -W", so sbatch does not return until
# the submitted job terminates. sbatch is the last command of this script,
# so its exit status becomes the exit status of the script.

set -euo pipefail

# Print the command-line synopsis on stderr. The caller chooses the exit code.
usage() {
    echo "Usage: $0 [script.sh] [cpu|gpu]" >&2
}

# Default to empty so that "set -u" does not abort before usage is shown.
sbatch_script="${1:-}"
device="${2:-}"

if [[ -z "$sbatch_script" ]]; then
    usage
    exit 1
fi

if [[ ! -r "$sbatch_script" ]]; then
    echo "$0: cannot read '$sbatch_script'" >&2
    exit 1
fi

# Body of the user script; spliced into the batch script further down.
sbatch_script_contents=$(<"$sbatch_script")

# Device-specific #SBATCH directives. Neither string ends with a newline:
# each is placed on its own line inside the heredoc below.
case "$device" in
    cpu)
        sbatch_device_opts="\
#SBATCH -p cpu-small               # partition
#SBATCH --ntasks-per-node=24       # Number of cores per node required
#SBATCH --mem-per-cpu=2G           # Memory per core"
        ;;
    gpu)
        sbatch_device_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4       # Number of cores per node required
#SBATCH -G2"
        ;;
    *)
        usage
        exit 1
        ;;
esac

# Job slug: script file name without directory or ".sh" suffix, every
# non-alphanumeric character replaced by "-", followed by "-<device>".
job_slug="${sbatch_script##*/}"
job_slug="${job_slug%.sh}"
job_slug="${job_slug//[^a-zA-Z0-9]/-}-${device}"

# The heredoc delimiter is unquoted on purpose: $job_slug, $device, the
# directive block and the user script are expanded now, at submit time.
# Anything that must be evaluated inside the running job is escaped (\$).
sbatch <<EOT
#!/bin/bash
#SBATCH -Jshb-$job_slug            # Job name
#SBATCH --account=gts-sbryngelson3 # charge account
#SBATCH -N1                        # Number of nodes required
$sbatch_device_opts
#SBATCH -t 03:00:00                # Duration of the job (Ex: 15 mins)
#SBATCH -q embers                  # QOS Name
#SBATCH -o$job_slug.out            # Combined output and error messages file
#SBATCH -W                         # Do not exit until the submitted job terminates.

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$device"

. ./mfc.sh load -c p -m $device

$sbatch_script_contents

EOT