-#!/usr/bin/env bash
-set -euo pipefail
+#!/bin/bash
+
+set -e
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
-    exit 1
 }
 
-[[ $# -eq 2 ]] || usage
-
-sbatch_script="$1"
-
-device="$2"
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
-
-# read the body of the user script
-sbatch_body=$(<"$sbatch_script")
-
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J shb-${sbatch_script%%.sh}-$device  # job name
-#SBATCH --account=gts-sbryngelson3            # account
-#SBATCH -N1                                   # nodes
-#SBATCH -t 02:00:00                           # walltime
-#SBATCH -q embers                             # QOS
-#SBATCH -o $job_slug.out                      # stdout+stderr
-#SBATCH --mem-per-cpu=2G                      # default mem (overridden below)
-"
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
 
-# CPU vs GPU overrides
-if [[ "$device" == "cpu" ]]; then
-    sbatch_device_opts="\
+sbatch_cpu_opts="\
 #SBATCH -p cpu-small                 # partition
 #SBATCH --ntasks-per-node=24         # Number of cores per node required
 #SBATCH --mem-per-cpu=2G             # Memory per core\
 "
-elif [[ "$device" == "gpu" ]]; then
-    sbatch_device_opts="\
+
+sbatch_gpu_opts="\
 #SBATCH -CL40S
 #SBATCH --ntasks-per-node=4          # Number of cores per node required
 #SBATCH -G2\
 "
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
+    usage
+    exit 1
 fi
 
-# submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-	#!/usr/bin/env bash
-	${sbatch_common_opts}
-	${sbatch_device_opts}
-
-	export job_slug="${job_slug}"
-	export device="${device}"
-
-	echo "Job slug is:" $job_slug
-	echo "Device is:" $device
-
-	set -e -x
-
-	cd "\$SLURM_SUBMIT_DIR"
-	echo "Running in \$(pwd):"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
 
-	# load your modules & env
-	. ./mfc.sh load -c p -m $device
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug              # Job name
+#SBATCH --account=gts-sbryngelson3   # charge account
+#SBATCH -N1                          # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 02:00:00                  # Duration of the job (Ex: 15 mins)
+#SBATCH -q embers                    # QOS Name
+#SBATCH -o$job_slug.out              # Combined output and error messages file
+#SBATCH -W                           # Do not exit until the submitted job terminates.
 
-	# user script contents
-	${sbatch_body}
-EOT
-)
+set -e
+set -x
 
-echo "Submitted: SLURM job $JOBID"
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
 
-# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
-trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
+job_slug="$job_slug"
+job_device="$2"
 
-# ────────── Poll until SLURM job finishes ──────────
-while :; do
-    # Try sacct first
-    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+. ./mfc.sh load -c p -m $2
 
-    # Fallback to squeue if sacct is empty
-    if [[ -z "$STATE" ]]; then
-        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
-    fi
+$sbatch_script_contents
 
-    # If it’s one of SLURM’s terminal states, break immediately
-    case "$STATE" in
-        COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
-            echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
-            break
-            ;;
-        "")
-            echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
-            break
-            ;;
-        *)
-            echo "Waiting: SLURM job $JOBID state: $STATE"
-            sleep 10
-            ;;
-    esac
-done
+EOT
 
-# Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
-echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
-exit "$EXIT_CODE"
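Note on the net change: the wrapper no longer captures the job ID from sbatch's output, installs an scancel cleanup trap, polls sacct/squeue for a terminal state, or looks up the exit code afterwards. All of that is replaced by the #SBATCH -W directive (long form --wait), which makes sbatch itself block until the job terminates and exit with the job's own exit code; combined with the wrapper's set -e, a failed job now fails the wrapper directly. A minimal sketch of a caller relying on that behavior (the script name here is hypothetical):

    # sbatch --wait blocks until the job finishes and propagates
    # the job's exit code, so no polling loop is needed.
    if sbatch --wait ./my-job.sbatch; then
        echo "SLURM job succeeded"
    else
        echo "SLURM job failed (exit code $?)"
    fi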
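A related subtlety: the heredoc delimiter is unquoted (<<EOT), so $job_slug, $sbatch_device_opts, and $sbatch_script_contents are expanded by the wrapper while it generates the job script, whereas the escaped \$SLURM_SUBMIT_DIR is passed through literally and expanded only when the job runs. The unescaped $(pwd) in the echo line is likewise evaluated at submission time, not inside the job; the two normally agree because SLURM_SUBMIT_DIR is the directory sbatch was invoked from. A short illustration of the two expansion times, assuming the same unquoted heredoc:

    sbatch <<EOT
    #!/bin/bash
    # Expanded now, while the wrapper builds the script:
    echo "slug is $job_slug"
    # Escaped, so expanded later, inside the running job:
    echo "submitted from \$SLURM_SUBMIT_DIR"
    EOT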