-#!/usr/bin/env bash
-set -euo pipefail
+#!/bin/bash
+
+set -e
 
 usage() {
-    echo "Usage: $0 [cpu|gpu]"
-    exit 1
+    echo "Usage: $0 <script.sh> <cpu|gpu>"
 }
 
-[[ $# -eq 1 ]] || usage
-
-device="$1"
-job_slug="bench-$1"
-
-# read the body of the user script
-sbatch_body=$(<"$sbatch_script")
-
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J MFC-benchmark-$device       # job name
-#SBATCH --account=gts-sbryngelson3     # account
-#SBATCH -N1                            # nodes
-#SBATCH -t 02:00:00                    # walltime
-#SBATCH -q embers                      # QOS
-#SBATCH -o $job_slug.out               # stdout+stderr
-#SBATCH --mem-per-cpu=2G               # default mem (overridden below)
-"
+if [ -n "$1" ]; then
+    sbatch_script_contents=$(cat "$1")
+else
+    usage
+    exit 1
+fi
 
-# CPU vs GPU overrides
-if [[ "$device" == "cpu" ]]; then
-    sbatch_device_opts="\
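+# CPU vs GPU overrides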
+sbatch_cpu_opts="\
 #SBATCH -p cpu-small                   # partition
 #SBATCH --ntasks-per-node=24           # Number of cores per node required
 #SBATCH --mem-per-cpu=2G               # Memory per core\
 "
-elif [[ "$device" == "gpu" ]]; then
-    sbatch_device_opts="\
+
+sbatch_gpu_opts="\
 #SBATCH -CL40S
 #SBATCH --ntasks-per-node=4            # Number of cores per node required
 #SBATCH -G2\
 "
+
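+# pick the SBATCH overrides for the requested device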
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
+    usage
+    exit 1
 fi
 
-# submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-	#!/usr/bin/env bash
-	${sbatch_common_opts}
-	${sbatch_device_opts}
-
-	export job_slug="${job_slug}"
-	export device="${device}"
-
-	echo "Job slug is:" $job_slug
-	echo "Device is:" $device
-
-	set -e -x
-
-	cd "\$SLURM_SUBMIT_DIR"
-	echo "Running in \$(pwd):"
-
-	# load your modules & env
-	. ./mfc.sh load -c p -m $device
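+# job slug: script basename, ".sh" stripped, non-alphanumerics replaced with dashes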
+job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2"
 
-	# user script contents
-	n_ranks=12
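+# note: unescaped $vars in this heredoc expand now, at submission time;
+# \$-escaped ones expand later, inside the running job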
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug                # Job name
+#SBATCH --account=gts-sbryngelson3     # charge account
+#SBATCH -N1                            # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 02:00:00                    # Duration of the job (Ex: 15 mins)
+#SBATCH -q embers                      # QOS Name
+#SBATCH -o$job_slug.out                # Combined output and error messages file
+#SBATCH -W                             # Do not exit until the submitted job terminates.
 
-	echo "My benchmarking device is:" $device
-	if [ "$device" = "gpu" ]; then
-		echo "Set device opts for GPU cases."
-		n_ranks=$(nvidia-smi -L | wc -l)         # number of GPUs on node
-		gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1)))  # 0,1,2,...,gpu_count-1
-		device_opts="--gpu -g $gpu_ids"
-	fi
+set -e
+set -x
 
-	tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
-	currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
-	mkdir -p $tmpbuild
-	mkdir -p $currentdir
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in \$(pwd):"
 
-	export TMPDIR=$currentdir
+job_slug="$job_slug"
+job_device="$2"
 
-	if [ "$device" = "gpu" ]; then
-		echo "running GPU benchmarks"
-		./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
-	elif [ "$device" = "cpu" ]; then
-		echo "running CPU benchmarks"
-		./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
-	else
-		echo "didn't find a device"
-		echo "device is" $device
-		exit 1
-	fi
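+# load your modules & env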
+. ./mfc.sh load -c p -m $2
 
-	sleep 10
-	rm -rf "$currentdir" || true
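+# user script contents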
+$sbatch_script_contents
 
-	unset TMPDIR
 EOT
-)
-
-echo "Submitted: SLURM job $JOBID"
-
-# if this wrapper is killed/canceled, make sure the SLURM job is cleaned up
-trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
-
-# ────────── Poll until SLURM job finishes ──────────
-while :; do
-    # Try sacct first
-    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
-
-    # Fall back to squeue if sacct is empty
-    if [[ -z "$STATE" ]]; then
-        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
-    fi
-
-    # If it's one of SLURM's terminal states, break immediately
-    case "$STATE" in
-        COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
-            echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
-            break
-            ;;
-        "")
-            echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
-            break
-            ;;
-        *)
-            echo "Waiting: SLURM job $JOBID state: $STATE"
-            sleep 10
-            ;;
-    esac
-done
 
-# Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
-echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
-exit "$EXIT_CODE"