@@ -2,102 +2,109 @@
 set -euo pipefail
 
 usage() {
-  echo "Usage: $0 [cpu|gpu]"
-  exit 1
+  echo "Usage: $0 [cpu|gpu]"
+  exit 1
 }
 
 [[ $# -eq 1 ]] || usage
 
 device="$1"
-job_slug="test-$1"
-
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J MFC-test-$device          # job name
-#SBATCH --account=gts-sbryngelson3   # account
-#SBATCH -N1                          # nodes
-#SBATCH -t 03:00:00                  # walltime
-#SBATCH -q embers                    # QOS
-#SBATCH -o $job_slug.out             # stdout+stderr
-#SBATCH --mem-per-cpu=2G             # default mem (overridden below)
-"
-
-# CPU vs GPU overrides
+job_slug="test-$device"
+
+# Build sbatch arguments (use CLI args instead of #SBATCH lines)
+sbatch_args=(
+  -J "MFC-test-$device"
+  --account=gts-sbryngelson3
+  -N 1
+  -t 03:00:00
+  -q embers
+  -o "${job_slug}.out"
+  --mem-per-cpu=2G
+  # Export variables for the job environment
+  --export=ALL,job_slug="$job_slug",device="$device"
+)
+
 if [[ "$device" == "cpu" ]]; then
-  sbatch_device_opts="\
-#SBATCH -p cpu-small
-#SBATCH --ntasks-per-node=24
-"
+  sbatch_args+=(
+    -p cpu-small
+    --ntasks-per-node=24
+  )
 elif [[ "$device" == "gpu" ]]; then
-  sbatch_device_opts="\
-#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4
-#SBATCH -G2
-"
+  sbatch_args+=(
+    -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
+    --ntasks-per-node=4
+    -G 2
+  )
 else
   usage
 fi
 
 # submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-	#!/usr/bin/env bash
-	${sbatch_common_opts}
-	${sbatch_device_opts}
-
-	export job_slug="${job_slug}"
-	export device="${device}"
-
-	echo "Job slug is:" $job_slug
-	echo "Device is:" $device
-
-	set -e -x
-
-	cd "\$SLURM_SUBMIT_DIR"
-	echo "Running in \$(pwd):"
+JOBID=$(
+  sbatch "${sbatch_args[@]}" <<'EOT' | awk '{print $4}'
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
 
-	# load your modules & env
-	. ./mfc.sh load -c p -m $device
+echo "Job slug is: $job_slug"
+echo "Device is: $device"
 
-	# user script contents
-	export tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
-	export currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
-	mkdir -p $tmpbuild
-	mkdir -p $currentdir
-	export TMPDIR=$currentdir
+cd "$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd)"
 
-	n_test_threads=8
+# load your modules & env
+. ./mfc.sh load -c p -m "$device"
 
-	export build_opts=""
-	if [ "$device" = "gpu" ]; then
-	    export build_opts="--gpu"
-	fi
-	echo "build_opts =" $build_opts
+# user script contents
+tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
+mkdir -p "$tmpbuild"
+currentdir="$tmpbuild/run-$(( RANDOM % 900 ))"
+mkdir -p "$currentdir"
+export TMPDIR="$currentdir"
 
-	if [[ "$device" == "cpu" ]]; then
-	    echo "CPU BUILD"
-	elif [[ "$device" == "gpu" ]]; then
-	    echo "GPU BUILD"
-	else
-	    exit 1
-	fi
+n_test_threads=8
+build_opts=""
+if [[ "$device" == "gpu" ]]; then
+  build_opts="--gpu"
+fi
+echo "build_opts = $build_opts"
 
-	exit 1
+if [[ "$device" == "cpu" ]]; then
+  echo "CPU BUILD"
+elif [[ "$device" == "gpu" ]]; then
+  echo "GPU BUILD"
+else
+  echo "Unknown device: $device" >&2
+  exit 1
+fi
 
-	./mfc.sh test --dry-run -j $n_test_threads $build_opts
+# Dry run (kept from your original)
+./mfc.sh test --dry-run -j "$n_test_threads" $build_opts
 
-	if [ "$device" = "gpu" ]; then
-	    export gpu_count=$(nvidia-smi -L | wc -l)         # number of GPUs on node
-	    export gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1)))  # 0,1,2,...,gpu_count-1
-	    export device_opts="-g $gpu_ids"
-	    export n_test_threads=`expr $gpu_count \* 2`
-	fi
+# GPU-specific runtime options
+device_opts=""
+if [[ "$device" == "gpu" ]]; then
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    gpu_count=$(nvidia-smi -L | wc -l)
+  else
+    gpu_count=0
+  fi
 
-	./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
+  if [[ "$gpu_count" -gt 0 ]]; then
+    gpu_ids=$(seq -s ' ' 0 $(( gpu_count - 1 )))
+    device_opts="-g $gpu_ids"
+    n_test_threads=$(( gpu_count * 2 ))
+  else
+    echo "No GPUs detected; continuing without -g list"
+    device_opts=""
+  fi
+fi
 
-	sleep 10
-	rm -rf "$currentdir" || true
+./mfc.sh test --max-attempts 3 -a -j "$n_test_threads" ${device_opts:-} -- -c phoenix
 
-	unset TMPDIR
+sleep 10
+rm -rf "$currentdir" || true
+unset TMPDIR
 EOT
 )
 
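A note on the pattern this change adopts: options live in a bash array and are passed on the sbatch command line, while the quoted 'EOT' delimiter stops the heredoc from expanding at submit time, so the job script sees only what --export injects. A minimal sketch of that pattern, with a placeholder job name and a made-up payload_msg variable (neither appears in the actual change):

sbatch_args=(
  -J demo-job                       # placeholder name, illustrative only
  -t 00:05:00
  --export=ALL,payload_msg="hello"  # payload_msg is made up for this sketch
)
JOBID=$(sbatch "${sbatch_args[@]}" <<'EOT' | awk '{print $4}'
#!/usr/bin/env bash
echo "payload_msg is: $payload_msg"  # set by --export, not by heredoc expansion
EOT
)
echo "submitted job $JOBID"

On success sbatch prints "Submitted batch job <id>", which is why awk '{print $4}' recovers the job ID here and in the script above.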
@@ -134,6 +141,9 @@ while :; do
 done
 
 # Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
+# (grace period for accounting lag; fall back to 1 if sacct returns nothing)
+sleep 2
+EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1 || true)
+EXIT_CODE=${EXIT_CODE:-1}
 echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
 exit "$EXIT_CODE"
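Two reading aids for this last hunk: sacct reports ExitCode in the form "returncode:signal", so cut -d: -f1 keeps only the return code; and the while :; do ... done loop named in the hunk header is the usual wait-for-completion poll, whose body lies outside this diff. A hypothetical sketch of such a poll, assuming only standard squeue flags:

while :; do
  state=$(squeue -j "$JOBID" -h -o '%T' 2>/dev/null || true)  # e.g. PENDING, RUNNING
  if [[ -z "$state" ]]; then
    break  # job has left the queue; sacct takes over from here
  fi
  echo "Job $JOBID is $state"
  sleep 30
done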