Skip to content

Commit a3c741a

Browse files
committed
fix again
1 parent 230bf05 commit a3c741a

File tree

1 file changed

+83
-74
lines changed

1 file changed

+83
-74
lines changed

.github/workflows/phoenix/submit.sh

Lines changed: 83 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -2,102 +2,109 @@
22
set -euo pipefail
33

44
usage() {
5-
echo "Usage: $0 [cpu|gpu]"
6-
exit 1
5+
echo "Usage: $0 [cpu|gpu]"
6+
exit 1
77
}
88

99
[[ $# -eq 1 ]] || usage
1010

1111
device="$1"
12-
job_slug="test-$1"
13-
14-
# common SBATCH directives
15-
sbatch_common_opts="\
16-
#SBATCH -J MFC-test-$device # job name
17-
#SBATCH --account=gts-sbryngelson3 # account
18-
#SBATCH -N1 # nodes
19-
#SBATCH -t 03:00:00 # walltime
20-
#SBATCH -q embers # QOS
21-
#SBATCH -o $job_slug.out # stdout+stderr
22-
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
23-
"
24-
25-
# CPU vs GPU overrides
12+
job_slug="test-$device"
13+
14+
# Build sbatch arguments (use CLI args instead of #SBATCH lines)
15+
sbatch_args=(
16+
-J "MFC-test-$device"
17+
--account=gts-sbryngelson3
18+
-N 1
19+
-t 03:00:00
20+
-q embers
21+
-o "${job_slug}.out"
22+
--mem-per-cpu=2G
23+
# Export variables for the job environment
24+
--export=ALL,job_slug="$job_slug",device="$device"
25+
)
26+
2627
if [[ "$device" == "cpu" ]]; then
27-
sbatch_device_opts="\
28-
#SBATCH -p cpu-small
29-
#SBATCH --ntasks-per-node=24
30-
"
28+
sbatch_args+=(
29+
-p cpu-small
30+
--ntasks-per-node=24
31+
)
3132
elif [[ "$device" == "gpu" ]]; then
32-
sbatch_device_opts="\
33-
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
34-
#SBATCH --ntasks-per-node=4
35-
#SBATCH -G2
36-
"
33+
sbatch_args+=(
34+
-p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
35+
--ntasks-per-node=4
36+
-G 2
37+
)
3738
else
3839
usage
3940
fi
4041

4142
# submit and capture the JobID
42-
JOBID=$(sbatch <<-EOT | awk '{print $4}'
43-
#!/usr/bin/env bash
44-
${sbatch_common_opts}
45-
${sbatch_device_opts}
46-
47-
export job_slug="${job_slug}"
48-
export device="${device}"
49-
50-
echo "Job slug is:" $job_slug
51-
echo "Device is:" $device
52-
53-
set -e -x
54-
55-
cd "\$SLURM_SUBMIT_DIR"
56-
echo "Running in \$(pwd):"
43+
JOBID=$(
44+
sbatch "${sbatch_args[@]}" <<'EOT' | awk '{print $4}'
45+
#!/usr/bin/env bash
46+
set -euo pipefail
47+
set -x
5748
58-
# load your modules & env
59-
. ./mfc.sh load -c p -m $device
49+
echo "Job slug is: $job_slug"
50+
echo "Device is: $device"
6051
61-
# user script contents
62-
export tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
63-
export currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
64-
mkdir -p $tmpbuild
65-
mkdir -p $currentdir
66-
export TMPDIR=$currentdir
52+
cd "$SLURM_SUBMIT_DIR"
53+
echo "Running in $(pwd)"
6754
68-
n_test_threads=8
55+
# load your modules & env
56+
. ./mfc.sh load -c p -m "$device"
6957
70-
export build_opts=""
71-
if [ "$device" = "gpu" ]; then
72-
export build_opts="--gpu"
73-
fi
74-
echo "build_opts =" $build_opts
58+
# user script contents
59+
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
60+
mkdir -p "$tmpbuild"
61+
currentdir="$tmpbuild/run-$(( RANDOM % 900 ))"
62+
mkdir -p "$currentdir"
63+
export TMPDIR="$currentdir"
7564
76-
if [[ "$device" == "cpu" ]]; then
77-
echo "CPU BUILD"
78-
elif [[ "$device" == "gpu" ]]; then
79-
echo "GPU BUILD"
80-
else
81-
exit 1
82-
fi
65+
n_test_threads=8
66+
build_opts=""
67+
if [[ "$device" == "gpu" ]]; then
68+
build_opts="--gpu"
69+
fi
70+
echo "build_opts = $build_opts"
8371
84-
exit 1
72+
if [[ "$device" == "cpu" ]]; then
73+
echo "CPU BUILD"
74+
elif [[ "$device" == "gpu" ]]; then
75+
echo "GPU BUILD"
76+
else
77+
echo "Unknown device: $device" >&2
78+
exit 1
79+
fi
8580
86-
./mfc.sh test --dry-run -j $n_test_threads $build_opts
81+
# Dry run (kept from the original script)
82+
./mfc.sh test --dry-run -j "$n_test_threads" $build_opts
8783
88-
if [ "$device" = "gpu" ]; then
89-
export gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
90-
export gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1
91-
export device_opts="-g $gpu_ids"
92-
export n_test_threads=`expr $gpu_count \* 2`
93-
fi
84+
# GPU-specific runtime options
85+
device_opts=""
86+
if [[ "$device" == "gpu" ]]; then
87+
if command -v nvidia-smi >/dev/null 2>&1; then
88+
gpu_count=$(nvidia-smi -L | wc -l)
89+
else
90+
gpu_count=0
91+
fi
9492
95-
./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
93+
if [[ "$gpu_count" -gt 0 ]]; then
94+
gpu_ids=$(seq -s ' ' 0 $(( gpu_count - 1 )))
95+
device_opts="-g $gpu_ids"
96+
n_test_threads=$(( gpu_count * 2 ))
97+
else
98+
echo "No GPUs detected; continuing without -g list"
99+
device_opts=""
100+
fi
101+
fi
96102
97-
sleep 10
98-
rm -rf "$currentdir" || true
103+
./mfc.sh test --max-attempts 3 -a -j "$n_test_threads" ${device_opts:-} -- -c phoenix
99104
100-
unset TMPDIR
105+
sleep 10
106+
rm -rf "$currentdir" || true
107+
unset TMPDIR
101108
EOT
102109
)
103110

@@ -134,6 +141,8 @@ while :; do
134141
done
135142

136143
# Now retrieve the exit code and exit with it
137-
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
144+
# (small grace period in case accounting lags)
145+
sleep 2
146+
# Look up the job's recorded exit status (the part before ':' in sacct's ExitCode field).
# `|| true` stops a failed lookup from aborting the script under `set -euo pipefail`;
# an empty answer is then treated as failure (1), so `exit` always receives one number.
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1 | tr -d '[:space:]' || true)
EXIT_CODE=${EXIT_CODE:-1}
138147
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
139148
exit "$EXIT_CODE"

0 commit comments

Comments
 (0)