Skip to content

Commit d81a0fd

Browse files
committed
back to old way
1 parent a3c741a commit d81a0fd

File tree

6 files changed

+133
-251
lines changed

6 files changed

+133
-251
lines changed

.github/workflows/bench.yml

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -79,19 +79,11 @@ jobs:
7979
wait %1 && wait %2
8080
8181
- name: Bench (Master v. PR)
82-
if: matrix.cluster == 'frontier'
8382
run: |
8483
(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) &
8584
(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) &
8685
wait %1 && wait %2
8786
88-
- name: Bench (Master v. PR)
89-
if: matrix.cluster == 'phoenix'
90-
run: |
91-
(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) &
92-
(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) &
93-
wait %1 && wait %2
94-
9587
- name: Generate & Post Comment
9688
run: |
9789
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)

.github/workflows/phoenix/bench.sh

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
2+
3+
n_ranks=12
4+
5+
if [ "$job_device" = "gpu" ]; then
6+
n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node
7+
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
8+
device_opts="--gpu -g $gpu_ids"
9+
fi
10+
11+
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
12+
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
13+
mkdir -p $tmpbuild
14+
mkdir -p $currentdir
15+
16+
export TMPDIR=$currentdir
17+
18+
if [ "$job_device" = "gpu" ]; then
19+
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
20+
else
21+
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
22+
fi
23+
24+
sleep 10
25+
rm -rf "$currentdir" || true
26+
27+
unset TMPDIR
Lines changed: 39 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -1,137 +1,64 @@
1-
#!/usr/bin/env bash
2-
set -euo pipefail
1+
#!/bin/bash
2+
3+
set -e
34

45
usage() {
5-
echo "Usage: $0 [cpu|gpu]"
6-
exit 1
6+
echo "Usage: $0 [script.sh] [cpu|gpu]"
77
}
88

9-
[[ $# -eq 1 ]] || usage
10-
11-
device="$1"
12-
job_slug="bench-$1"
13-
14-
# read the body of the user script
15-
sbatch_body=$(<"$sbatch_script")
16-
17-
# common SBATCH directives
18-
sbatch_common_opts="\
19-
#SBATCH -J MFC-benchmark-$device # job name
20-
#SBATCH --account=gts-sbryngelson3 # account
21-
#SBATCH -N1 # nodes
22-
#SBATCH -t 02:00:00 # walltime
23-
#SBATCH -q embers # QOS
24-
#SBATCH -o $job_slug.out # stdout+stderr
25-
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
26-
"
9+
if [ ! -z "$1" ]; then
10+
sbatch_script_contents=`cat $1`
11+
else
12+
usage
13+
exit 1
14+
fi
2715

28-
# CPU vs GPU overrides
29-
if [[ "$device" == "cpu" ]]; then
30-
sbatch_device_opts="\
16+
sbatch_cpu_opts="\
3117
#SBATCH -p cpu-small # partition
3218
#SBATCH --ntasks-per-node=24 # Number of cores per node required
3319
#SBATCH --mem-per-cpu=2G # Memory per core\
3420
"
35-
elif [[ "$device" == "gpu" ]]; then
36-
sbatch_device_opts="\
21+
22+
sbatch_gpu_opts="\
3723
#SBATCH -CL40S
3824
#SBATCH --ntasks-per-node=4 # Number of cores per node required
3925
#SBATCH -G2\
4026
"
27+
28+
if [ "$2" = "cpu" ]; then
29+
sbatch_device_opts="$sbatch_cpu_opts"
30+
elif [ "$2" = "gpu" ]; then
31+
sbatch_device_opts="$sbatch_gpu_opts"
4132
else
42-
usage
33+
usage
34+
exit 1
4335
fi
4436

45-
# submit and capture the JobID
46-
JOBID=$(sbatch <<-EOT | awk '{print $4}'
47-
#!/usr/bin/env bash
48-
${sbatch_common_opts}
49-
${sbatch_device_opts}
50-
51-
export job_slug="${job_slug}"
52-
export device="${device}"
53-
54-
echo "Job slug is:" $job_slug
55-
echo "Device is:" $device
56-
57-
set -e -x
58-
59-
cd "\$SLURM_SUBMIT_DIR"
60-
echo "Running in \$(pwd):"
61-
62-
# load your modules & env
63-
. ./mfc.sh load -c p -m $device
37+
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
6438

65-
# user script contents
66-
n_ranks=12
39+
sbatch <<EOT
40+
#!/bin/bash
41+
#SBATCH -Jshb-$job_slug # Job name
42+
#SBATCH --account=gts-sbryngelson3 # charge account
43+
#SBATCH -N1 # Number of nodes required
44+
$sbatch_device_opts
45+
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
46+
#SBATCH -q embers # QOS Name
47+
#SBATCH -o$job_slug.out # Combined output and error messages file
48+
#SBATCH -W # Do not exit until the submitted job terminates.
6749
68-
echo "My benchmarking device is:" $device
69-
if [ "$device" = "gpu" ]; then
70-
echo "Set device opts for GPU cases."
71-
n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node
72-
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
73-
device_opts="--gpu -g $gpu_ids"
74-
fi
50+
set -e
51+
set -x
7552
76-
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
77-
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
78-
mkdir -p $tmpbuild
79-
mkdir -p $currentdir
53+
cd "\$SLURM_SUBMIT_DIR"
54+
echo "Running in $(pwd):"
8055
81-
export TMPDIR=$currentdir
56+
job_slug="$job_slug"
57+
job_device="$2"
8258
83-
if [ "$device" = "gpu" ]; then
84-
echo "running GPU benchmarks"
85-
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
86-
elif [ "$device" = "cpu" ]; then
87-
echo "running CPU benchmarks"
88-
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
89-
else
90-
echo "didn't find a device"
91-
echo "device is" $device
92-
exit 1
93-
fi
59+
. ./mfc.sh load -c p -m $2
9460
95-
sleep 10
96-
rm -rf "$currentdir" || true
61+
$sbatch_script_contents
9762
98-
unset TMPDIR
9963
EOT
100-
)
101-
102-
echo "Submitted: SLURM job $JOBID"
103-
104-
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
105-
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
106-
107-
# ────────── Poll until SLURM job finishes ──────────
108-
while :; do
109-
# Try sacct first
110-
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
111-
112-
# Fallback to squeue if sacct is empty
113-
if [[ -z "$STATE" ]]; then
114-
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
115-
fi
116-
117-
# If it’s one of SLURM’s terminal states, break immediately
118-
case "$STATE" in
119-
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
120-
echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
121-
break
122-
;;
123-
"")
124-
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
125-
break
126-
;;
127-
*)
128-
echo "Waiting: SLURM job $JOBID state: $STATE"
129-
sleep 10
130-
;;
131-
esac
132-
done
13364

134-
# Now retrieve the exit code and exit with it
135-
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
136-
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
137-
exit "$EXIT_CODE"

0 commit comments

Comments (0)