Skip to content

Commit c8f1780

Browse files
committed
fix
1 parent 2c81b47 commit c8f1780

File tree

6 files changed

+95
-73
lines changed

6 files changed

+95
-73
lines changed

.github/workflows/bench.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,19 @@ jobs:
7979
wait %1 && wait %2
8080
8181
- name: Bench (Master v. PR)
82+
if: matrix.cluster == 'frontier'
8283
run: |
8384
(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) &
8485
(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) &
8586
wait %1 && wait %2
8687
88+
- name: Bench (Master v. PR)
89+
if: matrix.cluster == 'phoenix'
90+
run: |
91+
(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) &
92+
(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) &
93+
wait %1 && wait %2
94+
8795
- name: Generate & Post Comment
8896
run: |
8997
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)

.github/workflows/phoenix/bench.sh

Lines changed: 0 additions & 28 deletions
This file was deleted.

.github/workflows/phoenix/submit-bench.sh

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,14 @@
22
set -euo pipefail
33

44
usage() {
5-
echo "Usage: $0 [script.sh] [cpu|gpu]"
5+
echo "Usage: $0 [cpu|gpu]"
66
exit 1
77
}
88

9-
[[ $# -eq 2 ]] || usage
9+
[[ $# -eq 1 ]] || usage
1010

11-
sbatch_script="$1"
12-
13-
device="$2"
14-
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
11+
device="$1"
12+
job_slug="bench-$1"
1513

1614
# read the body of the user script
1715
sbatch_body=$(<"$sbatch_script")
@@ -65,7 +63,39 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}'
6563
. ./mfc.sh load -c p -m $device
6664
6765
# user script contents
68-
${sbatch_body}
66+
n_ranks=12
67+
68+
echo "My benchmarking device is:" $device
69+
if [ "$device" = "gpu" ]; then
70+
echo "Set device opts for GPU cases."
71+
n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node
72+
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
73+
device_opts="--gpu -g $gpu_ids"
74+
fi
75+
76+
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
77+
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
78+
mkdir -p $tmpbuild
79+
mkdir -p $currentdir
80+
81+
export TMPDIR=$currentdir
82+
83+
if [ "$device" = "gpu" ]; then
84+
echo "running GPU benchmarks"
85+
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
86+
elif [ "$device" = "cpu" ]; then
87+
echo "running CPU benchmarks"
88+
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
89+
else
90+
echo "didn't find a device"
91+
echo "device is" $device
92+
exit 1
93+
fi
94+
95+
sleep 10
96+
rm -rf "$currentdir" || true
97+
98+
unset TMPDIR
6999
EOT
70100
)
71101

.github/workflows/phoenix/submit.sh

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,14 @@
22
set -euo pipefail
33

44
usage() {
5-
echo "Usage: $0 [script.sh] [cpu|gpu]"
5+
echo "Usage: $0 [cpu|gpu]"
66
exit 1
77
}
88

9-
[[ $# -eq 2 ]] || usage
9+
[[ $# -eq 1 ]] || usage
1010

11-
sbatch_script="$1"
12-
device="$2"
13-
14-
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
11+
device="$1"
12+
job_slug="test-$1"
1513

1614
# read the body of the user script
1715
sbatch_body=$(<"$sbatch_script")
@@ -49,6 +47,12 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}'
4947
${sbatch_common_opts}
5048
${sbatch_device_opts}
5149
50+
export job_slug="${job_slug}"
51+
export device="${device}"
52+
53+
echo "Job slug is:" $job_slug
54+
echo "Device is:" $device
55+
5256
set -e -x
5357
5458
cd "\$SLURM_SUBMIT_DIR"
@@ -58,7 +62,45 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}'
5862
. ./mfc.sh load -c p -m $device
5963
6064
# user script contents
61-
${sbatch_body}
65+
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
66+
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
67+
mkdir -p $tmpbuild
68+
mkdir -p $currentdir
69+
export TMPDIR=$currentdir
70+
71+
n_test_threads=8
72+
73+
build_opts=""
74+
if [ "$device" = "gpu" ]; then
75+
build_opts="--gpu"
76+
fi
77+
echo "build_opts =" $build_opts
78+
79+
if [[ "$device" == "cpu" ]]; then
80+
echo "CPU BUILD"
81+
elif [[ "$device" == "gpu" ]]; then
82+
echo "GPU BUILD"
83+
else
84+
exit 1
85+
fi
86+
87+
exit 1
88+
89+
./mfc.sh test --dry-run -j $n_test_threads $build_opts
90+
91+
if [ "$device" = "gpu" ]; then
92+
gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
93+
gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1
94+
device_opts="-g $gpu_ids"
95+
n_test_threads=`expr $gpu_count \* 2`
96+
fi
97+
98+
./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
99+
100+
sleep 10
101+
rm -rf "$currentdir" || true
102+
103+
unset TMPDIR
62104
EOT
63105
)
64106

.github/workflows/phoenix/test.sh

Lines changed: 0 additions & 30 deletions
This file was deleted.

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ jobs:
111111

112112
- name: Build & Test
113113
if: matrix.lbl == 'gt'
114-
run: bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/test.sh ${{ matrix.device }}
114+
run: bash .github/workflows/phoenix/submit.sh ${{ matrix.device }}
115115

116116
- name: Build
117117
if: matrix.lbl == 'frontier'

0 commit comments

Comments
 (0)