Skip to content

Commit bac5df7

Browse files
author
mohdsaid497566
committed
Frontier Benchmarking (#453)
1 parent b263cf3 commit bac5df7

File tree

3 files changed

+126
-1
lines changed

3 files changed

+126
-1
lines changed

.github/workflows/bench.yml

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
with:
1919
filters: ".github/file-filter.yml"
2020

21-
self:
21+
phoenix:
2222
name: Georgia Tech | Phoenix (NVHPC)
2323
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
2424
needs: file-changes
@@ -66,3 +66,52 @@ jobs:
6666
pr/build/benchmarks/*
6767
master/bench-${{ matrix.device }}.*
6868
master/build/benchmarks/*
69+
70+
frontier:
71+
name: Oak Ridge | Frontier (AMD ROCm)
72+
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
73+
needs: file-changes
74+
strategy:
75+
matrix:
76+
device: ['cpu', 'gpu']
77+
runs-on:
78+
group: frontier
79+
labels: olcf
80+
timeout-minutes: 1400
81+
env:
82+
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
83+
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
84+
steps:
85+
- name: Clone - PR
86+
uses: actions/checkout@v4
87+
with:
88+
path: pr
89+
90+
- name: Clone - Master
91+
uses: actions/checkout@v4
92+
with:
93+
repository: MFlowCode/MFC
94+
ref: master
95+
path: master
96+
97+
- name: Bench (Master v. PR)
98+
run: |
99+
(cd pr && bash .github/workflows/frontier/submit-bench.sh .github/workflows/frontier/bench.sh ${{ matrix.device }}) &
100+
(cd master && bash .github/workflows/frontier/submit-bench.sh .github/workflows/frontier/bench.sh ${{ matrix.device }}) &
101+
wait %1 && wait %2
102+
103+
- name: Generate & Post Comment
104+
run: |
105+
(cd pr && . ./mfc.sh load -c f -m g)  # use the same cluster flag (-c f) as the Frontier batch script, not the Phoenix job's -c p
106+
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}.yaml ../pr/bench-${{ matrix.device }}.yaml)
107+
108+
- name: Archive Logs
109+
uses: actions/upload-artifact@v4
110+
if: always()
111+
with:
112+
name: logs-frontier-${{ matrix.device }}
113+
path: |
114+
pr/bench-${{ matrix.device }}.*
115+
pr/build/benchmarks/*
116+
master/bench-${{ matrix.device }}.*
117+
master/build/benchmarks/*
.github/workflows/frontier/bench.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
3+
n_ranks=12
4+
5+
if [ "$job_device" == "gpu" ]; then
6+
n_ranks=$(rocm-smi --showid | grep -Eo '^GPU\[[0-9]+\]' | sort -u | wc -l) # number of GPUs on node (Frontier is AMD/ROCm, so nvidia-smi is not available)
7+
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
8+
device_opts="--gpu -g $gpu_ids"
9+
fi
10+
11+
mkdir -p /storage/scratch1/6/sbryngelson3/mytmp_build # NOTE(review): hard-coded per-user scratch path — confirm it exists and is writable on Frontier
12+
export TMPDIR=/storage/scratch1/6/sbryngelson3/mytmp_build
13+
14+
if [ "$job_device" == "gpu" ]; then # '[' is a command: it needs spaces around its arguments and before the closing ']'
15+
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c frontier-bench $device_opts -n $n_ranks
16+
else
17+
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier-bench $device_opts -n $n_ranks
18+
fi
19+
20+
unset TMPDIR
.github/workflows/frontier/submit-bench.sh

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/bin/bash
2+
3+
set -e
4+
5+
usage() {
6+
echo "Usage: $0 [script.sh] [cpu|gpu]"
7+
}
8+
9+
if [ ! -z "$1" ]; then
10+
sbatch_script_contents=$(cat "$1") # quote the path so spaces or glob characters in it are not split/expanded
11+
else
12+
usage
13+
exit 1
14+
fi
15+
16+
if [ "$2" == "cpu" ]; then
17+
sbatch_device_opts="\
18+
#SBATCH -n 32 # Number of cores required"
19+
elif [ "$2" == "gpu" ]; then
20+
sbatch_device_opts="\
21+
#SBATCH -n 8 # Number of cores required"
22+
else
23+
usage
24+
exit 1
25+
fi
26+
27+
28+
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
29+
30+
sbatch <<EOT
31+
#!/bin/bash
32+
#SBATCH -JMFC-$job_slug # Job name
33+
#SBATCH -A CFD154 # charge account
34+
#SBATCH -N 1 # Number of nodes required
35+
$sbatch_device_opts
36+
#SBATCH -t 01:59:00 # Duration of the job (Ex: 15 mins)
37+
#SBATCH -o$job_slug.out # Combined output and error messages file
38+
#SBATCH -p extended # Extended partition for shorter queues
39+
#SBATCH -q debug # Use debug QOS - only one job per user allowed in queue!
40+
#SBATCH -W # Do not exit until the submitted job terminates.
41+
42+
set -e
43+
set -x
44+
45+
cd "\$SLURM_SUBMIT_DIR"
46+
echo "Running in \$(pwd):"
47+
48+
job_slug="$job_slug"
49+
job_device="$2"
50+
51+
. ./mfc.sh load -c f -m g
52+
53+
$sbatch_script_contents
54+
55+
EOT
56+

0 commit comments

Comments
 (0)