Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
bac5df7
Frontier Benchmarking (#453)
Jun 11, 2025
819e599
fixed frontier setup
Jun 11, 2025
17db70d
reduced frontier to bench on GPU
Malmahrouqi3 Jun 12, 2025
b28167f
updated frontier job instructions
Malmahrouqi3 Jun 12, 2025
ae04e14
fixed syntax in yaml file
Malmahrouqi3 Jun 12, 2025
8fb75b6
matrix.device syntax error addressed
Malmahrouqi3 Jun 12, 2025
57da396
another fix
Malmahrouqi3 Jun 12, 2025
0432f1f
fix
Malmahrouqi3 Jun 12, 2025
08052ea
matrix.device
Malmahrouqi3 Jun 12, 2025
b66403b
removed matrix.device - flattened
Malmahrouqi3 Jun 12, 2025
e49d55f
removed leading to syntax error
Malmahrouqi3 Jun 12, 2025
03034d7
replaced matrix.name to matrix.cluster to avoid syntax error
Malmahrouqi3 Jun 12, 2025
02688d9
Update bench.yml
Malmahrouqi3 Jun 16, 2025
c6258a6
just some ()
Malmahrouqi3 Jun 16, 2025
11fcf9e
Merge branch 'master' into frontier-CI2
Malmahrouqi3 Jun 17, 2025
4cbab63
Update bench.yml
Malmahrouqi3 Jun 17, 2025
51d34ae
Merge branch 'master' into frontier-CI2
sbryngelson Jun 18, 2025
a8268f3
Update submit-bench.sh
sbryngelson Jun 18, 2025
8dcf100
Update submit-bench.sh
Malmahrouqi3 Jun 18, 2025
bde0c17
Merge branch 'master' into frontier-CI2
sbryngelson Jun 18, 2025
def52bc
Merge branch 'master' into frontier-CI2
sbryngelson Jun 21, 2025
88fcf35
Merge branch 'master' into frontier-CI2
sbryngelson Jun 21, 2025
ea02640
Merge branch 'master' into frontier-CI2
sbryngelson Jun 21, 2025
f705d7f
Normalize line endings to UNIX format
Malmahrouqi3 Jun 22, 2025
53eec50
removed commented Phoenix part
Malmahrouqi3 Jun 22, 2025
93e0fac
removed file changes check
Malmahrouqi3 Jun 22, 2025
c7360eb
undo phoenix stuff
Malmahrouqi3 Jun 22, 2025
cc318bc
Revert "undo phoenix stuff"
Malmahrouqi3 Jun 22, 2025
5c8b925
Revert "Normalize line endings to UNIX format"
Malmahrouqi3 Jun 22, 2025
3c06357
some stuff
Malmahrouqi3 Jun 22, 2025
c6b1ecf
removed parts
Malmahrouqi3 Jun 22, 2025
3317bdb
Revert "Update submit-bench.sh"
Malmahrouqi3 Jun 22, 2025
27aa35b
undo things in test.yml
Malmahrouqi3 Jun 22, 2025
77b30e3
Merge branch 'master' into frontier-CI2
Malmahrouqi3 Jun 22, 2025
6158e12
fixing
sbryngelson Jun 23, 2025
5c6acc0
add build script
wilfonba Jun 25, 2025
32a8292
Merge branch 'master' into frontier-CI2
sbryngelson Jun 28, 2025
b842f21
Merge branch 'master' into frontier-CI2
sbryngelson Jul 3, 2025
405c39c
Update bench.yml
sbryngelson Jul 3, 2025
98437d2
Update submit-bench.sh
sbryngelson Jul 3, 2025
e7e7de8
Update bench.sh
sbryngelson Jul 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 56 additions & 1 deletion .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
with:
filters: ".github/file-filter.yml"

self:
phoenix:
name: Georgia Tech | Phoenix (NVHPC)
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
needs: file-changes
Expand Down Expand Up @@ -66,3 +66,58 @@ jobs:
pr/build/benchmarks/*
master/bench-${{ matrix.device }}.*
master/build/benchmarks/*

frontier:
  # Benchmarks the PR checkout against master on the Frontier runner group,
  # once per matrix device ('cpu' and 'gpu').
  name: Oak Ridge | Frontier (AMD ROCm)
  if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
  needs: file-changes
  strategy:
    matrix:
      device: ['cpu', 'gpu']
  runs-on:
    group: frontier
    labels: olcf
  timeout-minutes: 1400
  env:
    ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
    # Quoted: environment values are strings, not YAML booleans.
    ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: "true"
  steps:
    - name: Clone - PR
      uses: actions/checkout@v4
      with:
        path: pr

    - name: Clone - Master
      uses: actions/checkout@v4
      with:
        repository: MFlowCode/MFC
        ref: master
        path: master

    # Both checkouts are built concurrently; the step fails if either build fails.
    - name: Setup & Build
      run: |
        (cd pr && bash .github/workflows/frontier/build.sh ${{ matrix.device }}) &
        (cd master && bash .github/workflows/frontier/build.sh ${{ matrix.device }}) &
        wait %1 && wait %2

    # Both benchmark jobs are submitted concurrently; submit-bench.sh passes
    # `-W` to sbatch, so each background command blocks until its job ends.
    - name: Bench (Master v. PR)
      run: |
        (cd pr && bash .github/workflows/frontier/submit-bench.sh .github/workflows/frontier/bench.sh ${{ matrix.device }}) &
        (cd master && bash .github/workflows/frontier/submit-bench.sh .github/workflows/frontier/bench.sh ${{ matrix.device }}) &
        wait %1 && wait %2

    - name: Generate & Post Comment
      run: |
        # Use the same `-c f` selection as the Frontier job script
        # (frontier/submit-bench.sh), and run the load and bench_diff in ONE
        # subshell: an environment loaded in its own subshell is discarded
        # before the next command runs.
        (cd pr && . ./mfc.sh load -c f -m g && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}.yaml ../pr/bench-${{ matrix.device }}.yaml)

    - name: Archive Logs
      uses: actions/upload-artifact@v4
      # Upload logs even when an earlier step failed.
      if: always()
      with:
        name: logs-frontier-${{ matrix.device }}
        path: |
          pr/bench-${{ matrix.device }}.*
          pr/build/benchmarks/*
          master/bench-${{ matrix.device }}.*
          master/build/benchmarks/*
21 changes: 21 additions & 0 deletions .github/workflows/frontier/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
#
# Benchmark payload for the Frontier batch job.
#
# This text is embedded verbatim into the job script generated by
# submit-bench.sh, which defines the two variables read here:
#   job_device  "cpu" or "gpu"
#   job_slug    base name used for the "<job_slug>.yaml" results file
#
# Optional:
#   MFC_BENCH_TMPDIR  overrides the scratch directory exported as TMPDIR.

# Rank count for the CPU run; replaced by the GPU count for GPU runs.
n_ranks=12

# Start from a known-empty value so nothing inherited from the submitting
# environment leaks into the CPU command line.
device_opts=""

if [ "$job_device" = "gpu" ]; then
    # Unique numeric IDs taken from the first column of `rocm-smi --showid`,
    # joined by spaces (the list ends with a trailing space).
    gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
    n_ranks=$(echo "$gpus" | wc -w) # number of GPUs on node

    # Fail loudly instead of launching a benchmark with "-n 0" and a bare "-g".
    if [ "$n_ranks" -eq 0 ]; then
        echo "bench.sh: rocm-smi reported no GPUs; cannot run the GPU benchmark." >&2
        exit 1
    fi

    gpu_ids=${gpus% } # GPU IDs from rocm-smi, trailing separator dropped
    device_opts="--gpu -g $gpu_ids"
    bench_mem=12
    bench_jobs=$n_ranks
else
    bench_mem=1
    bench_jobs=$(nproc)
fi

# NOTE(review): the default below is a hard-coded, user-specific absolute
# path; confirm it exists on the machine running this job, or set
# MFC_BENCH_TMPDIR to a writable scratch location.
bench_tmpdir="${MFC_BENCH_TMPDIR:-/storage/scratch1/6/sbryngelson3/mytmp_build}"
mkdir -p "$bench_tmpdir"
export TMPDIR="$bench_tmpdir"

# $device_opts is intentionally unquoted: it holds several space-separated
# arguments that must be split into separate words.
./mfc.sh bench --mem $bench_mem -j $bench_jobs -o "$job_slug.yaml" -- -c frontier-bench $device_opts -n $n_ranks

unset TMPDIR
56 changes: 56 additions & 0 deletions .github/workflows/frontier/submit-bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
#
# Wrap a benchmark script in a single-node Slurm batch job and submit it.
# Because of `#SBATCH -W`, sbatch (and therefore this script) does not return
# until the submitted job terminates.
#
# Arguments:
#   $1  path to the script whose contents become the body of the job
#   $2  device type: "cpu" or "gpu" (selects the `-n` task count)
#
# Exits with status 1 after printing the usage line when $1 is empty or $2 is
# not one of the two accepted values.

set -e

usage() {
    echo "Usage: $0 [script.sh] [cpu|gpu]"
}

if [ -z "$1" ]; then
    usage
    exit 1
fi

# Quoted so a path containing spaces or glob characters is read as one file.
sbatch_script_contents=$(cat "$1")

case "$2" in
    cpu)
        sbatch_device_opts="\
#SBATCH -n 32 # Number of cores required"
        ;;
    gpu)
        sbatch_device_opts="\
#SBATCH -n 8 # Number of cores required"
        ;;
    *)
        usage
        exit 1
        ;;
esac

# "<script name without .sh, non-alphanumerics replaced by '-'>-<device>",
# e.g. bench.sh + gpu -> bench-gpu. Used for the job name, the log file, and
# (inside the job) the results file name.
job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2"

# The heredoc delimiter is unquoted, so $job_slug, $2, $sbatch_device_opts and
# $sbatch_script_contents are substituted now, at submission time. Anything
# that must be evaluated inside the job is escaped with a backslash; the text
# substituted for $sbatch_script_contents is inserted as-is and not re-expanded.
sbatch <<EOT
#!/bin/bash
#SBATCH -JMFC-$job_slug # Job name
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 01:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -q debug # Use debug QOS - only one job per user allowed in queue!
#SBATCH -W # Do not exit until the submitted job terminates.

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$2"

. ./mfc.sh load -c f -m g

$sbatch_script_contents

EOT

Loading