7 changes: 4 additions & 3 deletions .github/workflows/bench.yml
@@ -97,10 +97,11 @@ jobs:
run: |
cat pr/bench-${{ matrix.device }}.* 2>/dev/null || true
cat master/bench-${{ matrix.device }}.* 2>/dev/null || true

- name: Archive Logs

# All other runners (non-Phoenix) just run without special env
- name: Archive Logs (Frontier)
if: always() && matrix.cluster != 'phoenix'
uses: actions/upload-artifact@v4
if: always()
with:
name: ${{ matrix.cluster }}-${{ matrix.device }}
path: |
5 changes: 3 additions & 2 deletions .github/workflows/phoenix/bench.sh
@@ -2,7 +2,8 @@

n_ranks=12

if [ "$job_device" = "gpu" ]; then
echo "My benchmarking device is:" $device
if [ "$device" = "gpu" ]; then
n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
device_opts="--gpu -g $gpu_ids"
@@ -15,7 +16,7 @@ mkdir -p $currentdir

export TMPDIR=$currentdir

if [ "$job_device" = "gpu" ]; then
if [ "$device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
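
For reference, a minimal standalone sketch of the GPU-detection logic bench.sh relies on above, assuming nvidia-smi is on the PATH; the hard-coded device value and the echoed output are illustrative only, not part of the PR:

#!/usr/bin/env bash
# Count the GPUs visible on this node and build the rank count / ID list from them.
device="gpu"                                   # normally exported by the submit wrapper
if [ "$device" = "gpu" ]; then
    n_ranks=$(nvidia-smi -L | wc -l)           # one MPI rank per GPU on the node
    gpu_ids=$(seq -s ' ' 0 $((n_ranks - 1)))   # "0 1 2 ..." up to n_ranks-1
    device_opts="--gpu -g $gpu_ids"
else
    n_ranks=12                                 # CPU default used by bench.sh
    device_opts=""
fi
echo "n_ranks=$n_ranks device_opts=$device_opts"   # e.g. "n_ranks=2 device_opts=--gpu -g 0 1"
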
121 changes: 82 additions & 39 deletions .github/workflows/phoenix/submit-bench.sh
@@ -1,64 +1,107 @@
#!/bin/bash

set -e
#!/usr/bin/env bash
set -euo pipefail

usage() {
echo "Usage: $0 [script.sh] [cpu|gpu]"
exit 1
}

if [ ! -z "$1" ]; then
sbatch_script_contents=`cat $1`
else
usage
exit 1
fi
[[ $# -eq 2 ]] || usage

sbatch_cpu_opts="\
sbatch_script="$1"

device="$2"
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"

# read the body of the user script
sbatch_body=$(<"$sbatch_script")

# common SBATCH directives
sbatch_common_opts="\
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
#SBATCH --account=gts-sbryngelson3 # account
#SBATCH -N1 # nodes
#SBATCH -t 02:00:00 # walltime
#SBATCH -q embers # QOS
#SBATCH -o $job_slug.out # stdout+stderr
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
"

# CPU vs GPU overrides
if [[ "$device" == "cpu" ]]; then
sbatch_device_opts="\
#SBATCH -p cpu-small # partition
#SBATCH --ntasks-per-node=24 # Number of cores per node required
#SBATCH --mem-per-cpu=2G # Memory per core\
"

sbatch_gpu_opts="\
elif [[ "$device" == "gpu" ]]; then
sbatch_device_opts="\
#SBATCH -CL40S
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
"

if [ "$2" = "cpu" ]; then
sbatch_device_opts="$sbatch_cpu_opts"
elif [ "$2" = "gpu" ]; then
sbatch_device_opts="$sbatch_gpu_opts"
else
usage
exit 1
usage
fi

job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
# submit and capture the JobID
JOBID=$(sbatch <<-EOT | awk '{print $4}'
#!/usr/bin/env bash
${sbatch_common_opts}
${sbatch_device_opts}

export job_slug="${job_slug}"
export device="${device}"

sbatch <<EOT
#!/bin/bash
#SBATCH -Jshb-$job_slug # Job name
#SBATCH --account=gts-sbryngelson3 # charge account
#SBATCH -N1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -W # Do not exit until the submitted job terminates.
echo "Job slug is:" $job_slug
echo "Device is:" $device

set -e -x

set -e
set -x
cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

cd "\$SLURM_SUBMIT_DIR"
echo "Running in $(pwd):"
# load your modules & env
. ./mfc.sh load -c p -m $device

job_slug="$job_slug"
job_device="$2"
# user script contents
${sbatch_body}
EOT
)

. ./mfc.sh load -c p -m $2
echo "🚀 Submitted SLURM job $JOBID"

$sbatch_script_contents
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT

EOT
# ────────── Poll until SLURM job finishes ──────────
while :; do
# Try sacct first
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)

# Fallback to squeue if sacct is empty
if [[ -z "$STATE" ]]; then
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
fi

# If it’s one of SLURM’s terminal states, break immediately
case "$STATE" in
COMPLETED|FAILED|CANCELLED|TIMEOUT)
echo "✅ SLURM job $JOBID reached terminal state: $STATE"
break
;;
"")
echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
break
;;
*)
echo "⏳ SLURM job $JOBID state: $STATE"
sleep 10
;;
esac
done

# Now retrieve the exit code and exit with it
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
exit "$EXIT_CODE"
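
For reference, the new submit-bench.sh follows a submit-then-poll pattern. A condensed sketch of that pattern is below, assuming a SLURM cluster with sbatch, sacct, squeue, and scancel available; job.sbatch is a placeholder script name, not a file from this PR:

#!/usr/bin/env bash
set -euo pipefail

# Submit and keep only the numeric job ID ("Submitted batch job 12345" -> 12345).
JOBID=$(sbatch job.sbatch | awk '{print $4}')

# If this wrapper is killed or canceled, make sure the SLURM job is cleaned up.
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT

# Poll accounting (falling back to the queue) until the job reaches a terminal state
# or disappears from the queue entirely.
while :; do
    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
    [[ -n "$STATE" ]] || STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
    case "$STATE" in
        COMPLETED|FAILED|CANCELLED|TIMEOUT|"") break ;;
        *) sleep 10 ;;
    esac
done

# Propagate the job's exit code ("0:0" -> "0") so CI sees the real result.
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
exit "$EXIT_CODE"
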
122 changes: 79 additions & 43 deletions .github/workflows/phoenix/submit.sh
@@ -1,64 +1,100 @@
#!/bin/bash

set -e
#!/usr/bin/env bash
set -euo pipefail

usage() {
echo "Usage: $0 [script.sh] [cpu|gpu]"
exit 1
}

if [ ! -z "$1" ]; then
sbatch_script_contents=`cat $1`
else
usage
exit 1
fi
[[ $# -eq 2 ]] || usage

sbatch_script="$1"
device="$2"

job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"

# read the body of the user script
sbatch_body=$(<"$sbatch_script")

sbatch_cpu_opts="\
#SBATCH -p cpu-small # partition
#SBATCH --ntasks-per-node=24 # Number of cores per node required
#SBATCH --mem-per-cpu=2G # Memory per core\
# common SBATCH directives
sbatch_common_opts="\
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
#SBATCH --account=gts-sbryngelson3 # account
#SBATCH -N1 # nodes
#SBATCH -t 03:00:00 # walltime
#SBATCH -q embers # QOS
#SBATCH -o $job_slug.out # stdout+stderr
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
"

sbatch_gpu_opts="\
# CPU vs GPU overrides
if [[ "$device" == "cpu" ]]; then
sbatch_device_opts="\
#SBATCH -p cpu-small
#SBATCH --ntasks-per-node=24
"
elif [[ "$device" == "gpu" ]]; then
sbatch_device_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --ntasks-per-node=4
#SBATCH -G2
"

if [ "$2" = "cpu" ]; then
sbatch_device_opts="$sbatch_cpu_opts"
elif [ "$2" = "gpu" ]; then
sbatch_device_opts="$sbatch_gpu_opts"
else
usage
exit 1
usage
fi

job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
# submit and capture the JobID
JOBID=$(sbatch <<-EOT | awk '{print $4}'
#!/usr/bin/env bash
${sbatch_common_opts}
${sbatch_device_opts}

sbatch <<EOT
#!/bin/bash
#SBATCH -Jshb-$job_slug # Job name
#SBATCH --account=gts-sbryngelson3 # charge account
#SBATCH -N1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -W # Do not exit until the submitted job terminates.
set -e -x

set -e
set -x
cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

cd "\$SLURM_SUBMIT_DIR"
echo "Running in $(pwd):"
# load your modules & env
. ./mfc.sh load -c p -m $device

job_slug="$job_slug"
job_device="$2"
# user script contents
${sbatch_body}
EOT
)

. ./mfc.sh load -c p -m $2
echo "🚀 Submitted SLURM job $JOBID"

$sbatch_script_contents
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT

EOT
# ────────── Poll until SLURM job finishes ──────────
while :; do
# Try sacct first
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)

# Fallback to squeue if sacct is empty
if [[ -z "$STATE" ]]; then
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
fi

# If it’s one of SLURM’s terminal states, break immediately
case "$STATE" in
COMPLETED|FAILED|CANCELLED|TIMEOUT)
echo "✅ SLURM job $JOBID reached terminal state: $STATE"
break
;;
"")
echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
break
;;
*)
echo "⏳ SLURM job $JOBID state: $STATE"
sleep 10
;;
esac
done

# Now retrieve the exit code and exit with it
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
exit "$EXIT_CODE"
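
For reference, both submit wrappers derive job_slug with the same basename/sed pipeline. A small sketch with an illustrative input path follows; the path and device shown here are examples, not values taken from the workflow:

#!/usr/bin/env bash
# Strip the directory and the .sh suffix, replace anything non-alphanumeric
# with '-', and append the device name.
script_path=".github/workflows/phoenix/bench.sh"   # example input
device="gpu"
job_slug="$(basename "$script_path" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$device"
echo "$job_slug"   # prints: bench-gpu
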
15 changes: 12 additions & 3 deletions .github/workflows/phoenix/test.sh
@@ -1,13 +1,19 @@
#!/bin/bash

tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
mkdir -p $tmpbuild
mkdir -p $currentdir
export TMPDIR=$currentdir

n_test_threads=8

build_opts=""
if [ "$job_device" = "gpu" ]; then
build_opts="--gpu"
fi

./mfc.sh test --dry-run -j 8 $build_opts

n_test_threads=8
./mfc.sh test --dry-run -j $n_test_threads $build_opts

if [ "$job_device" = "gpu" ]; then
gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
@@ -18,4 +24,7 @@ fi

./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix

sleep 10
rm -rf "$currentdir" || true

unset TMPDIR
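
For reference, test.sh points TMPDIR at a per-run scratch directory and removes it afterwards. A minimal sketch of that pattern is below; the scratch path comes from the script, while the placement of the test run is illustrative:

#!/usr/bin/env bash
# Build a throwaway scratch directory, point TMPDIR at it for the duration
# of the run, then clean it up.
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))   # pseudo-unique per-run subdirectory
mkdir -p "$currentdir"
export TMPDIR=$currentdir

# ... run the build and tests here ...

rm -rf "$currentdir" || true
unset TMPDIR
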
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
@@ -101,6 +101,7 @@ jobs:
group: phoenix
labels: ${{ matrix.lbl }}
env:
NODE_OPTIONS: ${{ matrix.lbl == 'gt' && '--max-old-space-size=2048' || '' }}
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
@@ -125,7 +126,7 @@

- name: Archive Logs
uses: actions/upload-artifact@v4
if: always()
if: matrix.lbl == 'frontier'
with:
name: logs-${{ strategy.job-index }}-${{ matrix.device }}
path: test-${{ matrix.device }}.out