65 changes: 65 additions & 0 deletions .github/scripts/monitor_slurm_job.sh
@@ -0,0 +1,65 @@
#!/bin/bash
# Monitor a SLURM job and stream its output in real-time
# Usage: monitor_slurm_job.sh <job_id> <output_file>

set -e

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>"
    exit 1
fi

job_id="$1"
output_file="$2"

echo "Submitted batch job $job_id"
echo "Monitoring output file: $output_file"

# Wait for the file to appear (check job status if it takes a while)
echo "Waiting for job to start..."
while [ ! -f "$output_file" ]; do
    # Check if the job failed to start
    if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
        echo "ERROR: Job $job_id finished without creating output file"
        exit 1
    fi
    sleep 5
done

echo "=== Streaming output for job $job_id ==="
# Stream output while the job runs
tail -f "$output_file" &
tail_pid=$!

# Wait for the job to complete, with retry logic for transient squeue failures
squeue_failures=0
while true; do
    if squeue -j "$job_id" &>/dev/null; then
        squeue_failures=0
    else
        squeue_failures=$((squeue_failures + 1))
        # Allow a few transient failures before concluding the job is done
        if [ $squeue_failures -ge 3 ]; then
            break
        fi
    fi
    sleep 5
done

# Stop tailing
kill $tail_pid 2>/dev/null || true

echo ""
echo "=== Final output ==="
cat "$output_file"

# Check exit status
exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0")
if [ "$exit_code" != "0:0" ]; then
    echo "ERROR: Job $job_id failed with exit code $exit_code"
    exit 1
fi

echo "Job $job_id completed successfully"
exit 0
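For reference, a minimal sketch of how the new script is meant to be driven. The job script name is hypothetical; `sbatch --parsable` prints only the job id, and SLURM writes stdout to `slurm-<jobid>.out` by default unless `-o/--output` overrides it:

```bash
# Submit a batch script and capture its job id (script name is illustrative)
job_id=$(sbatch --parsable my_bench_job.sh)

# Stream the job's output; a non-zero ExitCode makes the CI step fail
bash .github/scripts/monitor_slurm_job.sh "$job_id" "slurm-${job_id}.out"
```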

49 changes: 44 additions & 5 deletions .github/workflows/bench.yml
@@ -23,7 +23,7 @@ jobs:
      filters: ".github/file-filter.yml"

  self:
-    name: "${{ matrix.name }} (${{ matrix.device }})"
+    name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
    needs: file-changes
    strategy:
@@ -73,7 +73,7 @@ jobs:
    runs-on:
      group: ${{ matrix.group }}
      labels: ${{ matrix.labels }}
-    timeout-minutes: 1400
+    timeout-minutes: 480
    env:
      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
@@ -99,9 +99,48 @@

      - name: Bench (Master v. PR)
        run: |
-          (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-          (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-          wait %1 && wait %2
+          set -e
+
+          # Function to submit and monitor using the extracted script
+          submit_and_monitor() {
+            local dir=$1
+            local device=$2
+            local interface=$3
+            local cluster=$4
+
+            cd "$dir"
+
+            # Submit job
+            submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
+              .github/workflows/$cluster/bench.sh $device $interface 2>&1)
+
+            job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "")
+            job_slug="bench-$device-$interface"
+            output_file="${job_slug}.out"
+
+            if [ -z "$job_id" ]; then
+              echo "ERROR: Failed to submit job"
+              echo "$submit_output"
+              return 1
+            fi
+
+            # Use the monitoring script
+            bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
+          }
+
+          # Run both jobs with monitoring
+          (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
+          pr_pid=$!
+
+          (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
+          master_pid=$!
+
+          wait $pr_pid && pr_exit=$? || pr_exit=$?
+          wait $master_pid && master_exit=$? || master_exit=$?
+
+          if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then
+            exit 1
+          fi

      - name: Generate & Post Comment
        run: |
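A note on the `wait` lines above: with `set -e`, a bare `wait "$pid"` on a failed background job would abort the step before the second job's status is collected. Wrapping `wait` in a `&& ... || ...` list records the exit code in both the success and failure paths without tripping `set -e`. A standalone sketch of the idiom, with subshells standing in for the two submitted jobs:

```bash
set -e

(exit 3) & pr_pid=$!       # stand-in for the pr benchmark job
(exit 0) & master_pid=$!   # stand-in for the master benchmark job

# Neither line aborts the script: `wait` runs inside a && / || list,
# so set -e does not fire, and $? is captured either way.
wait $pr_pid && pr_exit=$? || pr_exit=$?
wait $master_pid && master_exit=$? || master_exit=$?

echo "pr=$pr_exit master=$master_exit"   # prints: pr=3 master=0
```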
35 changes: 26 additions & 9 deletions .github/workflows/test.yml
@@ -93,23 +93,40 @@ jobs:
      OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }}

  self:
-    name: Self Hosted
+    name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
    if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
    needs: file-changes
    continue-on-error: false
-    timeout-minutes: 1400
+    timeout-minutes: 480
    strategy:
      matrix:
-        device: ['gpu']
-        interface: ['acc', 'omp']
-        lbl: ['gt', 'frontier']
        include:
-          - device: 'cpu'
+          # Phoenix (GT)
+          - lbl: 'gt'
+            cluster_name: 'Georgia Tech | Phoenix'
+            device: 'gpu'
+            interface: 'acc'
+          - lbl: 'gt'
+            cluster_name: 'Georgia Tech | Phoenix'
+            device: 'gpu'
+            interface: 'omp'
+          - lbl: 'gt'
+            cluster_name: 'Georgia Tech | Phoenix'
+            device: 'cpu'
            interface: 'none'
-            lbl: 'gt'
-          - device: 'cpu'
+          # Frontier (ORNL)
+          - lbl: 'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'acc'
+          - lbl: 'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'omp'
+          - lbl: 'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'cpu'
            interface: 'none'
-            lbl: 'frontier'
    runs-on:
      group: phoenix
      labels: ${{ matrix.lbl }}
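The renamed jobs rely on the `&&`/`||` ternary emulation available in GitHub Actions expressions: `matrix.interface != 'none' && format('-{0}', matrix.interface) || ''` appends `-acc` or `-omp` to the device only when an interface is set. A rough bash analogue of the same naming logic, with illustrative values:

```bash
cluster_name="Oak Ridge | Frontier"   # hypothetical matrix values
device="gpu"
interface="acc"                       # 'acc', 'omp', or 'none'

suffix=""
if [ "$interface" != "none" ]; then
    suffix="-$interface"
fi
echo "$cluster_name ($device$suffix)"   # -> Oak Ridge | Frontier (gpu-acc)
```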
104 changes: 87 additions & 17 deletions toolchain/mfc/bench.py
@@ -1,4 +1,4 @@
-import os, sys, uuid, subprocess, dataclasses, typing, math
+import os, sys, uuid, subprocess, dataclasses, typing, math, traceback

import rich.table

@@ -16,6 +16,7 @@ class BenchCase:
    path: str
    args: typing.List[str]

+# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def bench(targets = None):
    if targets is None:
        targets = ARG("targets")
@@ -36,6 +37,10 @@ def bench(targets = None):
        case.args = case.args + ARG("--")
        case.path = os.path.abspath(case.path)

+        # Validate case file exists early
+        if not os.path.exists(case.path):
+            raise MFCException(f"Benchmark case file not found: {case.path}")
+
    results = {
        "metadata": {
            "invocation": sys.argv[1:],
@@ -44,6 +49,8 @@
        "cases": {},
    }

+    failed_cases = []
+
    for i, case in enumerate(CASES):
        summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
        log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
@@ -54,21 +61,80 @@
        cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]")
        cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")

-        with open(log_filepath, "w") as log_file:
-            system(
-                ["./mfc.sh", "run", case.path, "--case-optimization"] +
-                ["--targets"] + [t.name for t in targets] +
-                ["--output-summary", summary_filepath] +
-                case.args +
-                ["--", "--gbpp", ARG('mem')],
-                stdout=log_file,
-                stderr=subprocess.STDOUT)
-
-        results["cases"][case.slug] = {
-            "description": dataclasses.asdict(case),
-            "output_summary": file_load_yaml(summary_filepath),
-        }
+        try:
+            with open(log_filepath, "w") as log_file:
+                result = system(
+                    ["./mfc.sh", "run", case.path, "--case-optimization"] +
+                    ["--targets"] + [t.name for t in targets] +
+                    ["--output-summary", summary_filepath] +
+                    case.args +
+                    ["--", "--gbpp", str(ARG('mem'))],
+                    stdout=log_file,
+                    stderr=subprocess.STDOUT)
+
+            # Check return code (handle CompletedProcess or int defensively)
+            rc = result.returncode if hasattr(result, "returncode") else result
+            if rc != 0:
+                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
+                cons.print(f"[bold red]  Check log at: {log_filepath}[/bold red]")
+                failed_cases.append(case.slug)
+                continue
+
+            # Validate summary file exists
+            if not os.path.exists(summary_filepath):
+                cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
+                cons.print(f"[bold red]  Expected: {summary_filepath}[/bold red]")
+                failed_cases.append(case.slug)
+                continue
+
+            # Load summary
+            summary = file_load_yaml(summary_filepath)
+
+            # Validate all targets have required data
+            validation_failed = False
+            for target in targets:
+                if target.name not in summary:
+                    cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
+                    validation_failed = True
+                    break
+
+                if "exec" not in summary[target.name]:
+                    cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
+                    validation_failed = True
+                    break
+
+                if target.name == "simulation" and "grind" not in summary[target.name]:
+                    cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
+                    validation_failed = True
+                    break
+
+            if validation_failed:
+                failed_cases.append(case.slug)
+                continue
+
+            # Add to results
+            results["cases"][case.slug] = {
+                "description": dataclasses.asdict(case),
+                "output_summary": summary,
+            }
+            cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
+
+        except Exception as e:
+            cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
+            failed_cases.append(case.slug)
+        finally:
+            cons.unindent()
+
+    # Report results
+    if failed_cases:
+        cons.print()
+        cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
+        for slug in failed_cases:
+            cons.print(f"  - {slug}")
+        cons.print()
+        raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
+
+    # Write output
    file_dump_yaml(ARG("output"), results)

    cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
@@ -140,8 +206,12 @@ def diff():
                if grind_time_value <0.95:
                    cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
                    err = 1
-            except Exception as _:
-                pass
+            except Exception as e:
+                cons.print(
+                    f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n"
+                    f"{traceback.format_exc()}"
+                )
+                err = 1

        table.add_row(f"[magenta]{slug}[/magenta]", *speedups)