84 changes: 79 additions & 5 deletions .github/workflows/bench.yml
@@ -23,7 +23,7 @@ jobs:
          filters: ".github/file-filter.yml"

  self:
-    name: "${{ matrix.name }} (${{ matrix.device }})"
+    name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
    needs: file-changes
    strategy:
@@ -73,7 +73,7 @@ jobs:
    runs-on:
      group: ${{ matrix.group }}
      labels: ${{ matrix.labels }}
-    timeout-minutes: 1400
+    timeout-minutes: 480
    env:
      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
@@ -99,9 +99,83 @@ jobs:

      - name: Bench (Master v. PR)
        run: |
-          (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-          (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-          wait %1 && wait %2
+          set -e
+
+          # Function to submit and monitor
+          submit_and_monitor() {
+            local dir=$1
+            local device=$2
+            local interface=$3
+            local cluster=$4
+
+            cd "$dir"
+
+            # Submit job
+            submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
+              .github/workflows/$cluster/bench.sh $device $interface 2>&1)
+
+            job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "")
+            job_slug="bench-$device-$interface"
+            output_file="${job_slug}.out"
+
+            if [ -z "$job_id" ]; then
+              echo "ERROR: Failed to submit job"
+              echo "$submit_output"
+              return 1
+            fi
+
+            echo "Submitted batch job $job_id"
+            echo "Monitoring output file: $output_file"
+
+            # Wait for file to appear (check job status if it takes a while)
+            echo "Waiting for job to start..."
+            while [ ! -f "$output_file" ]; do
+              # Check if job failed to start
+              if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
+                echo "ERROR: Job $job_id finished without creating output file"
+                return 1
+              fi
+              sleep 5
+            done
+
+            echo "=== Streaming output for job $job_id ==="
+            # Stream output while job runs
+            tail -f "$output_file" &
+            tail_pid=$!
+
+            # Wait for job to complete (will wait up to GitHub Actions timeout)
+            while squeue -j "$job_id" &>/dev/null; do
+              sleep 5
+            done
+
+            # Stop tailing
+            kill $tail_pid 2>/dev/null || true
+
+            echo ""
+            echo "=== Final output ==="
+            cat "$output_file"
+
+            # Check exit status
+            exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0")
+            if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then
+              echo "ERROR: Job $job_id failed with exit code $exit_code"
+              return 1
+            fi
+          }
+
+          # Run both jobs with monitoring
+          (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
+          pr_pid=$!
+
+          (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
+          master_pid=$!
+
+          wait $pr_pid && pr_exit=$? || pr_exit=$?
+          wait $master_pid && master_exit=$? || master_exit=$?
+
+          if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then
+            exit 1
+          fi

      - name: Generate & Post Comment
        run: |
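Note: the new `submit_and_monitor` shell function replaces the old fire-and-forget `wait %1 && wait %2` approach; it parses the Slurm job id from the submit script's output, polls `squeue` until the job leaves the queue while tailing its output file, then reads `scontrol`'s `ExitCode` field (reported as "rc:signal", which is why the script accepts both "0" and "0:0"). For prototyping the same flow outside CI, here is a minimal Python sketch; the function name `submit_and_wait` and the polling interval are illustrative, it assumes Slurm's `sbatch`/`squeue`/`scontrol` are on PATH, and, as in the shell version, how `squeue` reports recently finished jobs is site-dependent.

# Minimal sketch (not part of this PR): submit a Slurm job and block until
# it finishes, mirroring the submit_and_monitor logic in bench.yml.
import re
import subprocess
import time

def submit_and_wait(script: str, *args: str, poll_s: int = 5) -> int:
    out = subprocess.run(["sbatch", script, *args],
                         capture_output=True, text=True, check=True).stdout
    m = re.search(r"Submitted batch job (\d+)", out)
    if m is None:
        raise RuntimeError(f"could not parse a job id from: {out!r}")
    job_id = m.group(1)

    # Poll until squeue no longer lists the job (some clusters keep
    # finished jobs visible briefly).
    while subprocess.run(["squeue", "-j", job_id, "--noheader"],
                         capture_output=True, text=True).stdout.strip():
        time.sleep(poll_s)

    # scontrol reports ExitCode as "rc:signal"; treat a missing field as 0.
    show = subprocess.run(["scontrol", "show", "job", job_id],
                          capture_output=True, text=True).stdout
    rc = re.search(r"ExitCode=(\d+):(\d+)", show)
    return int(rc.group(1)) if rc else 0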
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -93,11 +93,11 @@ jobs:
      OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }}

  self:
-    name: Self Hosted
+    name: "${{ matrix.lbl == 'gt' && 'Georgia Tech | Phoenix' || 'Oak Ridge | Frontier' }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
    if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
    needs: file-changes
    continue-on-error: false
-    timeout-minutes: 1400
+    timeout-minutes: 480
    strategy:
      matrix:
        device: ['gpu']
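Both renamed jobs lean on the same GitHub Actions expression idiom: `cond && a || b` acts like a ternary as long as `a` is truthy, and `format('-{0}', matrix.interface)` appends an interface suffix only when one is set. A tiny Python sketch of how the new display names expand; the `lbl`/`interface` values below are hypothetical examples, not the workflow's full matrix.

# Sketch only: mirrors the name expression added in test.yml and bench.yml.
def job_name(lbl: str, device: str, interface: str) -> str:
    site = "Georgia Tech | Phoenix" if lbl == "gt" else "Oak Ridge | Frontier"
    suffix = f"-{interface}" if interface != "none" else ""
    return f"{site} ({device}{suffix})"

assert job_name("gt", "gpu", "none") == "Georgia Tech | Phoenix (gpu)"
assert job_name("frontier", "gpu", "acc") == "Oak Ridge | Frontier (gpu-acc)"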
101 changes: 85 additions & 16 deletions toolchain/mfc/bench.py
@@ -16,6 +16,7 @@ class BenchCase:
    path: str
    args: typing.List[str]

+# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def bench(targets = None):
    if targets is None:
        targets = ARG("targets")
@@ -36,6 +37,10 @@ def bench(targets = None):
        case.args = case.args + ARG("--")
        case.path = os.path.abspath(case.path)

+        # Validate case file exists early
+        if not os.path.exists(case.path):
+            raise MFCException(f"Benchmark case file not found: {case.path}")
+
    results = {
        "metadata": {
            "invocation": sys.argv[1:],
@@ -44,6 +49,8 @@ def bench(targets = None):
        "cases": {},
    }

+    failed_cases = []
+
    for i, case in enumerate(CASES):
        summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
        log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
@@ -54,21 +61,82 @@ def bench(targets = None):
        cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]")
        cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")

-        with open(log_filepath, "w") as log_file:
-            system(
-                ["./mfc.sh", "run", case.path, "--case-optimization"] +
-                ["--targets"] + [t.name for t in targets] +
-                ["--output-summary", summary_filepath] +
-                case.args +
-                ["--", "--gbpp", ARG('mem')],
-                stdout=log_file,
-                stderr=subprocess.STDOUT)
-
-        results["cases"][case.slug] = {
-            "description": dataclasses.asdict(case),
-            "output_summary": file_load_yaml(summary_filepath),
-        }
+        try:
+            with open(log_filepath, "w") as log_file:
+                result = system(
+                    ["./mfc.sh", "run", case.path, "--case-optimization"] +
+                    ["--targets"] + [t.name for t in targets] +
+                    ["--output-summary", summary_filepath] +
+                    case.args +
+                    ["--", "--gbpp", ARG('mem')],
+                    stdout=log_file,
+                    stderr=subprocess.STDOUT)
+
+            # Check return code
+            if result.returncode != 0:
+                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}")
+                cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
+                failed_cases.append(case.slug)
+                cons.unindent()
+                continue
+
+            # Validate summary file exists
+            if not os.path.exists(summary_filepath):
+                cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
+                cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]")
+                failed_cases.append(case.slug)
+                cons.unindent()
+                continue
+
+            # Load summary
+            summary = file_load_yaml(summary_filepath)
+
+            # Validate all targets have required data
+            validation_failed = False
+            for target in targets:
+                if target.name not in summary:
+                    cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
+                    validation_failed = True
+                    break
+
+                if "exec" not in summary[target.name]:
+                    cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
+                    validation_failed = True
+                    break
+
+                if target.name == "simulation" and "grind" not in summary[target.name]:
+                    cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
+                    validation_failed = True
+                    break
+
+            if validation_failed:
+                failed_cases.append(case.slug)
+                cons.unindent()
+                continue
+
+            # Add to results
+            results["cases"][case.slug] = {
+                "description": dataclasses.asdict(case),
+                "output_summary": summary,
+            }
+            cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
+
+        except Exception as e:
+            cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
+            failed_cases.append(case.slug)
+
+        cons.unindent()
+
+    # Report results
+    if failed_cases:
+        cons.print()
+        cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
+        for slug in failed_cases:
+            cons.print(f" - {slug}")
+        cons.print()
+        raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")

+    # Write output
    file_dump_yaml(ARG("output"), results)

    cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
@@ -140,8 +208,9 @@ def diff():
                if grind_time_value < 0.95:
                    cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
                    err = 1
-            except Exception as _:
-                pass
+            except Exception as e:
+                cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}")
+                err = 1

        table.add_row(f"[magenta]{slug}[/magenta]", *speedups)
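The new validation pass in `bench()` expects each per-case summary YAML to expose an `exec` time for every built target and, for the `simulation` target, a `grind` time as well. A sketch of the shape being checked and a standalone version of the check; the field names come from the diff above, while the target names and timing values are illustrative.

# Sketch only: the summary structure the new checks assume.
summary = {
    "pre_process": {"exec": 1.9},
    "simulation":  {"exec": 84.2, "grind": 3.1e-7},
}

def validate(summary: dict, target_names: list) -> list:
    """Return human-readable problems; an empty list means the summary is usable."""
    problems = []
    for name in target_names:
        if name not in summary:
            problems.append(f"target {name} missing from summary")
        elif "exec" not in summary[name]:
            problems.append(f"'exec' time missing for {name}")
        elif name == "simulation" and "grind" not in summary[name]:
            problems.append("'grind' time missing for simulation")
    return problems

assert validate(summary, ["pre_process", "simulation"]) == []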