65 changes: 65 additions & 0 deletions .github/scripts/monitor_slurm_job.sh
@@ -0,0 +1,65 @@
#!/bin/bash
# Monitor a SLURM job and stream its output in real-time
# Usage: monitor_slurm_job.sh <job_id> <output_file>

set -e

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>"
    exit 1
fi

job_id="$1"
output_file="$2"

echo "Submitted batch job $job_id"
echo "Monitoring output file: $output_file"

# Wait for the file to appear (check job status if it takes a while)
echo "Waiting for job to start..."
while [ ! -f "$output_file" ]; do
    # Check if the job failed to start
    if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
        echo "ERROR: Job $job_id finished without creating output file"
        exit 1
    fi
    sleep 5
done

echo "=== Streaming output for job $job_id ==="
# Stream output while the job runs
tail -f "$output_file" &
tail_pid=$!

# Wait for the job to complete, with retry logic for transient squeue failures
squeue_failures=0
while true; do
    if squeue -j "$job_id" &>/dev/null; then
        squeue_failures=0
    else
        squeue_failures=$((squeue_failures + 1))
        # Allow a few transient failures before concluding the job is done
        if [ $squeue_failures -ge 3 ]; then
            break
        fi
    fi
    sleep 5
done

# Stop tailing
kill $tail_pid 2>/dev/null || true

echo ""
echo "=== Final output ==="
cat "$output_file"

# Check exit status
exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0")
if [ "$exit_code" != "0:0" ]; then
    echo "ERROR: Job $job_id failed with exit code $exit_code"
    exit 1
fi

echo "Job $job_id completed successfully"
exit 0
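For reference, a minimal sketch of how the new script is meant to be driven. The job script name is hypothetical; `sbatch --parsable` prints only the job id, and SLURM writes stdout to `slurm-<jobid>.out` by default unless `-o/--output` overrides it:

```bash
# Submit a batch script and capture its job id (script name is illustrative)
job_id=$(sbatch --parsable my_bench_job.sh)

# Stream the job's output; a non-zero ExitCode makes the CI step fail
bash .github/scripts/monitor_slurm_job.sh "$job_id" "slurm-${job_id}.out"
```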

49 changes: 44 additions & 5 deletions .github/workflows/bench.yml
@@ -23,7 +23,7 @@ jobs:
      filters: ".github/file-filter.yml"

  self:
-    name: "${{ matrix.name }} (${{ matrix.device }})"
+    name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
    needs: file-changes
    strategy:
@@ -73,7 +73,7 @@ jobs:
    runs-on:
      group: ${{ matrix.group }}
      labels: ${{ matrix.labels }}
-    timeout-minutes: 1400
+    timeout-minutes: 480
    env:
      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
@@ -99,9 +99,48 @@

      - name: Bench (Master v. PR)
        run: |
-          (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-          (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-          wait %1 && wait %2
+          set -e
+
+          # Function to submit and monitor using the extracted script
+          submit_and_monitor() {
+            local dir=$1
+            local device=$2
+            local interface=$3
+            local cluster=$4
+
+            cd "$dir"
+
+            # Submit job
+            submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
+              .github/workflows/$cluster/bench.sh $device $interface 2>&1)
+
+            job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "")
+            job_slug="bench-$device-$interface"
+            output_file="${job_slug}.out"
+
+            if [ -z "$job_id" ]; then
+              echo "ERROR: Failed to submit job"
+              echo "$submit_output"
+              return 1
+            fi
+
+            # Use the monitoring script
+            bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
+          }
+
+          # Run both jobs with monitoring
+          (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
+          pr_pid=$!
+
+          (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
+          master_pid=$!
+
+          wait $pr_pid && pr_exit=$? || pr_exit=$?
+          wait $master_pid && master_exit=$? || master_exit=$?
+
+          if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then
+            exit 1
+          fi

      - name: Generate & Post Comment
        run: |
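A note on the `wait` lines above: with `set -e`, a bare `wait "$pid"` on a failed background job would abort the step before the second job's status is collected. Wrapping `wait` in a `&& ... || ...` list records the exit code in both the success and failure paths without tripping `set -e`. A standalone sketch of the idiom, with subshells standing in for the two submitted jobs:

```bash
set -e

(exit 3) & pr_pid=$!       # stand-in for the pr benchmark job
(exit 0) & master_pid=$!   # stand-in for the master benchmark job

# Neither line aborts the script: `wait` runs inside a && / || list,
# so set -e does not fire, and $? is captured either way.
wait $pr_pid && pr_exit=$? || pr_exit=$?
wait $master_pid && master_exit=$? || master_exit=$?

echo "pr=$pr_exit master=$master_exit"   # prints: pr=3 master=0
```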
35 changes: 26 additions & 9 deletions .github/workflows/test.yml
@@ -93,23 +93,40 @@ jobs:
      OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }}

  self:
-    name: Self Hosted
+    name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
    if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
    needs: file-changes
    continue-on-error: false
-    timeout-minutes: 1400
+    timeout-minutes: 480
    strategy:
      matrix:
-        device: ['gpu']
-        interface: ['acc', 'omp']
-        lbl: ['gt', 'frontier']
        include:
-          - device: 'cpu'
+          # Phoenix (GT)
+          - lbl: 'gt'
+            cluster_name: 'Georgia Tech | Phoenix'
+            device: 'gpu'
+            interface: 'acc'
+          - lbl: 'gt'
+            cluster_name: 'Georgia Tech | Phoenix'
+            device: 'gpu'
+            interface: 'omp'
+          - lbl: 'gt'
+            cluster_name: 'Georgia Tech | Phoenix'
+            device: 'cpu'
            interface: 'none'
-            lbl: 'gt'
-          - device: 'cpu'
+          # Frontier (ORNL)
+          - lbl: 'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'acc'
+          - lbl: 'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'omp'
+          - lbl: 'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'cpu'
            interface: 'none'
-            lbl: 'frontier'
    runs-on:
      group: phoenix
      labels: ${{ matrix.lbl }}
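The renamed jobs rely on the `&&`/`||` ternary emulation available in GitHub Actions expressions: `matrix.interface != 'none' && format('-{0}', matrix.interface) || ''` appends `-acc` or `-omp` to the device only when an interface is set. A rough bash analogue of the same naming logic, with illustrative values:

```bash
cluster_name="Oak Ridge | Frontier"   # hypothetical matrix values
device="gpu"
interface="acc"                       # 'acc', 'omp', or 'none'

suffix=""
if [ "$interface" != "none" ]; then
    suffix="-$interface"
fi
echo "$cluster_name ($device$suffix)"   # -> Oak Ridge | Frontier (gpu-acc)
```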
104 changes: 87 additions & 17 deletions toolchain/mfc/bench.py
@@ -1,4 +1,4 @@
-import os, sys, uuid, subprocess, dataclasses, typing, math
+import os, sys, uuid, subprocess, dataclasses, typing, math, traceback

import rich.table

@@ -16,6 +16,7 @@ class BenchCase:
    path: str
    args: typing.List[str]

+# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def bench(targets = None):
    if targets is None:
        targets = ARG("targets")
@@ -36,6 +37,10 @@ def bench(targets = None):
        case.args = case.args + ARG("--")
        case.path = os.path.abspath(case.path)

+        # Validate case file exists early
+        if not os.path.exists(case.path):
+            raise MFCException(f"Benchmark case file not found: {case.path}")
+
    results = {
        "metadata": {
            "invocation": sys.argv[1:],
@@ -44,6 +49,8 @@
        "cases": {},
    }

+    failed_cases = []
+
    for i, case in enumerate(CASES):
        summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
        log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
@@ -54,21 +61,80 @@
        cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]")
        cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")

-        with open(log_filepath, "w") as log_file:
-            system(
-                ["./mfc.sh", "run", case.path, "--case-optimization"] +
-                ["--targets"] + [t.name for t in targets] +
-                ["--output-summary", summary_filepath] +
-                case.args +
-                ["--", "--gbpp", ARG('mem')],
-                stdout=log_file,
-                stderr=subprocess.STDOUT)
-
-        results["cases"][case.slug] = {
-            "description": dataclasses.asdict(case),
-            "output_summary": file_load_yaml(summary_filepath),
-        }
+        try:
+            with open(log_filepath, "w") as log_file:
+                result = system(
+                    ["./mfc.sh", "run", case.path, "--case-optimization"] +
+                    ["--targets"] + [t.name for t in targets] +
+                    ["--output-summary", summary_filepath] +
+                    case.args +
+                    ["--", "--gbpp", str(ARG('mem'))],
+                    stdout=log_file,
+                    stderr=subprocess.STDOUT)
+
+            # Check return code (handle CompletedProcess or int defensively)
+            rc = result.returncode if hasattr(result, "returncode") else result
+            if rc != 0:
+                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
+                cons.print(f"[bold red]  Check log at: {log_filepath}[/bold red]")
+                failed_cases.append(case.slug)
+                continue
+
+            # Validate summary file exists
+            if not os.path.exists(summary_filepath):
+                cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
+                cons.print(f"[bold red]  Expected: {summary_filepath}[/bold red]")
+                failed_cases.append(case.slug)
+                continue
+
+            # Load summary
+            summary = file_load_yaml(summary_filepath)
+
+            # Validate all targets have required data
+            validation_failed = False
+            for target in targets:
+                if target.name not in summary:
+                    cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
+                    validation_failed = True
+                    break
+
+                if "exec" not in summary[target.name]:
+                    cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
+                    validation_failed = True
+                    break
+
+                if target.name == "simulation" and "grind" not in summary[target.name]:
+                    cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
+                    validation_failed = True
+                    break
+
+            if validation_failed:
+                failed_cases.append(case.slug)
+                continue
+
+            # Add to results
+            results["cases"][case.slug] = {
+                "description": dataclasses.asdict(case),
+                "output_summary": summary,
+            }
+            cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
+
+        except Exception as e:
+            cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
+            failed_cases.append(case.slug)
+        finally:
+            cons.unindent()
+
+    # Report results
+    if failed_cases:
+        cons.print()
+        cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
+        for slug in failed_cases:
+            cons.print(f"  - {slug}")
+        cons.print()
+        raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
+
+    # Write output
    file_dump_yaml(ARG("output"), results)

    cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
@@ -140,8 +206,12 @@ def diff():
                if grind_time_value <0.95:
                    cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
                    err = 1
-            except Exception as _:
-                pass
+            except Exception as e:
+                cons.print(
+                    f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n"
+                    f"{traceback.format_exc()}"
+                )
+                err = 1

        table.add_row(f"[magenta]{slug}[/magenta]", *speedups)