Better monitoring: heartbeat and status logging for SLURM benchmark jobs

sbryngelson · sbryngelson · commit 87fcd74b6222 · 2025-12-08T16:12:31.000-05:00
diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
@@ -50,15 +50,24 @@ while [ ! -f "$output_file" ]; do
 done
 
 echo "=== Streaming output for job $job_id ==="
-# Stream output while job runs
-tail -f "$output_file" &
+# Stream output while the job runs (2>&1 sends tail's own error messages to stdout as well)
+tail -f "$output_file" 2>&1 &
 tail_pid=$!
 
+# Give tail a moment to start and show initial output
+sleep 2
+
 # Wait for job to complete with retry logic for transient squeue failures
 squeue_failures=0
+heartbeat_counter=0
 while true; do
   if squeue -j "$job_id" &>/dev/null; then
     squeue_failures=0
+    # Print heartbeat every 60 seconds (12 iterations * 5 sec)
+    heartbeat_counter=$((heartbeat_counter + 1))
+    if [ $((heartbeat_counter % 12)) -eq 0 ]; then
+      echo "[$(date +%H:%M:%S)] Job $job_id still running..."
+    fi
   else
     squeue_failures=$((squeue_failures + 1))
     # Check if job actually completed using sacct (if available)
@@ -68,6 +77,7 @@ while true; do
         # Consider job done only if it reached a terminal state
         case "$state" in
           COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
+            echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
             break
             ;;
           *)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -108,6 +108,7 @@ jobs:
             local interface=$3
             local cluster=$4
             
+            echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
             cd "$dir"
             
             # Submit job
@@ -119,38 +120,61 @@ jobs:
             output_file="${job_slug}.out"
             
             if [ -z "$job_id" ]; then
-              echo "ERROR: Failed to submit job"
+              echo "[$dir] ERROR: Failed to submit job"
               echo "$submit_output"
               return 1
             fi
             
+            echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
+            
             # Use the monitoring script
             bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
+            
+            echo "[$dir] Monitoring complete for job $job_id"
           }
           
           # Run both jobs with monitoring
+          echo "=========================================="
+          echo "Starting parallel benchmark jobs..."
+          echo "=========================================="
+          
           (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
           pr_pid=$!
+          echo "PR job started in background (PID: $pr_pid)"
           
           (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
           master_pid=$!
+          echo "Master job started in background (PID: $master_pid)"
+          
+          echo "Waiting for both jobs to complete..."
           
           # Wait and capture exit codes reliably
           pr_exit=0
           master_exit=0
 
           if ! wait "$pr_pid"; then
             pr_exit=$?
+            echo "PR job exited with code: $pr_exit"
+          else
+            echo "PR job completed successfully"
           fi
+          
           if ! wait "$master_pid"; then
             master_exit=$?
+            echo "Master job exited with code: $master_exit"
+          else
+            echo "Master job completed successfully"
           fi
 
           # Explicitly check and quote to avoid test errors
           if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
-            echo "One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
+            echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
             exit 1
           fi
+          
+          echo "=========================================="
+          echo "Both benchmark jobs completed successfully!"
+          echo "=========================================="
 
       - name: Generate & Post Comment
         run: |
diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py
@@ -29,118 +29,121 @@ def bench(targets = None):
     cons.print()
     cons.print(f"[bold]Benchmarking {format_list_to_string(ARG('targets'), 'magenta')} ([magenta]{os.path.relpath(bench_dirpath)}[/magenta]):[/bold]")
     cons.indent()
-    cons.print()
 
-    CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ]
+    try:
+        cons.print()
 
-    for case in CASES:
-        case.args = case.args + ARG("--")
-        case.path = os.path.abspath(case.path)
+        CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ]
 
-        # Validate case file exists early
-        if not os.path.exists(case.path):
-            raise MFCException(f"Benchmark case file not found: {case.path}")
+        for case in CASES:
+            case.args = case.args + ARG("--")
+            case.path = os.path.abspath(case.path)
 
-    results = {
-        "metadata": {
-            "invocation": sys.argv[1:],
-            "lock":       dataclasses.asdict(CFG())
-        },
-        "cases": {},
-    }
+            # Validate case file exists early
+            if not os.path.exists(case.path):
+                raise MFCException(f"Benchmark case file not found: {case.path}")
 
-    failed_cases = []
+        results = {
+            "metadata": {
+                "invocation": sys.argv[1:],
+                "lock":       dataclasses.asdict(CFG())
+            },
+            "cases": {},
+        }
 
-    for i, case in enumerate(CASES):
-        summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
-        log_filepath     = os.path.join(bench_dirpath, f"{case.slug}.out")
+        failed_cases = []
 
-        cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]")
-        cons.indent()
-        cons.print()
-        cons.print(f"> Log:     [bold]{os.path.relpath(log_filepath)}[/bold]")
-        cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
-
-        try:
-            with open(log_filepath, "w") as log_file:
-                result = system(
-                    ["./mfc.sh", "run", case.path, "--case-optimization"] +
-                    ["--targets"] + [t.name for t in targets] +
-                    ["--output-summary", summary_filepath] +
-                    case.args +
-                    ["--", "--gbpp", str(ARG('mem'))],
-                    stdout=log_file,
-                    stderr=subprocess.STDOUT)
-
-            # Check return code (handle CompletedProcess or int defensively)
-            rc = result.returncode if hasattr(result, "returncode") else result
-            if rc != 0:
-                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
-                cons.print(f"[bold red]      Check log at: {log_filepath}[/bold red]")
-                failed_cases.append(case.slug)
-                continue
+        for i, case in enumerate(CASES):
+            summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
+            log_filepath     = os.path.join(bench_dirpath, f"{case.slug}.out")
 
-            # Validate summary file exists
-            if not os.path.exists(summary_filepath):
-                cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
-                cons.print(f"[bold red]      Expected: {summary_filepath}[/bold red]")
-                failed_cases.append(case.slug)
-                continue
-
-            # Load summary
-            summary = file_load_yaml(summary_filepath)
-
-            # Validate all targets have required data
-            validation_failed = False
-            for target in targets:
-                if target.name not in summary:
-                    cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
-                    validation_failed = True
-                    break
-
-                if "exec" not in summary[target.name]:
-                    cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
-                    validation_failed = True
-                    break
-
-                if target.name == "simulation" and "grind" not in summary[target.name]:
-                    cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
-                    validation_failed = True
-                    break
-
-            if validation_failed:
+            cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]")
+            cons.indent()
+            cons.print()
+            cons.print(f"> Log:     [bold]{os.path.relpath(log_filepath)}[/bold]")
+            cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
+
+            try:
+                with open(log_filepath, "w") as log_file:
+                    result = system(
+                        ["./mfc.sh", "run", case.path, "--case-optimization"] +
+                        ["--targets"] + [t.name for t in targets] +
+                        ["--output-summary", summary_filepath] +
+                        case.args +
+                        ["--", "--gbpp", str(ARG('mem'))],
+                        stdout=log_file,
+                        stderr=subprocess.STDOUT)
+
+                # Check return code (handle CompletedProcess or int defensively)
+                rc = result.returncode if hasattr(result, "returncode") else result
+                if rc != 0:
+                    cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
+                    cons.print(f"[bold red]      Check log at: {log_filepath}[/bold red]")
+                    failed_cases.append(case.slug)
+                    continue
+
+                # Validate summary file exists
+                if not os.path.exists(summary_filepath):
+                    cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
+                    cons.print(f"[bold red]      Expected: {summary_filepath}[/bold red]")
+                    failed_cases.append(case.slug)
+                    continue
+
+                # Load summary
+                summary = file_load_yaml(summary_filepath)
+
+                # Validate all targets have required data
+                validation_failed = False
+                for target in targets:
+                    if target.name not in summary:
+                        cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
+                        validation_failed = True
+                        break
+
+                    if "exec" not in summary[target.name]:
+                        cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
+                        validation_failed = True
+                        break
+
+                    if target.name == "simulation" and "grind" not in summary[target.name]:
+                        cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
+                        validation_failed = True
+                        break
+
+                if validation_failed:
+                    failed_cases.append(case.slug)
+                    continue
+
+                # Add to results
+                results["cases"][case.slug] = {
+                    "description":    dataclasses.asdict(case),
+                    "output_summary": summary,
+                }
+                cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
+
+            except Exception as e:
+                cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
+                cons.print(f"[dim]{traceback.format_exc()}[/dim]")
                 failed_cases.append(case.slug)
-                continue
-
-            # Add to results
-            results["cases"][case.slug] = {
-                "description":    dataclasses.asdict(case),
-                "output_summary": summary,
-            }
-            cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
-
-        except Exception as e:
-            cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
-            cons.print(f"[dim]{traceback.format_exc()}[/dim]")
-            failed_cases.append(case.slug)
-        finally:
-            cons.unindent()
-
-    # Report results
-    if failed_cases:
-        cons.print()
-        cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
-        for slug in failed_cases:
-            cons.print(f"  - {slug}")
-        cons.print()
-        raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
+            finally:
+                cons.unindent()
+
+        # Report results
+        if failed_cases:
+            cons.print()
+            cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
+            for slug in failed_cases:
+                cons.print(f"  - {slug}")
+            cons.print()
+            raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
 
-    # Write output
-    file_dump_yaml(ARG("output"), results)
+        # Write output
+        file_dump_yaml(ARG("output"), results)
 
-    cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
+        cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
 
-    cons.unindent()
+    finally:
+        cons.unindent()
 
 
 # TODO: This function is too long and not nicely written at all. Someone should