
Commit f79f4d1

more fixing!
1 parent 0668fed commit f79f4d1

File tree

.github/scripts/monitor_slurm_job.sh
.github/workflows/bench.yml
toolchain/mfc/bench.py
toolchain/mfc/test/test.py

4 files changed: +136 -74 lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Monitor a SLURM job and stream its output in real-time
+# Usage: monitor_slurm_job.sh <job_id> <output_file>
+
+set -e
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <job_id> <output_file>"
+    exit 1
+fi
+
+job_id="$1"
+output_file="$2"
+
+echo "Submitted batch job $job_id"
+echo "Monitoring output file: $output_file"
+
+# Wait for file to appear (check job status if it takes a while)
+echo "Waiting for job to start..."
+while [ ! -f "$output_file" ]; do
+    # Check if job failed to start
+    if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
+        echo "ERROR: Job $job_id finished without creating output file"
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "=== Streaming output for job $job_id ==="
+# Stream output while job runs
+tail -f "$output_file" &
+tail_pid=$!
+
+# Wait for job to complete with retry logic for transient squeue failures
+squeue_failures=0
+while true; do
+    if squeue -j "$job_id" &>/dev/null; then
+        squeue_failures=0
+    else
+        squeue_failures=$((squeue_failures + 1))
+        # Allow a few transient failures before concluding job is done
+        if [ $squeue_failures -ge 3 ]; then
+            break
+        fi
+    fi
+    sleep 5
+done
+
+# Stop tailing
+kill $tail_pid 2>/dev/null || true
+
+echo ""
+echo "=== Final output ==="
+cat "$output_file"
+
+# Check exit status
+exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0")
+if [ "$exit_code" != "0:0" ]; then
+    echo "ERROR: Job $job_id failed with exit code $exit_code"
+    exit 1
+fi
+
+echo "Job $job_id completed successfully"
+exit 0
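
For local testing, the script can also be invoked by hand against an already-submitted job, e.g. bash .github/scripts/monitor_slurm_job.sh <job_id> <output_file>, with the placeholders matching the Usage line at the top of the script. It exits non-zero if the job never produces its output file or if scontrol reports a non-zero ExitCode, so callers such as the workflow below can rely on its exit status.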

.github/workflows/bench.yml

Lines changed: 3 additions & 38 deletions
@@ -101,7 +101,7 @@ jobs:
         run: |
           set -e
 
-          # Function to submit and monitor
+          # Function to submit and monitor using extracted script
           submit_and_monitor() {
             local dir=$1
             local device=$2
@@ -124,43 +124,8 @@ jobs:
              return 1
            fi
 
-            echo "Submitted batch job $job_id"
-            echo "Monitoring output file: $output_file"
-
-            # Wait for file to appear (check job status if it takes a while)
-            echo "Waiting for job to start..."
-            while [ ! -f "$output_file" ]; do
-              # Check if job failed to start
-              if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
-                echo "ERROR: Job $job_id finished without creating output file"
-                return 1
-              fi
-              sleep 5
-            done
-
-            echo "=== Streaming output for job $job_id ==="
-            # Stream output while job runs
-            tail -f "$output_file" &
-            tail_pid=$!
-
-            # Wait for job to complete (will wait up to GitHub Actions timeout)
-            while squeue -j "$job_id" &>/dev/null; do
-              sleep 5
-            done
-
-            # Stop tailing
-            kill $tail_pid 2>/dev/null || true
-
-            echo ""
-            echo "=== Final output ==="
-            cat "$output_file"
-
-            # Check exit status
-            exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0")
-            if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then
-              echo "ERROR: Job $job_id failed with exit code $exit_code"
-              return 1
-            fi
+            # Use the monitoring script
+            bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
           }
 
           # Run both jobs with monitoring

toolchain/mfc/bench.py

Lines changed: 12 additions & 8 deletions
@@ -68,13 +68,14 @@ def bench(targets = None):
                 ["--targets"] + [t.name for t in targets] +
                 ["--output-summary", summary_filepath] +
                 case.args +
-                ["--", "--gbpp", ARG('mem')],
+                ["--", "--gbpp", str(ARG('mem'))],
                 stdout=log_file,
                 stderr=subprocess.STDOUT)
 
-            # Check return code
-            if result.returncode != 0:
-                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}")
+            # Check return code (handle CompletedProcess or int defensively)
+            rc = result.returncode if hasattr(result, "returncode") else result
+            if rc != 0:
+                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
                 cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
                 failed_cases.append(case.slug)
                 cons.unindent()
@@ -111,7 +112,6 @@ def bench(targets = None):
 
             if validation_failed:
                 failed_cases.append(case.slug)
-                cons.unindent()
                 continue
 
             # Add to results
@@ -124,8 +124,8 @@ def bench(targets = None):
         except Exception as e:
             cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
             failed_cases.append(case.slug)
-
-        cons.unindent()
+        finally:
+            cons.unindent()
 
     # Report results
     if failed_cases:
@@ -209,7 +209,11 @@ def diff():
                 cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
                 err = 1
             except Exception as e:
-                cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}")
+                import traceback
+                cons.print(
+                    f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n"
+                    f"{traceback.format_exc()}"
+                )
                 err = 1
 
         table.add_row(f"[magenta]{slug}[/magenta]", *speedups)
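
The return-code check above is written so that it works whether the helper producing result returns a subprocess.CompletedProcess or a bare integer. A minimal standalone sketch of that defensive pattern (the exit_code name below is illustrative and not part of bench.py):

import subprocess

def exit_code(result):
    # Accept either a CompletedProcess-like object or a plain integer return code.
    return result.returncode if hasattr(result, "returncode") else result

print(exit_code(subprocess.run(["true"], check=False)))  # prints 0
print(exit_code(1))                                      # prints 1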

toolchain/mfc/test/test.py

Lines changed: 56 additions & 28 deletions
@@ -1,4 +1,4 @@
-import os, typing, shutil, time, itertools, signal
+import os, typing, shutil, time, itertools, threading
 from random import sample, seed
 
 import rich, rich.table
@@ -30,12 +30,15 @@
 # Per-test timeout (1 hour)
 TEST_TIMEOUT_SECONDS = 3600
 
+# Global abort flag for thread-safe early termination
+# This flag is set when the failure rate exceeds the threshold, signaling
+# all worker threads to exit gracefully. This avoids raising exceptions
+# from worker threads which could leave the scheduler in an undefined state.
+abort_tests = threading.Event()
+
 class TestTimeoutError(MFCException):
     pass
 
-def timeout_handler(signum, frame):
-    raise TestTimeoutError("Test case exceeded 1 hour timeout")
-
 # pylint: disable=too-many-branches, trailing-whitespace
 def __filter(cases_) -> typing.List[TestCase]:
     cases = cases_[:]
@@ -173,6 +176,15 @@ def test():
         [ sched.Task(ppn=case.ppn, func=handle_case, args=[case], load=case.get_cell_count()) for case in cases ],
         ARG("jobs"), ARG("gpus"))
 
+    # Check if we aborted due to high failure rate
+    if abort_tests.is_set():
+        total_completed = nFAIL + nPASS
+        cons.print()
+        cons.unindent()
+        raise MFCException(
+            f"Excessive test failures: {nFAIL}/{total_completed} failed ({nFAIL/total_completed*100:.1f}%)"
+        )
+
     nSKIP = len(skipped_cases)
     cons.print()
     cons.unindent()
@@ -199,22 +211,32 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
     global current_test_number
     start_time = time.time()
 
-    # Set timeout alarm
-    signal.signal(signal.SIGALRM, timeout_handler)
-    signal.alarm(TEST_TIMEOUT_SECONDS)
+    # Set timeout using threading.Timer (works in worker threads)
+    # Note: signal.alarm() only works in the main thread, so we use
+    # threading.Timer which works correctly in worker threads spawned by sched.sched
+    timeout_flag = threading.Event()
+    timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set)
+    timeout_timer.start()
 
     tol = case.compute_tolerance()
    case.delete_output()
    case.create_directory()
 
     if ARG("dry_run"):
         cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
-        signal.alarm(0) # Cancel alarm
+        timeout_timer.cancel()
         return
 
     try:
+        # Check timeout before starting
+        if timeout_flag.is_set():
+            raise TestTimeoutError("Test case exceeded 1 hour timeout")
         cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
 
+        # Check timeout after simulation
+        if timeout_flag.is_set():
+            raise TestTimeoutError("Test case exceeded 1 hour timeout")
+
         out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
 
         common.file_write(out_filepath, cmd.stdout)
@@ -261,26 +283,28 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
         out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
         common.file_write(out_filepath, cmd.stdout)
 
-        for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')):
-            silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath)
-            h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump"
+        silo_dir = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')
+        if os.path.isdir(silo_dir):
+            for silo_filename in os.listdir(silo_dir):
+                silo_filepath = os.path.join(silo_dir, silo_filename)
+                h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump"
 
-            if not os.path.exists(h5dump or ""):
-                if not does_command_exist("h5dump"):
-                    raise MFCException("h5dump couldn't be found.")
+                if not os.path.exists(h5dump or ""):
+                    if not does_command_exist("h5dump"):
+                        raise MFCException("h5dump couldn't be found.")
 
-                h5dump = shutil.which("h5dump")
+                    h5dump = shutil.which("h5dump")
 
-            output, err = get_program_output([h5dump, silo_filepath])
+                output, err = get_program_output([h5dump, silo_filepath])
 
-            if err != 0:
-                raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
+                if err != 0:
+                    raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
 
-            if "nan," in output:
-                raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
+                if "nan," in output:
+                    raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
 
-            if "inf," in output:
-                raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
+                if "inf," in output:
+                    raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
 
         case.delete_output()
 
@@ -298,14 +322,18 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
             f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}"
         ) from exc
     finally:
-        signal.alarm(0) # Cancel alarm
+        timeout_timer.cancel() # Cancel timeout timer
 
 
 def handle_case(case: TestCase, devices: typing.Set[int]):
     # pylint: disable=global-statement, global-variable-not-assigned
     global nFAIL, nPASS, nSKIP
     global errors
 
+    # Check if we should abort before processing this case
+    if abort_tests.is_set():
+        return # Exit gracefully if abort was requested
+
     nAttempts = 0
     if ARG('single'):
         max_attempts = max(ARG('max_attempts'), 3)
@@ -337,10 +365,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]):
         failure_rate = nFAIL / total_completed
         if failure_rate >= FAILURE_RATE_THRESHOLD:
             cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]")
-            cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]")
-            cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n")
-            raise MFCException(
-                f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)"
-            )
+            cons.print("[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]")
+            cons.print("[bold red]Aborting remaining tests to fail fast.[/bold red]\n")
+            # Set abort flag instead of raising exception from worker thread
+            abort_tests.set()
+            return # Exit gracefully
 
     return
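
For reference, a minimal self-contained sketch of the two threading patterns the diff relies on: a threading.Timer that merely sets a flag (signal.alarm() only works in the main thread), and a shared threading.Event that lets worker threads abort gracefully instead of raising. The run_with_timeout and worker helpers below are illustrative and not part of test.py:

import threading, time

TIMEOUT_SECONDS = 2              # illustrative value; test.py uses 3600
abort_tests = threading.Event()  # shared "stop early" flag

def run_with_timeout(work):
    # Arm a timer that only sets a flag; safe to use from worker threads.
    timed_out = threading.Event()
    timer = threading.Timer(TIMEOUT_SECONDS, timed_out.set)
    timer.start()
    try:
        work()
        if timed_out.is_set():
            raise TimeoutError("work exceeded the timeout")
    finally:
        timer.cancel()

def worker(work):
    # Workers check the shared flag and return quietly instead of raising,
    # so the scheduler's threads wind down in a well-defined state.
    if abort_tests.is_set():
        return
    try:
        run_with_timeout(work)
    except Exception:
        abort_tests.set()

threads = [threading.Thread(target=worker, args=(lambda: time.sleep(0.1),)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()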

0 commit comments
