Add debug logging to test evaluation scripts

karangattu · karangattu · commit 06b37069d70d · 2025-08-22T10:26:17.000-07:00
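Introduced additional print statements in prepare_comment.py and average_results.py to aid in debugging file discovery: the scripts now print the paths they search and, when an expected file or directory is missing, the contents of the directory that was searched. Also updated run-test-evaluation.sh to initialize the results directory once, before the attempt loop, instead of deleting and recreating it at the start of every attempt; previously each new attempt's `rm -rf results/` removed the results/attempts/attempt_N/ directories created by earlier attempts.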
diff --git a/tests/inspect-ai/scripts/prepare_comment.py b/tests/inspect-ai/scripts/prepare_comment.py
@@ -28,9 +28,15 @@ def prepare_comment(summary_path: Union[str, Path]) -> int:
         # Try to read the pytest averaged summary
         pytest_results = None
         pytest_summary_path = summary_path.parent / "pytest_summary.json"
+        print(f"Looking for pytest summary at: {pytest_summary_path}")
         if pytest_summary_path.exists():
             with open(pytest_summary_path, "r") as f:
                 pytest_results = json.load(f)
+            print(f"Found pytest results: {pytest_results}")
+        else:
+            print(
+                f"Pytest summary not found. Directory contents: {list(summary_path.parent.iterdir())}"
+            )
 
         # Try to read the combined summary for overall gate status
         combined_results = None
diff --git a/tests/inspect-ai/scripts/run-test-evaluation.sh b/tests/inspect-ai/scripts/run-test-evaluation.sh
@@ -24,11 +24,13 @@ cleanup_processes() {
 
 trap cleanup_processes EXIT
 
+# Initialize results directory structure once
+rm -rf results/
+mkdir -p results/
+
 for i in $(seq 1 "$ATTEMPTS"); do
   log_with_timestamp "Starting attempt $i of $ATTEMPTS"
 
-  rm -rf results/
-  mkdir -p results/
   mkdir -p results/attempts/attempt_$i/
   rm -f test-results.xml
 
diff --git a/tests/inspect-ai/utils/scripts/average_results.py b/tests/inspect-ai/utils/scripts/average_results.py
@@ -32,9 +32,15 @@ def process_inspect_ai_results(attempts_dir: Path) -> Dict[str, Any]:
 
     if not attempt_dirs:
         print("No attempt directories found")
+        print(f"Looking in: {attempts_dir}")
+        print(
+            f"Directory contents: {list(attempts_dir.iterdir()) if attempts_dir.exists() else 'Directory does not exist'}"
+        )
         return {}
 
-    print(f"Found {len(attempt_dirs)} attempts to average")
+    print(
+        f"Found {len(attempt_dirs)} attempts to average: {[d.name for d in attempt_dirs]}"
+    )
 
     all_summaries: List[Dict[str, Union[int, float, bool]]] = []
 
@@ -146,14 +152,22 @@ def process_pytest_results(attempts_dir: Path) -> Dict[str, Any]:
 
     if not attempt_dirs:
         print("No attempt directories found for pytest results")
+        print(f"Looking in: {attempts_dir}")
+        print(
+            f"Directory contents: {list(attempts_dir.iterdir()) if attempts_dir.exists() else 'Directory does not exist'}"
+        )
         return {}
 
     all_pytest_summaries: List[Dict[str, Union[int, float]]] = []
 
     for attempt_dir in attempt_dirs:
         xml_file = attempt_dir / "test-results.xml"
+        print(f"Looking for XML file: {xml_file}")
         if not xml_file.exists():
             print(f"Warning: No test-results.xml found in {attempt_dir}")
+            print(
+                f"Directory contents: {list(attempt_dir.iterdir()) if attempt_dir.exists() else 'Directory does not exist'}"
+            )
             continue
 
         try: