"""
Script to average inspect-ai and pytest results across multiple attempts.

This script processes results from multiple attempts stored in separate directories
and creates averaged results maintaining the same structure as single-attempt results.
"""
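
# Illustrative usage (a sketch, not part of the script's behavior; the file names
# shown inside the attempt directories are assumptions -- the script picks up any
# *.json file for inspect-ai and a file literally named test-results.xml for pytest):
#
#   attempts/
#     attempt_1/
#       <eval-log>.json       # inspect-ai eval log
#       test-results.xml      # pytest JUnit XML report
#     attempt_2/
#       ...
#
#   python average_results.py attempts/ averaged/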

import json
import statistics
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any, Dict, List, Union


def process_inspect_ai_results(attempts_dir: Path) -> Dict[str, Any]:
    """
    Process and average inspect-ai results across multiple attempts.

    Args:
        attempts_dir: Directory containing attempt subdirectories

    Returns:
        Averaged summary dictionary with the same structure as a single attempt
    """
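    # The parsing below assumes an inspect-ai eval log shaped roughly like this
    # (a sketch showing only the fields that are read; everything else is ignored):
    #
    #   {
    #     "samples": [
    #       {"scores": {"model_graded_qa": {"value": "C"}}},  # C = complete
    #       {"scores": {"model_graded_qa": {"value": "P"}}},  # P = partial
    #       {"scores": {"model_graded_qa": {"value": "I"}}}   # I = incomplete
    #     ]
    #   }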
    # Collect attempt_<n> subdirectories and sort them numerically by <n>
    attempt_dirs = [
        d
        for d in attempts_dir.iterdir()
        if d.is_dir() and d.name.startswith("attempt_")
    ]
    attempt_dirs.sort(key=lambda x: int(x.name.split("_")[1]))

    if not attempt_dirs:
        print("No attempt directories found")
        return {}

    print(f"Found {len(attempt_dirs)} attempts to average")

    all_summaries: List[Dict[str, Union[int, float, bool]]] = []

    for attempt_dir in attempt_dirs:
        # Find the JSON result file in this attempt
        json_files = list(attempt_dir.glob("*.json"))
        if not json_files:
            print(f"Warning: No JSON files found in {attempt_dir}")
            continue

        # Use the first JSON file (should only be one)
        result_file = json_files[0]

        # Process this single result to get summary
        with open(result_file, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from {result_file}: {e}")
                continue

        samples = data.get("samples", [])
        total_tests = len(samples)

        if total_tests == 0:
            print(f"Warning: No samples found in {result_file}")
            continue

        # Count results by model_graded_qa grade:
        # "C" = complete, "P" = partial, "I" = incomplete
        passed_tests = sum(
            1
            for s in samples
            if s.get("scores", {}).get("model_graded_qa", {}).get("value") == "C"
        )
        partial_tests = sum(
            1
            for s in samples
            if s.get("scores", {}).get("model_graded_qa", {}).get("value") == "P"
        )
        failed_tests = sum(
            1
            for s in samples
            if s.get("scores", {}).get("model_graded_qa", {}).get("value") == "I"
        )

        passing_tests = passed_tests + partial_tests
        pass_rate = (passing_tests / total_tests) * 100 if total_tests > 0 else 0

        summary: Dict[str, Union[int, float, bool]] = {
            "total": total_tests,
            "passed": passed_tests,
            "partial": partial_tests,
            "failed": failed_tests,
            "pass_rate": pass_rate,
            "quality_gate_passed": pass_rate >= 80,
        }

        all_summaries.append(summary)
        print(
            f"Attempt {attempt_dir.name}: {passed_tests}C + {partial_tests}P + {failed_tests}I"
            f" = {passing_tests}/{total_tests} ({pass_rate:.1f}%)"
        )

    if not all_summaries:
        print("No valid summaries found to average")
        return {}

    # Calculate averages
    avg_summary: Dict[str, Union[int, float, bool, str]] = {
        "total": statistics.mean(float(s["total"]) for s in all_summaries),
        "passed": statistics.mean(float(s["passed"]) for s in all_summaries),
        "partial": statistics.mean(float(s["partial"]) for s in all_summaries),
        "failed": statistics.mean(float(s["failed"]) for s in all_summaries),
        "pass_rate": statistics.mean(float(s["pass_rate"]) for s in all_summaries),
    }

    # Round to reasonable precision
    avg_summary["total"] = round(float(avg_summary["total"]), 1)
    avg_summary["passed"] = round(float(avg_summary["passed"]), 1)
    avg_summary["partial"] = round(float(avg_summary["partial"]), 1)
    avg_summary["failed"] = round(float(avg_summary["failed"]), 1)
    avg_summary["pass_rate"] = round(float(avg_summary["pass_rate"]), 1)
    avg_summary["quality_gate_passed"] = avg_summary["pass_rate"] >= 80
    avg_summary["details"] = (
        f"Averaged across {len(all_summaries)} attempts: "
        f"Complete: {avg_summary['passed']}, Partial: {avg_summary['partial']}, "
        f"Incomplete: {avg_summary['failed']}, "
        f"Passing: {avg_summary['passed'] + avg_summary['partial']}/{avg_summary['total']}"
    )

    return avg_summary


def process_pytest_results(attempts_dir: Path) -> Dict[str, Any]:
    """
    Process and average pytest results across multiple attempts.

    Args:
        attempts_dir: Directory containing attempt subdirectories

    Returns:
        Averaged pytest summary dictionary
    """
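    # The parser below reads the aggregate counts from JUnit-style XML, presumably
    # produced with pytest's --junitxml option. A minimal sketch of the expected
    # test-results.xml (attribute values are illustrative only); counts may also sit
    # on <testsuite> children under a <testsuites> root, which is handled as well:
    #
    #   <testsuite tests="10" failures="1" errors="0" skipped="2">
    #     <testcase classname="..." name="..." time="..."/>
    #   </testsuite>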
    # Collect attempt_<n> subdirectories and sort them numerically by <n>
    attempt_dirs = [
        d
        for d in attempts_dir.iterdir()
        if d.is_dir() and d.name.startswith("attempt_")
    ]
    attempt_dirs.sort(key=lambda x: int(x.name.split("_")[1]))

    if not attempt_dirs:
        print("No attempt directories found for pytest results")
        return {}

    all_pytest_summaries: List[Dict[str, Union[int, float]]] = []

    for attempt_dir in attempt_dirs:
        xml_file = attempt_dir / "test-results.xml"
        if not xml_file.exists():
            print(f"Warning: No test-results.xml found in {attempt_dir}")
            continue

        try:
            tree = ET.parse(xml_file)
            root = tree.getroot()

            # Extract test metrics from the XML. Depending on the tool/version the
            # counts live either on a <testsuite> root or on <testsuite> children
            # under a <testsuites> root, so handle both layouts.
            suites = root.findall("testsuite") if root.tag == "testsuites" else [root]
            total_tests = sum(int(s.get("tests", 0)) for s in suites)
            failures = sum(int(s.get("failures", 0)) for s in suites)
            errors = sum(int(s.get("errors", 0)) for s in suites)
            skipped = sum(int(s.get("skipped", 0)) for s in suites)

            passed_tests = total_tests - failures - errors - skipped
            pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0

            pytest_summary: Dict[str, Union[int, float]] = {
                "total": total_tests,
                "passed": passed_tests,
                "failed": failures,
                "errors": errors,
                "skipped": skipped,
                "pass_rate": pass_rate,
            }

            all_pytest_summaries.append(pytest_summary)
            print(
                f"Attempt {attempt_dir.name} pytest: {passed_tests}/{total_tests} "
                f"passed ({pass_rate:.1f}%)"
            )

        except (ET.ParseError, ValueError) as e:
            print(f"Error parsing {xml_file}: {e}")
            continue

    if not all_pytest_summaries:
        print("No valid pytest summaries found to average")
        return {}

    # Calculate averages for pytest
    avg_pytest: Dict[str, Union[int, float, str]] = {
        "total": statistics.mean(float(s["total"]) for s in all_pytest_summaries),
        "passed": statistics.mean(float(s["passed"]) for s in all_pytest_summaries),
        "failed": statistics.mean(float(s["failed"]) for s in all_pytest_summaries),
        "errors": statistics.mean(float(s["errors"]) for s in all_pytest_summaries),
        "skipped": statistics.mean(float(s["skipped"]) for s in all_pytest_summaries),
        "pass_rate": statistics.mean(
            float(s["pass_rate"]) for s in all_pytest_summaries
        ),
    }

    # Round to reasonable precision (the "details" string is only added afterwards)
    for key in avg_pytest:
        avg_pytest[key] = round(float(avg_pytest[key]), 1)

    avg_pytest["details"] = (
        f"Averaged across {len(all_pytest_summaries)} attempts: "
        f"Passed: {avg_pytest['passed']}, Failed: {avg_pytest['failed']}, "
        f"Errors: {avg_pytest['errors']}, Skipped: {avg_pytest['skipped']} "
        f"({avg_pytest['pass_rate']:.1f}% pass rate)"
    )

    return avg_pytest


def main():
    """Main function to process and average results."""
    if len(sys.argv) != 3:
        print("Usage: python average_results.py <attempts_dir> <output_dir>")
        sys.exit(1)

    attempts_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2])

    if not attempts_dir.exists() or not attempts_dir.is_dir():
        print(f"Error: Attempts directory does not exist or is not a directory: {attempts_dir}")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Process inspect-ai results
    print("Processing inspect-ai results...")
    inspect_summary = process_inspect_ai_results(attempts_dir)

    if inspect_summary:
        summary_file = output_dir / "summary.json"
        with open(summary_file, "w", encoding="utf-8") as f:
            json.dump(inspect_summary, f, indent=2)
        print(f"Inspect-AI averaged summary saved to: {summary_file}")
        print(
            f"Averaged pass rate (Complete + Partial): {inspect_summary['pass_rate']:.1f}%"
        )
    else:
        print("No inspect-ai results to average")

    # Process pytest results
    print("\nProcessing pytest results...")
    pytest_summary = process_pytest_results(attempts_dir)

    if pytest_summary:
        pytest_summary_file = output_dir / "pytest_summary.json"
        with open(pytest_summary_file, "w", encoding="utf-8") as f:
            json.dump(pytest_summary, f, indent=2)
        print(f"Pytest averaged summary saved to: {pytest_summary_file}")
        print(f"Averaged pytest pass rate: {pytest_summary['pass_rate']:.1f}%")
    else:
        print("No pytest results to average")

    # Create a combined summary
    if inspect_summary or pytest_summary:
        combined_summary = {
            "inspect_ai": inspect_summary,
            "pytest": pytest_summary,
            "overall_quality_gate_passed": (
                (
                    inspect_summary.get("quality_gate_passed", False)
                    and pytest_summary.get("pass_rate", 0) >= 85  # 85% threshold for pytest
                )
                if inspect_summary and pytest_summary
                else False
            ),
        }

        combined_file = output_dir / "combined_summary.json"
        with open(combined_file, "w", encoding="utf-8") as f:
            json.dump(combined_summary, f, indent=2)
        print(f"Combined summary saved to: {combined_file}")


if __name__ == "__main__":
    main()
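
# Files written to <output_dir> (a sketch of the shapes produced by main() above):
#
#   summary.json          - averaged inspect-ai summary: total, passed, partial,
#                           failed, pass_rate, quality_gate_passed, details
#   pytest_summary.json   - averaged pytest summary: total, passed, failed, errors,
#                           skipped, pass_rate, details
#   combined_summary.json - {"inspect_ai": {...}, "pytest": {...},
#                            "overall_quality_gate_passed": true/false}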