Commit 4d44a1c

Average test results across multiple attempts
Introduces a script that averages inspect-ai and pytest results across multiple attempts, and updates the test evaluation workflow and bash script to store per-attempt results and generate summary files. The workflow now checks for the averaged summary, and the new Python script computes and saves averaged metrics for both the inspect-ai and pytest runs.
1 parent df65e84 commit 4d44a1c
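
For orientation, here is a minimal sketch (not part of the commit) of the artifacts this change produces: run-test-evaluation.sh stores one directory per attempt under results/attempts/, and average_results.py writes summary.json, pytest_summary.json, and combined_summary.json into results/. Paths and file names are taken from the diffs below; the inspection code itself is illustrative only.

import json
from pathlib import Path

results = Path("results")

# One directory per attempt, each holding an inspect-ai JSON log and a pytest
# junit XML report (see run-test-evaluation.sh below).
for attempt in sorted((results / "attempts").glob("attempt_*")):
    print(attempt.name, "->", sorted(p.name for p in attempt.iterdir()))

# Averaged summaries written by average_results.py (see the new script below).
for name in ("summary.json", "pytest_summary.json", "combined_summary.json"):
    path = results / name
    if path.exists():
        print(name, "->", sorted(json.loads(path.read_text())))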

File tree

3 files changed: +302 -11 lines

  .github/workflows/verify-test-generation-prompts.yaml
  tests/inspect-ai/scripts/run-test-evaluation.sh
  tests/inspect-ai/utils/scripts/average_results.py

.github/workflows/verify-test-generation-prompts.yaml

Lines changed: 6 additions & 7 deletions

@@ -56,21 +56,20 @@ jobs:
       with:
         name: test-results-${{ github.run_id }}
         path: |
-          test-results.xml
           results/
         retention-days: 7

     - name: Process Results
       timeout-minutes: 2
       run: |
-        # Find the latest evaluation result file and process it
-        latest_result=$(ls -t results/*.json | head -1)
-        if [ ! -f "$latest_result" ]; then
-          echo "No result files found in results/ directory"
+        # Results are already averaged by the bash script, just verify they exist
+        if [ ! -f "results/summary.json" ]; then
+          echo "No averaged summary found at results/summary.json"
+          ls -la results/
           exit 1
         else
-          echo "Processing results from: $latest_result"
-          python tests/inspect-ai/utils/scripts/process_results.py "$latest_result"
+          echo "Using averaged results from all attempts"
+          cat results/summary.json
         fi

     - name: Check Quality Gate
tests/inspect-ai/scripts/run-test-evaluation.sh

Lines changed: 8 additions & 4 deletions

@@ -29,14 +29,15 @@ for i in $(seq 1 "$ATTEMPTS"); do

   rm -rf results/
   mkdir -p results/
+  mkdir -p results/attempts/attempt_$i/
   rm -f test-results.xml

   log_with_timestamp "[Attempt $i] Creating test metadata..."
   python tests/inspect-ai/scripts/create_test_metadata.py

   log_with_timestamp "[Attempt $i] Running Inspect AI evaluation..."
   inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \
-    --log-dir results/ \
+    --log-dir results/attempts/attempt_$i/ \
     --log-format json

   log_with_timestamp "[Attempt $i] Running tests..."
@@ -47,7 +48,7 @@ for i in $(seq 1 "$ATTEMPTS"); do
     --tb=short \
     --disable-warnings \
     --maxfail="$PYTEST_MAXFAIL" \
-    --junit-xml=test-results.xml \
+    --junit-xml=results/attempts/attempt_$i/test-results.xml \
     --durations=10 \
     --timeout="$PYTEST_PER_TEST_TIMEOUT" \
     --timeout-method=signal \
@@ -61,8 +62,8 @@ for i in $(seq 1 "$ATTEMPTS"); do
   fi

   if [ "${test_exit_code:-0}" -ne 0 ]; then
-    if [ -f test-results.xml ]; then
-      failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
+    if [ -f results/attempts/attempt_$i/test-results.xml ]; then
+      failure_count=$(grep -o 'failures="[0-9]*"' results/attempts/attempt_$i/test-results.xml | grep -o '[0-9]*' || echo "0")
     else
       failure_count=0
     fi
@@ -78,3 +79,6 @@ for i in $(seq 1 "$ATTEMPTS"); do
 done

 log_with_timestamp "All $ATTEMPTS evaluation and test runs passed successfully."
+
+log_with_timestamp "Averaging results across all attempts..."
+python tests/inspect-ai/utils/scripts/average_results.py results/attempts/ results/
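
For reference, the failure count that the grep above extracts from the junit XML can also be read the way the new average_results.py does, with xml.etree. The sketch below is not part of the commit; the attempt_1 path is illustrative, and depending on the pytest junit family the counts may sit on a nested <testsuite> element rather than on the document root, hence the fallback.

import xml.etree.ElementTree as ET
from pathlib import Path

# Sketch: Python equivalent of the grep-based failure count for one attempt.
xml_file = Path("results/attempts/attempt_1/test-results.xml")
if xml_file.exists():
    root = ET.parse(xml_file).getroot()
    # fall back to a nested <testsuite> element if the root carries no counts
    suite = root if root.get("failures") is not None else root.find("testsuite")
    failure_count = int(suite.get("failures", 0)) if suite is not None else 0
    print(f"failures reported: {failure_count}")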
tests/inspect-ai/utils/scripts/average_results.py (new file)

Lines changed: 288 additions & 0 deletions

@@ -0,0 +1,288 @@
"""
Script to average inspect-ai and pytest results across multiple attempts.

This script processes results from multiple attempts stored in separate directories
and creates averaged results maintaining the same structure as single-attempt results.
"""

import json
import statistics
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any, Dict, List, Union


def process_inspect_ai_results(attempts_dir: Path) -> Dict[str, Any]:
    """
    Process and average inspect-ai results across multiple attempts.

    Args:
        attempts_dir: Directory containing attempt subdirectories

    Returns:
        Averaged summary dictionary with same structure as single attempt
    """
    attempt_dirs = [
        d
        for d in attempts_dir.iterdir()
        if d.is_dir() and d.name.startswith("attempt_")
    ]
    attempt_dirs.sort(key=lambda x: int(x.name.split("_")[1]))

    if not attempt_dirs:
        print("No attempt directories found")
        return {}

    print(f"Found {len(attempt_dirs)} attempts to average")

    all_summaries: List[Dict[str, Union[int, float, bool]]] = []

    for attempt_dir in attempt_dirs:
        # Find the JSON result file in this attempt
        json_files = list(attempt_dir.glob("*.json"))
        if not json_files:
            print(f"Warning: No JSON files found in {attempt_dir}")
            continue

        # Use the first JSON file (should only be one)
        result_file = json_files[0]

        # Process this single result to get summary
        with open(result_file, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from {result_file}: {e}")
                continue

        samples = data.get("samples", [])
        total_tests = len(samples)

        if total_tests == 0:
            print(f"Warning: No samples found in {result_file}")
            continue

        # Count results
        passed_tests = sum(
            1
            for s in samples
            if s.get("scores", {}).get("model_graded_qa", {}).get("value") == "C"
        )
        partial_tests = sum(
            1
            for s in samples
            if s.get("scores", {}).get("model_graded_qa", {}).get("value") == "P"
        )
        failed_tests = sum(
            1
            for s in samples
            if s.get("scores", {}).get("model_graded_qa", {}).get("value") == "I"
        )

        passing_tests = passed_tests + partial_tests
        pass_rate = (passing_tests / total_tests) * 100 if total_tests > 0 else 0

        summary: Dict[str, Union[int, float, bool]] = {
            "total": total_tests,
            "passed": passed_tests,
            "partial": partial_tests,
            "failed": failed_tests,
            "pass_rate": pass_rate,
            "quality_gate_passed": pass_rate >= 80,
        }

        all_summaries.append(summary)
        print(
            f"Attempt {attempt_dir.name}: {passed_tests}C + {partial_tests}P + {failed_tests}I = {passing_tests}/{total_tests} ({pass_rate:.1f}%)"
        )

    if not all_summaries:
        print("No valid summaries found to average")
        return {}

    # Calculate averages
    avg_summary: Dict[str, Union[int, float, bool, str]] = {
        "total": statistics.mean(float(s["total"]) for s in all_summaries),
        "passed": statistics.mean(float(s["passed"]) for s in all_summaries),
        "partial": statistics.mean(float(s["partial"]) for s in all_summaries),
        "failed": statistics.mean(float(s["failed"]) for s in all_summaries),
        "pass_rate": statistics.mean(float(s["pass_rate"]) for s in all_summaries),
    }

    # Round to reasonable precision
    avg_summary["total"] = round(float(avg_summary["total"]), 1)
    avg_summary["passed"] = round(float(avg_summary["passed"]), 1)
    avg_summary["partial"] = round(float(avg_summary["partial"]), 1)
    avg_summary["failed"] = round(float(avg_summary["failed"]), 1)
    avg_summary["pass_rate"] = round(float(avg_summary["pass_rate"]), 1)
    avg_summary["quality_gate_passed"] = avg_summary["pass_rate"] >= 80
    avg_summary["details"] = (
        f"Averaged across {len(all_summaries)} attempts: "
        f"Complete: {avg_summary['passed']}, Partial: {avg_summary['partial']}, "
        f"Incomplete: {avg_summary['failed']}, "
        f"Passing: {avg_summary['passed'] + avg_summary['partial']}/{avg_summary['total']}"
    )

    return avg_summary


def process_pytest_results(attempts_dir: Path) -> Dict[str, Any]:
    """
    Process and average pytest results across multiple attempts.

    Args:
        attempts_dir: Directory containing attempt subdirectories

    Returns:
        Averaged pytest summary dictionary
    """
    attempt_dirs = [
        d
        for d in attempts_dir.iterdir()
        if d.is_dir() and d.name.startswith("attempt_")
    ]
    attempt_dirs.sort(key=lambda x: int(x.name.split("_")[1]))

    if not attempt_dirs:
        print("No attempt directories found for pytest results")
        return {}

    all_pytest_summaries: List[Dict[str, Union[int, float]]] = []

    for attempt_dir in attempt_dirs:
        xml_file = attempt_dir / "test-results.xml"
        if not xml_file.exists():
            print(f"Warning: No test-results.xml found in {attempt_dir}")
            continue

        try:
            tree = ET.parse(xml_file)
            root = tree.getroot()

            # Extract test metrics from XML
            total_tests = int(root.get("tests", 0))
            failures = int(root.get("failures", 0))
            errors = int(root.get("errors", 0))
            skipped = int(root.get("skipped", 0))

            passed_tests = total_tests - failures - errors - skipped
            pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0

            pytest_summary: Dict[str, Union[int, float]] = {
                "total": total_tests,
                "passed": passed_tests,
                "failed": failures,
                "errors": errors,
                "skipped": skipped,
                "pass_rate": pass_rate,
            }

            all_pytest_summaries.append(pytest_summary)
            print(
                f"Attempt {attempt_dir.name} pytest: {passed_tests}/{total_tests} passed ({pass_rate:.1f}%)"
            )

        except (ET.ParseError, ValueError) as e:
            print(f"Error parsing {xml_file}: {e}")
            continue

    if not all_pytest_summaries:
        print("No valid pytest summaries found to average")
        return {}

    # Calculate averages for pytest
    avg_pytest: Dict[str, Union[int, float, str]] = {
        "total": statistics.mean(float(s["total"]) for s in all_pytest_summaries),
        "passed": statistics.mean(float(s["passed"]) for s in all_pytest_summaries),
        "failed": statistics.mean(float(s["failed"]) for s in all_pytest_summaries),
        "errors": statistics.mean(float(s["errors"]) for s in all_pytest_summaries),
        "skipped": statistics.mean(float(s["skipped"]) for s in all_pytest_summaries),
        "pass_rate": statistics.mean(
            float(s["pass_rate"]) for s in all_pytest_summaries
        ),
    }

    # Round to reasonable precision
    for key in avg_pytest:
        if key != "details":
            avg_pytest[key] = round(float(avg_pytest[key]), 1)

    avg_pytest["details"] = (
        f"Averaged across {len(all_pytest_summaries)} attempts: "
        f"Passed: {avg_pytest['passed']}, Failed: {avg_pytest['failed']}, "
        f"Errors: {avg_pytest['errors']}, Skipped: {avg_pytest['skipped']} "
        f"({avg_pytest['pass_rate']:.1f}% pass rate)"
    )

    return avg_pytest


def main():
    """Main function to process and average results."""
    if len(sys.argv) != 3:
        print("Usage: python average_results.py <attempts_dir> <output_dir>")
        sys.exit(1)

    attempts_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2])

    if not attempts_dir.exists() or not attempts_dir.is_dir():
        print(f"Error: Attempts directory does not exist: {attempts_dir}")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Process inspect-ai results
    print("Processing inspect-ai results...")
    inspect_summary = process_inspect_ai_results(attempts_dir)

    if inspect_summary:
        summary_file = output_dir / "summary.json"
        with open(summary_file, "w") as f:
            json.dump(inspect_summary, f, indent=2)
        print(f"Inspect-AI averaged summary saved to: {summary_file}")
        print(
            f"Averaged pass rate (Complete + Partial): {inspect_summary['pass_rate']:.1f}%"
        )
    else:
        print("No inspect-ai results to average")

    # Process pytest results
    print("\nProcessing pytest results...")
    pytest_summary = process_pytest_results(attempts_dir)

    if pytest_summary:
        pytest_summary_file = output_dir / "pytest_summary.json"
        with open(pytest_summary_file, "w") as f:
            json.dump(pytest_summary, f, indent=2)
        print(f"Pytest averaged summary saved to: {pytest_summary_file}")
        print(f"Averaged pytest pass rate: {pytest_summary['pass_rate']:.1f}%")
    else:
        print("No pytest results to average")

    # Create a combined summary
    if inspect_summary or pytest_summary:
        combined_summary = {
            "inspect_ai": inspect_summary,
            "pytest": pytest_summary,
            "overall_quality_gate_passed": (
                (
                    inspect_summary.get("quality_gate_passed", False)
                    and (
                        pytest_summary.get("pass_rate", 0) >= 85
                    )  # 85% threshold for pytest
                )
                if inspect_summary and pytest_summary
                else False
            ),
        }

        combined_file = output_dir / "combined_summary.json"
        with open(combined_file, "w") as f:
            json.dump(combined_summary, f, indent=2)
        print(f"Combined summary saved to: {combined_file}")


if __name__ == "__main__":
    main()
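
For completeness, a self-contained usage sketch of the new script: it builds a throwaway attempts directory with hand-written sample data (two attempts, three graded samples each) and runs the script against it. The sample values are illustrative only; the script path and the expected input/output file names come from the diffs above.

import json
import subprocess
from pathlib import Path

base = Path("tmp_average_demo")
for i, grades in enumerate((["C", "C", "I"], ["C", "P", "I"]), start=1):
    attempt = base / "attempts" / f"attempt_{i}"
    attempt.mkdir(parents=True, exist_ok=True)
    # minimal inspect-ai style log: one sample per grade (C / P / I)
    samples = [{"scores": {"model_graded_qa": {"value": g}}} for g in grades]
    (attempt / "result.json").write_text(json.dumps({"samples": samples}))
    # minimal junit XML with the counts on the root element
    (attempt / "test-results.xml").write_text(
        '<testsuite tests="3" failures="1" errors="0" skipped="0"/>'
    )

subprocess.run(
    [
        "python",
        "tests/inspect-ai/utils/scripts/average_results.py",
        str(base / "attempts"),
        str(base / "out"),
    ],
    check=True,
)
print((base / "out" / "summary.json").read_text())
print((base / "out" / "pytest_summary.json").read_text())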
