-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_analysis.py
More file actions
80 lines (64 loc) · 2.8 KB
/
run_analysis.py
File metadata and controls
80 lines (64 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
import os
import argparse
from get_results import (
get_results_from_judgements,
calculate_language_fidelity,
name_mappings
)
def _report_subject_scores(subjects, results, display_names) -> float:
    """Print a percentage score per subject and return the mean of those scores.

    Args:
        subjects: Subject keys to report, in output order.
        results: Mapping from subject key to a sequence of numeric outcomes;
            a subject's score is the mean of its sequence times 100.
        display_names: Mapping from subject key to the label printed for it.

    Returns:
        The unweighted mean of the printed percentages, or 0.0 when none of
        the subjects had any results.
    """
    percentages = []
    for subject in subjects:
        # Subjects without any judged items are skipped entirely so they
        # neither print a line nor drag the average down.
        if subject not in results or len(results[subject]) == 0:
            continue
        outcomes = results[subject]
        score = sum(outcomes) / len(outcomes) * 100
        print(f"{display_names[subject]}: {score:.2f}%")
        percentages.append(score)
    return sum(percentages) / len(percentages) if percentages else 0.0


def main() -> None:
    """Parse command-line options and print a per-subject score summary.

    The subjects analysed are the keys of ``name_mappings``. Subjects whose
    key ends in ``'EV'`` are reported under English and those ending in
    ``'IV'`` under Irish; any other key is not reported. For each group the
    per-subject percentages are printed, followed by the unweighted average
    of those percentages.

    The output directory is created if missing, although nothing in this
    function writes to it.
    """
    parser = argparse.ArgumentParser(description="Run analysis on Leaving Certificate results.")
    parser.add_argument("--model", default="gemini-2.0-flash", help="Model to analyze")
    parser.add_argument("--judge-model", default="gemini-2.5-flash-preview-04-17", help="Judge model")
    parser.add_argument("--judgements-dir", default="judgements", help="Directory containing judgements")
    parser.add_argument("--responses-dir", default="responses", help="Directory containing responses")
    parser.add_argument("--output-dir", default="output", help="Directory to save outputs")
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    print(f"Analyzing model: {args.model}")

    # Get list of subjects
    subjects = list(name_mappings.keys())

    # Get results from judgements. Only the first returned value (the
    # per-subject results) is used by the summary below; the remaining
    # values are discarded.
    print(f"Getting results from judgements for {len(subjects)} subjects...")
    results, *_ = get_results_from_judgements(
        subjects=subjects,
        model=args.model,
        judge_model=args.judge_model,
        judgements_dir=args.judgements_dir,
    )

    # Calculate language fidelity.
    # NOTE(review): the return value is not used anywhere in this function;
    # the call is retained in case it has side effects (e.g. its own
    # output) -- confirm against get_results.calculate_language_fidelity.
    print("Calculating language fidelity...")
    calculate_language_fidelity(
        subjects=subjects,
        models=[args.model],
        responses_dir=args.responses_dir,
    )

    # Display summary
    print("\n--- Summary ---")
    print(f"Model: {args.model}")

    english_subjects = [s for s in subjects if s.endswith('EV')]
    irish_subjects = [s for s in subjects if s.endswith('IV')]

    english_average = _report_subject_scores(english_subjects, results, name_mappings)
    # Separator between the English and Irish listings (emits two newlines).
    print("\n")
    irish_average = _report_subject_scores(irish_subjects, results, name_mappings)

    print("\nAverage scores:")
    print(f"English: {english_average:.2f}%")
    print(f"Irish: {irish_average:.2f}%")
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()