
Commit 17110cd

feat: Add failure reason to eval results conversion for merbench page

1 parent: 48fe56f

1 file changed: 182 additions (+182), 0 deletions (−0)
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
import pandas as pd
import json
import sys
from pathlib import Path

# Add parent directory to path to import modules
sys.path.append(str(Path(__file__).parent.parent))

from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import DEFAULT_CONFIG
from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig

def parse_metric_details(metric_details_str):
    """Safely parse JSON string from Metric_details column."""
    if pd.isna(metric_details_str) or not metric_details_str:
        return {}
    try:
        # Details may be Python-repr dicts with single quotes; swap them
        # for double quotes so json.loads can parse the string
        return json.loads(metric_details_str.replace("'", '"'))
    except (json.JSONDecodeError, TypeError):
        return {}

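# Illustrative round-trip, assuming a Python-repr style details string
# (the token counts here are hypothetical):
#   parse_metric_details("{'thoughts_tokens': 128, 'text_prompt_tokens': 42}")
#   -> {"thoughts_tokens": 128, "text_prompt_tokens": 42}
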
def calculate_failure_analysis_data(df):
    """Calculate failure counts by model and failure type."""
    failure_series = [
        {"name": "Invalid Diagram", "column": "Score_MermaidDiagramValid", "condition": "== 0"},
        {"name": "MCP Tool Failure", "column": "Score_UsedBothMCPTools", "condition": "< 1"},
        {"name": "Usage Limit Exceeded", "column": "Score_UsageLimitNotExceeded", "condition": "== 0"},
    ]

    models = sorted(df["Model"].unique())
    failure_data = []

    for model in models:
        model_data = df[df["Model"] == model]
        failure_counts = {"Model": model}

        for series in failure_series:
            # Backtick-quote the column name so DataFrame.eval accepts it
            condition_str = f"`{series['column']}` {series['condition']}"
            count = model_data.eval(condition_str).sum()
            failure_counts[series["name"]] = int(count)

        failure_data.append(failure_counts)

    return failure_data

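# Illustrative return value (model name and counts hypothetical):
#   [{"Model": "gemini-2.5-pro", "Invalid Diagram": 2,
#     "MCP Tool Failure": 1, "Usage Limit Exceeded": 0}, ...]
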
def process_csv_for_static_site(csv_path):
    """Process CSV file and return data structure for static site."""
    # Load configuration
    config = DashboardConfig(**DEFAULT_CONFIG)

    # Read CSV
    df = pd.read_csv(csv_path)

    # Replace NaN values with 0 for numeric columns
    numeric_columns = ['Metric_request_tokens', 'Metric_response_tokens', 'Metric_total_tokens']
    for col in numeric_columns:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Extract grouping column (test case types) from the case name suffix
    # (e.g., hypothetically, "flowchart_easy" -> "easy")
    df['test_group'] = df['Case'].apply(lambda x: x.split('_')[-1] if '_' in x else 'other')

    # Parse metric details to extract token information
    if "Metric_details" in df.columns:
        metric_details = df["Metric_details"].apply(parse_metric_details)
        df["thinking_tokens"] = metric_details.apply(lambda x: x.get("thoughts_tokens", 0))
        df["text_tokens"] = metric_details.apply(lambda x: x.get("text_prompt_tokens", 0))
    else:
        df["thinking_tokens"] = 0
        df["text_tokens"] = 0

    # Calculate total tokens
    df["total_tokens"] = df["Metric_total_tokens"].fillna(0)

    # Calculate success rate (primary metric)
    df["Success_Rate"] = df["Score_MermaidDiagramValid"] * 100

    # Extract provider from model name
    def extract_provider(model_name):
        if model_name.startswith("gemini-"):
            return "Google"
        elif "claude" in model_name.lower():
            return "Anthropic"
        elif "gpt" in model_name.lower():
            return "OpenAI"
        else:
            return "Other"

    df["provider"] = df["Model"].apply(extract_provider)

    # Create leaderboard data
    leaderboard = df.groupby("Model").agg({
        "Success_Rate": "mean",
        "Duration": "mean",
        "total_tokens": "mean",
        "Case": "count",  # Number of runs
        "provider": "first"
    }).reset_index()

    leaderboard.columns = ["Model", "Success_Rate", "Avg_Duration", "Avg_Tokens", "Runs", "Provider"]
    leaderboard = leaderboard.sort_values("Success_Rate", ascending=False)

    # Create data for Pareto frontier plot
    pareto_data = df.groupby("Model").agg({
        "Success_Rate": "mean",
        "Duration": "mean",
        "total_tokens": "mean",
        # Average only over non-zero token counts so missing metrics
        # (filled with 0 above) don't drag the mean down
        "Metric_request_tokens": lambda x: x[x > 0].mean() if any(x > 0) else 0,
        "Metric_response_tokens": lambda x: x[x > 0].mean() if any(x > 0) else 0
    }).reset_index()

    # Fill any remaining NaN values with 0
    pareto_data = pareto_data.fillna(0)

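    # Illustrative pareto_data record (all numbers hypothetical):
    #   {"Model": "gemini-2.5-flash", "Success_Rate": 72.0, "Duration": 8.4,
    #    "total_tokens": 3100.0, "Metric_request_tokens": 900.0,
    #    "Metric_response_tokens": 2200.0}
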
    # Create test group performance data
    test_groups_data = df.groupby(["Model", "test_group"]).agg({
        "Score_MermaidDiagramValid": "mean",
        "Score_UsageLimitNotExceeded": "mean",
        "Score_UsedBothMCPTools": "mean"
    }).reset_index()

    # Calculate failure analysis data
    failure_analysis_data = calculate_failure_analysis_data(df)

    # Calculate aggregate statistics
    stats = {
        "total_runs": len(df),
        "models_evaluated": df["Model"].nunique(),
        "test_cases": df["Case"].nunique(),
        "test_groups": sorted(df["test_group"].unique().tolist()),
        "providers": sorted(df["provider"].unique().tolist()),
        "models": sorted(df["Model"].unique().tolist())
    }

    # Create final data structure
    output_data = {
        "stats": stats,
        "leaderboard": leaderboard.to_dict(orient="records"),
        "pareto_data": pareto_data.to_dict(orient="records"),
        "test_groups_data": test_groups_data.to_dict(orient="records"),
        "failure_analysis_data": failure_analysis_data,
        "raw_data": df[[
            "Model", "Case", "test_group", "Duration",
            "Score_MermaidDiagramValid", "Score_UsageLimitNotExceeded",
            "Score_UsedBothMCPTools", "total_tokens", "provider",
            "Metric_request_tokens", "Metric_response_tokens"
        ]].to_dict(orient="records"),
        "config": {
            "title": config.title,
            "description": config.description,
            "primary_metric": {
                "name": "Success_Rate",
                "label": "Success Rate (%)"
            }
        }
    }

    return output_data

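# Shape of the structure returned above, for orientation:
#   {"stats": {...}, "leaderboard": [...], "pareto_data": [...],
#    "test_groups_data": [...], "failure_analysis_data": [...],
#    "raw_data": [...], "config": {...}}
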
def main():
    csv_path = "/home/ubuntu/projects/agents-mcp-usage/mermaid_eval_results/Jun_gemini_results.csv"
    output_path = "/home/ubuntu/projects/agents-mcp-usage/agents_mcp_usage/evaluations/mermaid_evals/results/Jun_gemini_results_processed.json"

    print(f"Processing {csv_path}...")
    data = process_csv_for_static_site(csv_path)

    # json.dumps emits bare NaN tokens, which are not valid JSON...
    json_str = json.dumps(data, indent=2)
    # ...so replace them with null after serialising
    json_str = json_str.replace(": NaN", ": null")

    # Write output
    with open(output_path, 'w') as f:
        f.write(json_str)

    print(f"Data processed and saved to {output_path}")
    print(f"- Total runs: {data['stats']['total_runs']}")
    print(f"- Models evaluated: {data['stats']['models_evaluated']}")
    print(f"- Test cases: {data['stats']['test_cases']}")


if __name__ == "__main__":
    main()
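
As a quick sanity check of the JSON artifact this script writes, a minimal sketch along these lines loads the processed file and prints the leaderboard ordering (the file name is assumed to match the output_path above; this sketch is not part of the commit):

    import json

    # Path assumed: wherever output_path pointed when main() ran
    with open("Jun_gemini_results_processed.json") as f:
        data = json.load(f)

    for row in data["leaderboard"]:
        print(f"{row['Model']}: {row['Success_Rate']:.1f}% success over {row['Runs']} runs")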
