#!/usr/bin/env python3
"""Convert mermaid evaluation CSV results into a JSON payload for the static results site."""
import ast
import json
import sys
from pathlib import Path

import pandas as pd

# Make the agents_mcp_usage package importable when the script is run directly
sys.path.append(str(Path(__file__).parent.parent))

from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import DEFAULT_CONFIG
from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig


def parse_metric_details(metric_details_str):
    """Safely parse the JSON/dict string stored in the Metric_details column."""
    if pd.isna(metric_details_str) or not metric_details_str:
        return {}
    try:
        return json.loads(metric_details_str)
    except (json.JSONDecodeError, TypeError):
        pass
    try:
        # Metric_details sometimes holds a repr() of a Python dict (single
        # quotes, None/True/False), which json.loads rejects
        return ast.literal_eval(metric_details_str)
    except (ValueError, SyntaxError):
        return {}


def calculate_failure_analysis_data(df):
    """Calculate failure counts by model and failure type."""
    failure_series = [
        {"name": "Invalid Diagram", "column": "Score_MermaidDiagramValid", "condition": "== 0"},
        {"name": "MCP Tool Failure", "column": "Score_UsedBothMCPTools", "condition": "< 1"},
        {"name": "Usage Limit Exceeded", "column": "Score_UsageLimitNotExceeded", "condition": "== 0"},
    ]

    models = sorted(df["Model"].unique())
    failure_data = []

    for model in models:
        model_data = df[df["Model"] == model]
        failure_counts = {"Model": model}

        for series in failure_series:
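            # Build an expression such as `Score_MermaidDiagramValid` == 0,
            # evaluate it with DataFrame.eval(), and count matching runs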
            condition_str = f"`{series['column']}` {series['condition']}"
            count = model_data.eval(condition_str).sum()
            failure_counts[series["name"]] = int(count)

        failure_data.append(failure_counts)

    return failure_data


def process_csv_for_static_site(csv_path):
| 47 | + """Process CSV file and return data structure for static site.""" |
| 48 | + # Load configuration |
| 49 | + config = DashboardConfig(**DEFAULT_CONFIG) |
| 50 | + |
| 51 | + # Read CSV |
| 52 | + df = pd.read_csv(csv_path) |
| 53 | + |
| 54 | + # Replace NaN values with 0 for numeric columns |
| 55 | + numeric_columns = ['Metric_request_tokens', 'Metric_response_tokens', 'Metric_total_tokens'] |
| 56 | + for col in numeric_columns: |
| 57 | + if col in df.columns: |
| 58 | + df[col] = df[col].fillna(0) |
| 59 | + |
| 60 | + # Extract grouping column (test case types) |
| 61 | + df['test_group'] = df['Case'].apply(lambda x: x.split('_')[-1] if '_' in x else 'other') |
| 62 | + |
| 63 | + # Parse metric details to extract token information |
| 64 | + if "Metric_details" in df.columns: |
| 65 | + metric_details = df["Metric_details"].apply(parse_metric_details) |
| 66 | + df["thinking_tokens"] = metric_details.apply(lambda x: x.get("thoughts_tokens", 0)) |
| 67 | + df["text_tokens"] = metric_details.apply(lambda x: x.get("text_prompt_tokens", 0)) |
| 68 | + else: |
| 69 | + df["thinking_tokens"] = 0 |
| 70 | + df["text_tokens"] = 0 |
| 71 | + |
| 72 | + # Calculate total tokens |
| 73 | + df["total_tokens"] = df["Metric_total_tokens"].fillna(0) |
| 74 | + |
| 75 | + # Calculate success rate (primary metric) |
| 76 | + df["Success_Rate"] = df["Score_MermaidDiagramValid"] * 100 |
| 77 | + |
| 78 | + # Extract provider from model name |
| 79 | + def extract_provider(model_name): |
| 80 | + if model_name.startswith("gemini-"): |
| 81 | + return "Google" |
| 82 | + elif "claude" in model_name.lower(): |
| 83 | + return "Anthropic" |
| 84 | + elif "gpt" in model_name.lower(): |
| 85 | + return "OpenAI" |
| 86 | + else: |
| 87 | + return "Other" |
| 88 | + |
| 89 | + df["provider"] = df["Model"].apply(extract_provider) |
| 90 | + |
| 91 | + # Create leaderboard data |
| 92 | + leaderboard = df.groupby("Model").agg({ |
| 93 | + "Success_Rate": "mean", |
| 94 | + "Duration": "mean", |
| 95 | + "total_tokens": "mean", |
| 96 | + "Case": "count", # Number of runs |
| 97 | + "provider": "first" |
| 98 | + }).reset_index() |
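    # The positional rename below must follow the agg() column order:
    # Model, Success_Rate, Duration, total_tokens, Case, provider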

    leaderboard.columns = ["Model", "Success_Rate", "Avg_Duration", "Avg_Tokens", "Runs", "Provider"]
    leaderboard = leaderboard.sort_values("Success_Rate", ascending=False)

    # Create data for Pareto frontier plot
    pareto_data = df.groupby("Model").agg({
        "Success_Rate": "mean",
        "Duration": "mean",
        "total_tokens": "mean",
        "Metric_request_tokens": lambda x: x[x > 0].mean() if (x > 0).any() else 0,
        "Metric_response_tokens": lambda x: x[x > 0].mean() if (x > 0).any() else 0,
    }).reset_index()
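    # The lambdas above average only non-zero token counts, so rows whose
    # missing token metrics were zero-filled earlier do not drag the mean down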

    # Fill any remaining NaN values with 0
    pareto_data = pareto_data.fillna(0)

    # Create test group performance data
    test_groups_data = df.groupby(["Model", "test_group"]).agg({
        "Score_MermaidDiagramValid": "mean",
        "Score_UsageLimitNotExceeded": "mean",
        "Score_UsedBothMCPTools": "mean",
    }).reset_index()
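    # test_groups_data holds the mean of each score column per (Model, test_group) pair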

    # Calculate failure analysis data
    failure_analysis_data = calculate_failure_analysis_data(df)

    # Calculate aggregate statistics
    stats = {
        "total_runs": len(df),
        "models_evaluated": df["Model"].nunique(),
        "test_cases": df["Case"].nunique(),
        "test_groups": sorted(df["test_group"].unique().tolist()),
        "providers": sorted(df["provider"].unique().tolist()),
        "models": sorted(df["Model"].unique().tolist()),
    }

    # Create final data structure
    output_data = {
        "stats": stats,
        "leaderboard": leaderboard.to_dict(orient="records"),
        "pareto_data": pareto_data.to_dict(orient="records"),
        "test_groups_data": test_groups_data.to_dict(orient="records"),
        "failure_analysis_data": failure_analysis_data,
        "raw_data": df[[
            "Model", "Case", "test_group", "Duration",
            "Score_MermaidDiagramValid", "Score_UsageLimitNotExceeded",
            "Score_UsedBothMCPTools", "total_tokens", "provider",
            "Metric_request_tokens", "Metric_response_tokens",
        ]].to_dict(orient="records"),
        "config": {
            "title": config.title,
            "description": config.description,
            "primary_metric": {
                "name": "Success_Rate",
                "label": "Success Rate (%)",
            },
        },
    }

    return output_data


def main():
    csv_path = "/home/ubuntu/projects/agents-mcp-usage/mermaid_eval_results/Jun_gemini_results.csv"
    output_path = "/home/ubuntu/projects/agents-mcp-usage/agents_mcp_usage/evaluations/mermaid_evals/results/Jun_gemini_results_processed.json"

    print(f"Processing {csv_path}...")
    data = process_csv_for_static_site(csv_path)

    # json.dumps writes bare NaN tokens for float NaN values, which is not
    # valid JSON, so replace them with null
    json_str = json.dumps(data, indent=2)
    json_str = json_str.replace(": NaN", ": null")

    # Write output
    with open(output_path, "w") as f:
        f.write(json_str)

    print(f"Data processed and saved to {output_path}")
    print(f"- Total runs: {data['stats']['total_runs']}")
    print(f"- Models evaluated: {data['stats']['models_evaluated']}")
    print(f"- Test cases: {data['stats']['test_cases']}")


if __name__ == "__main__":
    main()