From 223701f747216ec0488b77fed9b16c670e4a1c30 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Wed, 2 Jul 2025 09:50:42 +0000 Subject: [PATCH 1/2] refact: Add named args to preprocess script --- .../evaluations/mermaid_evals/README.md | 4 +-- .../scripts/preprocess_merbench_data.py | 28 +++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/agents_mcp_usage/evaluations/mermaid_evals/README.md b/agents_mcp_usage/evaluations/mermaid_evals/README.md index 553e033..95a5d6e 100644 --- a/agents_mcp_usage/evaluations/mermaid_evals/README.md +++ b/agents_mcp_usage/evaluations/mermaid_evals/README.md @@ -102,8 +102,8 @@ The local dashboard (`merbench_ui.py`) provides: ```bash # Convert CSV results to JSON format for the public Merbench website uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py \ - mermaid_eval_results/_combined_results.csv \ - agents_mcp_usage/evaluations/mermaid_evals/results/_processed.json + -i "mermaid_eval_results/_combined_results.csv" \ + -o "agents_mcp_usage/evaluations/mermaid_evals/results/_processed.json" ``` ## Evaluation Task & Test Cases diff --git a/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py b/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py index 844f98d..0fe2b40 100644 --- a/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py +++ b/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py @@ -2,6 +2,8 @@ import pandas as pd import json import sys +import argparse +from datetime import datetime from pathlib import Path # Add parent directory to path to import modules @@ -9,6 +11,7 @@ from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import DEFAULT_CONFIG from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig +from agents_mcp_usage.utils import get_project_root def parse_metric_details(metric_details_str): """Safely parse JSON string from Metric_details column.""" @@ -158,8 +161,29 @@ def extract_provider(model_name): return output_data def main(): - csv_path = "/home/ubuntu/projects/agents-mcp-usage/mermaid_eval_results/Jun_gemini_results.csv" - output_path = "/home/ubuntu/projects/agents-mcp-usage/agents_mcp_usage/evaluations/mermaid_evals/results/Jun_gemini_results_processed.json" + parser = argparse.ArgumentParser(description="Process CSV evaluation results for static site") + parser.add_argument("-i", "--input_csv", nargs="?", help="Path to input CSV file", default=None) + parser.add_argument("-o", "--output_json", nargs="?", help="Path to output JSON file", default=None) + + args = parser.parse_args() + + project_root = get_project_root() + current_month = datetime.now().strftime("%b").lower() + + # Set default paths if not provided + if args.input_csv: + csv_path = Path(args.input_csv) + if not csv_path.is_absolute(): + csv_path = project_root / csv_path + else: + csv_path = project_root / "mermaid_eval_results" / f"{current_month}_gemini_results.csv" + + if args.output_json: + output_path = Path(args.output_json) + if not output_path.is_absolute(): + output_path = project_root / output_path + else: + output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_gemini_results_processed.json" print(f"Processing {csv_path}...") data = process_csv_for_static_site(csv_path) From 5b83398ff30b1d6470ae249302343ab52a73aed2 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Thu, 3 Jul 2025 08:59:30 +0000 Subject: [PATCH 2/2] chore: Remove top level results --- mermaid_eval_results/Jun_gemini_results.csv | 106 -------------------- 1 file changed, 106 deletions(-) delete mode 100644 mermaid_eval_results/Jun_gemini_results.csv diff --git a/mermaid_eval_results/Jun_gemini_results.csv b/mermaid_eval_results/Jun_gemini_results.csv deleted file mode 100644 index 7238878..0000000 --- a/mermaid_eval_results/Jun_gemini_results.csv +++ /dev/null @@ -1,106 +0,0 @@ -Model,Run,Case,Duration,Fixed_Diagram_Length,Failure_Reason,Tools_Used,Score_MermaidDiagramValid,Score_UsageLimitNotExceeded,Score_UsedBothMCPTools,Metric_details,Metric_request_tokens,Metric_requests,Metric_response_tokens,Metric_total_tokens -gemini-2.5-pro-preview-06-05,1,fix_invalid_diagram_easy,25.314810254,1602,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 1109, 'text_prompt_tokens': 2999}",2999,2,1180,5288 -gemini-2.5-pro-preview-06-05,1,fix_invalid_diagram_medium,29.164007858,1508,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 1082, 'text_prompt_tokens': 3007}",3007,2,1155,5244 -gemini-2.5-pro-preview-06-05,1,fix_invalid_diagram_hard,88.240292399,1573,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 3882, 'text_prompt_tokens': 11479}",11479,4,2447,17808 -gemini-2.5-pro-preview-06-05,2,fix_invalid_diagram_easy,46.903850207,1658,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 2033, 'text_prompt_tokens': 3861}",3861,3,1196,7090 -gemini-2.5-pro-preview-06-05,2,fix_invalid_diagram_medium,60.831734772,1570,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 5420, 'text_prompt_tokens': 3007}",3007,2,1171,9598 -gemini-2.5-pro-preview-06-05,2,fix_invalid_diagram_hard,32.209129106,1572,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 1867, 'text_prompt_tokens': 3007}",3007,2,1173,6047 -gemini-2.5-pro-preview-06-05,3,fix_invalid_diagram_easy,31.297773585,1649,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 1671, 'text_prompt_tokens': 2999}",2999,2,1196,5866 -gemini-2.5-pro-preview-06-05,3,fix_invalid_diagram_medium,83.609651342,1569,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 5399, 'text_prompt_tokens': 7453}",7453,4,1820,14672 -gemini-2.5-pro-preview-06-05,3,fix_invalid_diagram_hard,53.065635183,1445,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 3100, 'text_prompt_tokens': 6507}",6507,3,1744,11351 -gemini-2.5-pro-preview-06-05,4,fix_invalid_diagram_easy,32.600817879,1573,,validate_mermaid_diagram|get_current_time,1.0,1.0,1.0,"{'thoughts_tokens': 1394, 'text_prompt_tokens': 3107}",3107,2,1192,5693 -gemini-2.5-pro-preview-06-05,4,fix_invalid_diagram_medium,33.96250988,1627,,validate_mermaid_diagram|get_current_time,1.0,1.0,1.0,"{'thoughts_tokens': 1552, 'text_prompt_tokens': 3107}",3107,2,1190,5849 -gemini-2.5-pro-preview-06-05,4,fix_invalid_diagram_hard,40.243743396,1509,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 2127, 'text_prompt_tokens': 3107}",3107,2,1154,6388 -gemini-2.5-pro-preview-06-05,5,fix_invalid_diagram_easy,27.064979554,1595,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 1159, 'text_prompt_tokens': 2999}",2999,2,1179,5337 -gemini-2.5-pro-preview-06-05,5,fix_invalid_diagram_medium,86.028303837,1562,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 4707, 'text_prompt_tokens': 11366}",11366,4,2399,18472 -gemini-2.5-pro-preview-06-05,5,fix_invalid_diagram_hard,32.780646607,1440,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 1563, 'text_prompt_tokens': 3007}",3007,2,1133,5703 -gemini-2.5-pro-preview-05-06,1,fix_invalid_diagram_easy,90.311101073,1636,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 190003, 'text_prompt_tokens': 6385}",6385,4,1480,197868 -gemini-2.5-pro-preview-05-06,1,fix_invalid_diagram_medium,142.709473835,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,1,fix_invalid_diagram_hard,59.031657247,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,2,fix_invalid_diagram_easy,19.541418498,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,2,fix_invalid_diagram_medium,125.222876626,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,2,fix_invalid_diagram_hard,36.623935689,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,3,fix_invalid_diagram_easy,60.148244504,1658,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 15739, 'text_prompt_tokens': 54413}",54413,4,2191,72343 -gemini-2.5-pro-preview-05-06,3,fix_invalid_diagram_medium,123.194014333,1690,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 119869, 'text_prompt_tokens': 60324}",60324,5,3883,184076 -gemini-2.5-pro-preview-05-06,3,fix_invalid_diagram_hard,16.74454915,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,4,fix_invalid_diagram_easy,64.164246072,1651,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 18452, 'text_prompt_tokens': 56413}",56413,4,3431,78296 -gemini-2.5-pro-preview-05-06,4,fix_invalid_diagram_medium,42.140703106,1509,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 85357, 'text_prompt_tokens': 4519}",4519,3,1342,91218 -gemini-2.5-pro-preview-05-06,4,fix_invalid_diagram_hard,95.642718199,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,5,fix_invalid_diagram_easy,62.127767944,1647,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 8942, 'text_prompt_tokens': 56065}",56065,4,3177,68184 -gemini-2.5-pro-preview-05-06,5,fix_invalid_diagram_medium,112.559107875,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-05-06,5,fix_invalid_diagram_hard,112.154635461,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,1,fix_invalid_diagram_easy,67.505358126,1638,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 39337, 'text_prompt_tokens': 54862}",54862,4,2603,96802 -gemini-2.5-pro-preview-03-25,1,fix_invalid_diagram_medium,22.877179664,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,1,fix_invalid_diagram_hard,46.778307508,1508,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 12915, 'text_prompt_tokens': 4004}",4004,3,1315,18234 -gemini-2.5-pro-preview-03-25,2,fix_invalid_diagram_easy,69.72560593,1601,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 68495, 'text_prompt_tokens': 39228}",39228,4,1996,109719 -gemini-2.5-pro-preview-03-25,2,fix_invalid_diagram_medium,107.98203716,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,2,fix_invalid_diagram_hard,136.260362709,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,3,fix_invalid_diagram_easy,80.179718025,1625,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 152940, 'text_prompt_tokens': 3874}",3874,3,1511,158325 -gemini-2.5-pro-preview-03-25,3,fix_invalid_diagram_medium,54.728993541,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,3,fix_invalid_diagram_hard,99.128847196,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,4,fix_invalid_diagram_easy,46.010087457,1581,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 25388, 'text_prompt_tokens': 3980}",3980,3,1295,30663 -gemini-2.5-pro-preview-03-25,4,fix_invalid_diagram_medium,90.896605166,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,4,fix_invalid_diagram_hard,249.374101535,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,5,fix_invalid_diagram_easy,115.493402964,1713,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 96056, 'text_prompt_tokens': 55991}",55991,4,3221,155268 -gemini-2.5-pro-preview-03-25,5,fix_invalid_diagram_medium,90.85619701,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.5-pro-preview-03-25,5,fix_invalid_diagram_hard,233.217439735,0,usage_limit_exceeded,,0.0,0.0,0.0,,,,, -gemini-2.0-flash,1,fix_invalid_diagram_easy,8.909464048,1496,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 498}",736,1,498,1234 -gemini-2.0-flash,1,fix_invalid_diagram_medium,5.837458282,1487,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 503}",736,1,503,1239 -gemini-2.0-flash,1,fix_invalid_diagram_hard,6.413123275,1480,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 502}",736,1,502,1238 -gemini-2.0-flash,2,fix_invalid_diagram_easy,5.292221706,1496,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 498}",736,1,498,1234 -gemini-2.0-flash,2,fix_invalid_diagram_medium,12.366546526,1567,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'text_prompt_tokens': 2723, 'text_candidates_tokens': 1073}",2723,2,1073,3796 -gemini-2.0-flash,2,fix_invalid_diagram_hard,6.458367757,1491,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 506}",736,1,506,1242 -gemini-2.0-flash,3,fix_invalid_diagram_easy,7.122025352,1576,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 525}",736,1,525,1261 -gemini-2.0-flash,3,fix_invalid_diagram_medium,15.055406281,1567,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'text_prompt_tokens': 2723, 'text_candidates_tokens': 1073}",2723,2,1073,3796 -gemini-2.0-flash,3,fix_invalid_diagram_hard,6.581593788,1480,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 502}",736,1,502,1238 -gemini-2.0-flash,4,fix_invalid_diagram_easy,8.904104594,1561,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 520}",736,1,520,1256 -gemini-2.0-flash,4,fix_invalid_diagram_medium,3.709682467,1480,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 502}",736,1,502,1238 -gemini-2.0-flash,4,fix_invalid_diagram_hard,7.48106499,1487,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 503}",736,1,503,1239 -gemini-2.0-flash,5,fix_invalid_diagram_easy,4.313346779,1496,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 498}",736,1,498,1234 -gemini-2.0-flash,5,fix_invalid_diagram_medium,4.285199703,1487,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 503}",736,1,503,1239 -gemini-2.0-flash,5,fix_invalid_diagram_hard,3.759377617,1487,,,0.0,1.0,0.0,"{'text_prompt_tokens': 736, 'text_candidates_tokens': 503}",736,1,503,1239 -gemini-2.5-flash-preview-04-17,1,fix_invalid_diagram_easy,60.637950634,1594,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 58731, 'text_prompt_tokens': 15093, 'cached_content_tokens': 2900, 'text_cache_tokens': 2900}",15093,5,2470,76294 -gemini-2.5-flash-preview-04-17,1,fix_invalid_diagram_medium,31.479987348,1569,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 30019, 'text_prompt_tokens': 8696}",8696,4,1852,40567 -gemini-2.5-flash-preview-04-17,1,fix_invalid_diagram_hard,5.009089436,1562,,,0.0,1.0,0.0,"{'thoughts_tokens': 396, 'text_prompt_tokens': 810}",810,1,524,1730 -gemini-2.5-flash-preview-04-17,2,fix_invalid_diagram_easy,27.092670181,1631,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 63275, 'text_prompt_tokens': 7623}",7623,3,1833,72731 -gemini-2.5-flash-preview-04-17,2,fix_invalid_diagram_medium,15.495790935,1567,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 2538, 'text_prompt_tokens': 1669}",1669,2,552,4759 -gemini-2.5-flash-preview-04-17,2,fix_invalid_diagram_hard,15.96490713,1570,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 1481, 'text_prompt_tokens': 3007}",3007,2,1173,5661 -gemini-2.5-flash-preview-04-17,3,fix_invalid_diagram_easy,54.049785353,1581,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 5893, 'text_prompt_tokens': 4407, 'cached_content_tokens': 1795, 'text_cache_tokens': 1795}",4407,3,1179,11479 -gemini-2.5-flash-preview-04-17,3,fix_invalid_diagram_medium,21.902809368,1570,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 2130, 'text_prompt_tokens': 3868}",3868,3,1173,7171 -gemini-2.5-flash-preview-04-17,3,fix_invalid_diagram_hard,55.958218108,1563,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 18135, 'text_prompt_tokens': 3534}",3534,2,1685,23354 -gemini-2.5-flash-preview-04-17,4,fix_invalid_diagram_easy,11.505478102,1576,,,0.0,1.0,0.0,"{'thoughts_tokens': 9091, 'text_prompt_tokens': 811}",811,1,526,10428 -gemini-2.5-flash-preview-04-17,4,fix_invalid_diagram_medium,28.035491061,1568,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 12355, 'text_prompt_tokens': 6558}",6558,3,1804,20717 -gemini-2.5-flash-preview-04-17,4,fix_invalid_diagram_hard,21.009636819,1561,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 17356, 'text_prompt_tokens': 3902}",3902,3,1189,22447 -gemini-2.5-flash-preview-04-17,5,fix_invalid_diagram_easy,29.185720392,0,response_validation_failed,,0.0,1.0,0.0,,,,, -gemini-2.5-flash-preview-04-17,5,fix_invalid_diagram_medium,7.030499633,51,,,0.0,1.0,0.0,"{'thoughts_tokens': 1282, 'text_prompt_tokens': 810}",810,1,17,2109 -gemini-2.5-flash-preview-04-17,5,fix_invalid_diagram_hard,28.157089277,1568,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 2803, 'text_prompt_tokens': 3868}",3868,3,1173,7844 -gemini-2.5-flash,1,fix_invalid_diagram_easy,16.271507577,1616,,validate_mermaid_diagram,1.0,1.0,0.5,"{'thoughts_tokens': 19715, 'text_prompt_tokens': 3060}",3060,2,1164,23939 -gemini-2.5-flash,1,fix_invalid_diagram_medium,15.337420676,1568,,validate_mermaid_diagram|get_current_time,0.0,1.0,1.0,"{'thoughts_tokens': 1114, 'text_prompt_tokens': 5361}",5361,3,1174,7649 -gemini-2.5-flash,1,fix_invalid_diagram_hard,6.332825299,1569,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 417, 'text_prompt_tokens': 1669}",1669,2,552,2638 -gemini-2.5-flash,2,fix_invalid_diagram_easy,5.406013529,1581,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 392, 'text_prompt_tokens': 1671}",1671,2,554,2617 -gemini-2.5-flash,2,fix_invalid_diagram_medium,4.596515261,1568,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 335, 'text_prompt_tokens': 1669}",1669,2,552,2556 -gemini-2.5-flash,2,fix_invalid_diagram_hard,10.369176736,1567,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 1767, 'text_prompt_tokens': 3981}",3981,3,1184,6932 -gemini-2.5-flash,3,fix_invalid_diagram_easy,6.68512552,1583,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 3177, 'text_prompt_tokens': 1671}",1671,2,554,5402 -gemini-2.5-flash,3,fix_invalid_diagram_medium,8.790923666,1569,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 652, 'text_prompt_tokens': 1669}",1669,2,554,2875 -gemini-2.5-flash,3,fix_invalid_diagram_hard,12.593662815,1569,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 1515, 'text_prompt_tokens': 1669}",1669,2,554,3738 -gemini-2.5-flash,4,fix_invalid_diagram_easy,23.406116351,1785,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 41536, 'text_prompt_tokens': 4005}",4005,3,1266,46807 -gemini-2.5-flash,4,fix_invalid_diagram_medium,5.594700624,1567,,get_current_time,0.0,1.0,0.5,"{'thoughts_tokens': 228, 'text_prompt_tokens': 1669}",1669,2,552,2449 -gemini-2.5-flash,4,fix_invalid_diagram_hard,21.817208931,1568,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,"{'thoughts_tokens': 36903, 'text_prompt_tokens': 3980}",3980,3,1185,42068 -gemini-2.5-flash,5,fix_invalid_diagram_easy,14.751506297,1713,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,"{'thoughts_tokens': 4346, 'text_prompt_tokens': 3970}",3970,3,1219,9535 -gemini-2.5-flash,5,fix_invalid_diagram_medium,26.279386281,1568,,validate_mermaid_diagram|get_current_time,0.0,1.0,1.0,"{'thoughts_tokens': 15068, 'text_prompt_tokens': 9113}",9113,4,1804,25985 -gemini-2.5-flash,5,fix_invalid_diagram_hard,14.502334274,1570,,validate_mermaid_diagram|get_current_time,0.0,1.0,1.0,"{'thoughts_tokens': 852, 'text_prompt_tokens': 5361}",5361,3,1174,7387 -gemini-2.5-flash-lite-preview-06-17,1,fix_invalid_diagram_easy,4.444876941,1576,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3088},3088,2,1196,4284 -gemini-2.5-flash-lite-preview-06-17,1,fix_invalid_diagram_medium,4.627413202,1564,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3089},3089,2,1202,4291 -gemini-2.5-flash-lite-preview-06-17,1,fix_invalid_diagram_hard,3.704063431,181,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3091},3091,2,683,3774 -gemini-2.5-flash-lite-preview-06-17,2,fix_invalid_diagram_easy,4.447734786,1576,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3091},3091,2,1191,4282 -gemini-2.5-flash-lite-preview-06-17,2,fix_invalid_diagram_medium,4.576908765,1558,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3091},3091,2,1180,4271 -gemini-2.5-flash-lite-preview-06-17,2,fix_invalid_diagram_hard,5.032189281,1943,,add|get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,{'text_prompt_tokens': 3136},3136,2,1476,4612 -gemini-2.5-flash-lite-preview-06-17,3,fix_invalid_diagram_easy,4.472401128,1576,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3088},3088,2,1186,4274 -gemini-2.5-flash-lite-preview-06-17,3,fix_invalid_diagram_medium,3.732016304,109,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,{'text_prompt_tokens': 3107},3107,2,671,3778 -gemini-2.5-flash-lite-preview-06-17,3,fix_invalid_diagram_hard,4.756348604,1837,,add|get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,{'text_prompt_tokens': 3136},3136,2,1435,4571 -gemini-2.5-flash-lite-preview-06-17,4,fix_invalid_diagram_easy,4.71480991,1583,,get_current_time|validate_mermaid_diagram,1.0,1.0,1.0,{'text_prompt_tokens': 3104},3104,2,1214,4318 -gemini-2.5-flash-lite-preview-06-17,4,fix_invalid_diagram_medium,3.797644523,101,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,{'text_prompt_tokens': 3107},3107,2,669,3776 -gemini-2.5-flash-lite-preview-06-17,4,fix_invalid_diagram_hard,4.755809993,1564,,add|validate_mermaid_diagram,0.0,1.0,0.5,{'text_prompt_tokens': 3092},3092,2,1204,4296 -gemini-2.5-flash-lite-preview-06-17,5,fix_invalid_diagram_easy,4.602118065,1576,,add|validate_mermaid_diagram,0.0,1.0,0.5,"{'text_prompt_tokens': 3088, 'cached_content_tokens': 1747, 'text_cache_tokens': 1747}",3088,2,1185,4273 -gemini-2.5-flash-lite-preview-06-17,5,fix_invalid_diagram_medium,4.466044834,1562,,get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,{'text_prompt_tokens': 3107},3107,2,1205,4312 -gemini-2.5-flash-lite-preview-06-17,5,fix_invalid_diagram_hard,4.130914105,224,,add|get_current_time|validate_mermaid_diagram,0.0,1.0,1.0,{'text_prompt_tokens': 3136},3136,2,725,3861 \ No newline at end of file