Commit 6891bff

Merge pull request #13 from andrewginns/refactor-evals

Tweak scripts that run Merbench

2 parents: 4183572 + 308a41e

5 files changed: +41 additions, -116 deletions

agents_mcp_usage/evaluations/mermaid_evals/README.md
Lines changed: 2 additions & 2 deletions

````diff
@@ -102,8 +102,8 @@ The local dashboard (`merbench_ui.py`) provides:
 ```bash
 # Convert CSV results to JSON format for the public Merbench website
 uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py \
-  mermaid_eval_results/<timestamp>_combined_results.csv \
-  agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json
+  -i "mermaid_eval_results/<timestamp>_combined_results.csv" \
+  -o "agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json"
 ```
 
 ## Evaluation Task & Test Cases
````

agents_mcp_usage/evaluations/mermaid_evals/dashboard_config.py
Lines changed: 5 additions & 6 deletions

```diff
@@ -21,12 +21,11 @@
     # --- General Dashboard Settings ---
     "title": "🧜‍♀️ Merbench - LLM Evaluation ",
     "description": (
-        "Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
-        "\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
-        "and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
-        "and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
-        "\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
-        "\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
+        "Getting LLMs to consistently nail the Mermaid diagram syntax can be... an adventure. "
+        "\n\nMerbench evaluates an LLM's ability to autonomously write and debug Mermaid syntax. The agent can access "
+        "an MCP server that validates its code and provides error feedback, guiding it towards a correct solution."
+        "\n\nEach model is tested across three difficulty levels, with a limited number of five attempts per test case. "
+        "Performance is measured by the final success rate, averaged over complete runs, **reflecting both an understanding of Mermaid syntax and effective tool usage.**"
     ),
     "icon": "🧜‍♀️",  # Emoji for the browser tab
     # --- Primary Metric Configuration ---
```

agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py
Lines changed: 4 additions & 0 deletions

```diff
@@ -49,6 +49,10 @@
     # "gemini-2.5-pro-preview-03-25",
     # "gemini-2.0-flash",
     "gemini-2.5-flash",
+    # "bedrock:us.amazon.nova-pro-v1:0",
+    # "bedrock:us.amazon.nova-lite-v1:0",
+    # "bedrock:us.amazon.nova-micro-v1:0",
+    # "bedrock:us.amazon.nova-premier-v1:0",
     # "openai:o4-mini",
     # "openai:gpt-4.1",
     # "openai:gpt-4.1-mini",
```

agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py
Lines changed: 30 additions & 2 deletions

```diff
@@ -2,13 +2,16 @@
 import pandas as pd
 import json
 import sys
+import argparse
+from datetime import datetime
 from pathlib import Path
 
 # Add parent directory to path to import modules
 sys.path.append(str(Path(__file__).parent.parent))
 
 from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import DEFAULT_CONFIG
 from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig
+from agents_mcp_usage.utils import get_project_root
 
 def parse_metric_details(metric_details_str):
     """Safely parse JSON string from Metric_details column."""
@@ -79,10 +82,14 @@ def process_csv_for_static_site(csv_path):
     def extract_provider(model_name):
         if model_name.startswith("gemini-"):
             return "Google"
+        elif "nova" in model_name.lower():
+            return "Amazon"
         elif "claude" in model_name.lower():
             return "Anthropic"
         elif "gpt" in model_name.lower():
             return "OpenAI"
+        elif model_name.startswith("o"):
+            return "OpenAI"
         else:
             return "Other"
 
@@ -158,8 +165,29 @@ def extract_provider(model_name):
     return output_data
 
 def main():
-    csv_path = "/home/ubuntu/projects/agents-mcp-usage/mermaid_eval_results/Jun_gemini_results.csv"
-    output_path = "/home/ubuntu/projects/agents-mcp-usage/agents_mcp_usage/evaluations/mermaid_evals/results/Jun_gemini_results_processed.json"
+    parser = argparse.ArgumentParser(description="Process CSV evaluation results for static site")
+    parser.add_argument("-i", "--input_csv", nargs="?", help="Path to input CSV file", default=None)
+    parser.add_argument("-o", "--output_json", nargs="?", help="Path to output JSON file", default=None)
+
+    args = parser.parse_args()
+
+    project_root = get_project_root()
+    current_month = datetime.now().strftime("%b").lower()
+
+    # Set default paths if not provided
+    if args.input_csv:
+        csv_path = Path(args.input_csv)
+        if not csv_path.is_absolute():
+            csv_path = project_root / csv_path
+    else:
+        csv_path = project_root / "mermaid_eval_results" / "latest_combined_results.csv"
+
+    if args.output_json:
+        output_path = Path(args.output_json)
+        if not output_path.is_absolute():
+            output_path = project_root / output_path
+    else:
+        output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_results_processed.json"
 
     print(f"Processing {csv_path}...")
     data = process_csv_for_static_site(csv_path)
```
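A quick check of the new provider branches: `extract_provider` is nested inside `process_csv_for_static_site`, so it cannot be imported directly. The sketch below replicates its logic from the diff above and exercises it against model names that appear elsewhere in this PR.

```python
def extract_provider(model_name: str) -> str:
    """Standalone replica of the nested helper in preprocess_merbench_data.py."""
    if model_name.startswith("gemini-"):
        return "Google"
    elif "nova" in model_name.lower():
        return "Amazon"
    elif "claude" in model_name.lower():
        return "Anthropic"
    elif "gpt" in model_name.lower():
        return "OpenAI"
    elif model_name.startswith("o"):
        return "OpenAI"
    else:
        return "Other"

# The new "nova" branch labels the Bedrock IDs added in run_multi_evals.py:
assert extract_provider("bedrock:us.amazon.nova-pro-v1:0") == "Amazon"
# The new startswith("o") branch covers o-series names like "o4-mini"; note it is
# broad and matches anything unclassified that begins with "o", which also happens
# to cover "openai:"-prefixed strings:
assert extract_provider("o4-mini") == "OpenAI"
assert extract_provider("openai:o4-mini") == "OpenAI"
assert extract_provider("gemini-2.5-flash") == "Google"
print("provider mapping checks passed")
```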
