Skip to content

Commit a40fe34

Browse files
committed
chore: Changes to copy, models, and provider attribution
1 parent 5b83398 commit a40fe34

File tree

3 files changed

+15
-8
lines changed

3 files changed

+15
-8
lines changed

agents_mcp_usage/evaluations/mermaid_evals/dashboard_config.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,11 @@
2121
# --- General Dashboard Settings ---
2222
"title": "🧜‍♀️ Merbench - LLM Evaluation ",
2323
"description": (
24-
"Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
25-
"\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
26-
"and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
27-
"and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
28-
"\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
29-
"\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
24+
"Getting LLMs to consistently nail the Mermaid diagram syntax can be... an adventure. "
25+
"\n\nMerbench evaluates an LLM's ability to autonomously write and debug Mermaid syntax. The agent can access "
26+
"an MCP server that validates its code and provides error feedback, guiding it towards a correct solution."
27+
"\n\nEach model is tested across three difficulty levels, with a limit of five attempts per test case. "
28+
"Performance is measured by the final success rate, averaged over complete runs, **reflecting both an understanding of Mermaid syntax and effective tool usage.**"
3029
),
3130
"icon": "🧜‍♀️", # Emoji for the browser tab
3231
# --- Primary Metric Configuration ---

agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@
4949
# "gemini-2.5-pro-preview-03-25",
5050
# "gemini-2.0-flash",
5151
"gemini-2.5-flash",
52+
# "bedrock:us.amazon.nova-pro-v1:0",
53+
# "bedrock:us.amazon.nova-lite-v1:0",
54+
# "bedrock:us.amazon.nova-micro-v1:0",
55+
# "bedrock:us.amazon.nova-premier-v1:0",
5256
# "openai:o4-mini",
5357
# "openai:gpt-4.1",
5458
# "openai:gpt-4.1-mini",

agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,14 @@ def process_csv_for_static_site(csv_path):
8282
def extract_provider(model_name):
8383
if model_name.startswith("gemini-"):
8484
return "Google"
85+
elif "nova" in model_name.lower():
86+
return "Amazon"
8587
elif "claude" in model_name.lower():
8688
return "Anthropic"
8789
elif "gpt" in model_name.lower():
8890
return "OpenAI"
91+
elif model_name.startswith("o"):
92+
return "OpenAI"
8993
else:
9094
return "Other"
9195

@@ -176,14 +180,14 @@ def main():
176180
if not csv_path.is_absolute():
177181
csv_path = project_root / csv_path
178182
else:
179-
csv_path = project_root / "mermaid_eval_results" / f"{current_month}_gemini_results.csv"
183+
csv_path = project_root / "mermaid_eval_results" / f"latest_combined_results.csv"
180184

181185
if args.output_json:
182186
output_path = Path(args.output_json)
183187
if not output_path.is_absolute():
184188
output_path = project_root / output_path
185189
else:
186-
output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_gemini_results_processed.json"
190+
output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_results_processed.json"
187191

188192
print(f"Processing {csv_path}...")
189193
data = process_csv_for_static_site(csv_path)

0 commit comments

Comments
 (0)