Commit 6891bff

Merge pull request #13 from andrewginns/refactor-evals

Tweak scripts that run Merbench

2 parents: 4183572 + 308a41e

5 files changed: +41 additions, -116 deletions

agents_mcp_usage/evaluations/mermaid_evals/README.md
Lines changed: 2 additions & 2 deletions

````diff
@@ -102,8 +102,8 @@ The local dashboard (`merbench_ui.py`) provides:
 ```bash
 # Convert CSV results to JSON format for the public Merbench website
 uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py \
-  mermaid_eval_results/<timestamp>_combined_results.csv \
-  agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json
+  -i "mermaid_eval_results/<timestamp>_combined_results.csv" \
+  -o "agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json"
 ```
 
 ## Evaluation Task & Test Cases
````

agents_mcp_usage/evaluations/mermaid_evals/dashboard_config.py
Lines changed: 5 additions & 6 deletions

```diff
@@ -21,12 +21,11 @@
     # --- General Dashboard Settings ---
     "title": "🧜‍♀️ Merbench - LLM Evaluation ",
     "description": (
-        "Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
-        "\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
-        "and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
-        "and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
-        "\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
-        "\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
+        "Getting LLMs to consistently nail the Mermaid diagram syntax can be... an adventure. "
+        "\n\nMerbench evaluates an LLM's ability to autonomously write and debug Mermaid syntax. The agent can access "
+        "an MCP server that validates its code and provides error feedback, guiding it towards a correct solution."
+        "\n\nEach model is tested across three difficulty levels, with a limited number of five attempts per test case. "
+        "Performance is measured by the final success rate, averaged over complete runs, **reflecting both an understanding of Mermaid syntax and effective tool usage.**"
     ),
     "icon": "🧜‍♀️",  # Emoji for the browser tab
     # --- Primary Metric Configuration ---
```

agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py
Lines changed: 4 additions & 0 deletions

```diff
@@ -49,6 +49,10 @@
     # "gemini-2.5-pro-preview-03-25",
     # "gemini-2.0-flash",
     "gemini-2.5-flash",
+    # "bedrock:us.amazon.nova-pro-v1:0",
+    # "bedrock:us.amazon.nova-lite-v1:0",
+    # "bedrock:us.amazon.nova-micro-v1:0",
+    # "bedrock:us.amazon.nova-premier-v1:0",
     # "openai:o4-mini",
     # "openai:gpt-4.1",
     # "openai:gpt-4.1-mini",
```

agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py
Lines changed: 30 additions & 2 deletions

```diff
@@ -2,13 +2,16 @@
 import pandas as pd
 import json
 import sys
+import argparse
+from datetime import datetime
 from pathlib import Path
 
 # Add parent directory to path to import modules
 sys.path.append(str(Path(__file__).parent.parent))
 
 from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import DEFAULT_CONFIG
 from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig
+from agents_mcp_usage.utils import get_project_root
 
 def parse_metric_details(metric_details_str):
     """Safely parse JSON string from Metric_details column."""
@@ -79,10 +82,14 @@ def process_csv_for_static_site(csv_path):
     def extract_provider(model_name):
         if model_name.startswith("gemini-"):
             return "Google"
+        elif "nova" in model_name.lower():
+            return "Amazon"
         elif "claude" in model_name.lower():
             return "Anthropic"
         elif "gpt" in model_name.lower():
             return "OpenAI"
+        elif model_name.startswith("o"):
+            return "OpenAI"
         else:
             return "Other"
 
@@ -158,8 +165,29 @@ def extract_provider(model_name):
     return output_data
 
 def main():
-    csv_path = "/home/ubuntu/projects/agents-mcp-usage/mermaid_eval_results/Jun_gemini_results.csv"
-    output_path = "/home/ubuntu/projects/agents-mcp-usage/agents_mcp_usage/evaluations/mermaid_evals/results/Jun_gemini_results_processed.json"
+    parser = argparse.ArgumentParser(description="Process CSV evaluation results for static site")
+    parser.add_argument("-i", "--input_csv", nargs="?", help="Path to input CSV file", default=None)
+    parser.add_argument("-o", "--output_json", nargs="?", help="Path to output JSON file", default=None)
+
+    args = parser.parse_args()
+
+    project_root = get_project_root()
+    current_month = datetime.now().strftime("%b").lower()
+
+    # Set default paths if not provided
+    if args.input_csv:
+        csv_path = Path(args.input_csv)
+        if not csv_path.is_absolute():
+            csv_path = project_root / csv_path
+    else:
+        csv_path = project_root / "mermaid_eval_results" / "latest_combined_results.csv"
+
+    if args.output_json:
+        output_path = Path(args.output_json)
+        if not output_path.is_absolute():
+            output_path = project_root / output_path
+    else:
+        output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_results_processed.json"
 
     print(f"Processing {csv_path}...")
     data = process_csv_for_static_site(csv_path)
```
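A quick check of the new provider branches: `extract_provider` is nested inside `process_csv_for_static_site`, so it cannot be imported directly. The sketch below replicates its logic from the diff above and exercises it against model names that appear elsewhere in this PR.

```python
def extract_provider(model_name: str) -> str:
    """Standalone replica of the nested helper in preprocess_merbench_data.py."""
    if model_name.startswith("gemini-"):
        return "Google"
    elif "nova" in model_name.lower():
        return "Amazon"
    elif "claude" in model_name.lower():
        return "Anthropic"
    elif "gpt" in model_name.lower():
        return "OpenAI"
    elif model_name.startswith("o"):
        return "OpenAI"
    else:
        return "Other"

# The new "nova" branch labels the Bedrock IDs added in run_multi_evals.py:
assert extract_provider("bedrock:us.amazon.nova-pro-v1:0") == "Amazon"
# The new startswith("o") branch covers o-series names like "o4-mini"; note it is
# broad and matches anything unclassified that begins with "o", which also happens
# to cover "openai:"-prefixed strings:
assert extract_provider("o4-mini") == "OpenAI"
assert extract_provider("openai:o4-mini") == "OpenAI"
assert extract_provider("gemini-2.5-flash") == "Google"
print("provider mapping checks passed")
```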
