4 changes: 2 additions & 2 deletions agents_mcp_usage/evaluations/mermaid_evals/README.md
@@ -102,8 +102,8 @@ The local dashboard (`merbench_ui.py`) provides:
```bash
# Convert CSV results to JSON format for the public Merbench website
uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py \
mermaid_eval_results/<timestamp>_combined_results.csv \
agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json
-i "mermaid_eval_results/<timestamp>_combined_results.csv" \
-o "agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json"
```

## Evaluation Task & Test Cases
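The same conversion can also be driven from Python rather than the CLI. A minimal sketch, assuming the script is importable as a module and that writing the result with `json.dumps` mirrors the write step the script performs internally (that step is not visible in this hunk); the paths are illustrative:

```python
import json
from pathlib import Path

from agents_mcp_usage.evaluations.mermaid_evals.scripts.preprocess_merbench_data import (
    process_csv_for_static_site,
)

# Illustrative paths; substitute your own timestamped results file.
csv_path = Path("mermaid_eval_results/latest_combined_results.csv")
output_path = Path("agents_mcp_usage/evaluations/mermaid_evals/results/latest_processed.json")

data = process_csv_for_static_site(csv_path)        # build the dashboard payload
output_path.write_text(json.dumps(data, indent=2))  # persist it for the static site
```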
11 changes: 5 additions & 6 deletions agents_mcp_usage/evaluations/mermaid_evals/dashboard_config.py
@@ -21,12 +21,11 @@
# --- General Dashboard Settings ---
"title": "🧜‍♀️ Merbench - LLM Evaluation ",
"description": (
"Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
"\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
"and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
"and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
"\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
"\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
"Getting LLMs to consistently nail the Mermaid diagram syntax can be... an adventure. "
"\n\nMerbench evaluates an LLM's ability to autonomously write and debug Mermaid syntax. The agent can access "
"an MCP server that validates its code and provides error feedback, guiding it towards a correct solution."
"\n\nEach model is tested across three difficulty levels, with a limited number of five attempts per test case. "
"Performance is measured by the final success rate, averaged over complete runs, **reflecting both an understanding of Mermaid syntax and effective tool usage.**"
),
"icon": "🧜‍♀️", # Emoji for the browser tab
# --- Primary Metric Configuration ---
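The metric the new description refers to (final success rate averaged over complete runs) is easy to illustrate with a small pandas aggregation. A minimal sketch with assumed column names (`Model`, `Difficulty`, `Success`); the real results CSV may use different headers:

```python
import pandas as pd

# Toy results; in practice these rows come from the combined results CSV.
df = pd.DataFrame(
    {
        "Model": ["gemini-2.5-flash"] * 4 + ["openai:o4-mini"] * 4,
        "Difficulty": ["easy", "medium", "hard", "hard"] * 2,
        "Success": [1, 1, 0, 1, 1, 0, 0, 1],
    }
)

# Final success rate per model, averaged across all selected runs and difficulty levels.
leaderboard = (
    df.groupby("Model")["Success"]
    .mean()
    .rename("success_rate")
    .sort_values(ascending=False)
)
print(leaderboard)
```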
4 changes: 4 additions & 0 deletions agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py
@@ -49,6 +49,10 @@
# "gemini-2.5-pro-preview-03-25",
# "gemini-2.0-flash",
"gemini-2.5-flash",
# "bedrock:us.amazon.nova-pro-v1:0",
# "bedrock:us.amazon.nova-lite-v1:0",
# "bedrock:us.amazon.nova-micro-v1:0",
# "bedrock:us.amazon.nova-premier-v1:0",
# "openai:o4-mini",
# "openai:gpt-4.1",
# "openai:gpt-4.1-mini",
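The commented-out Bedrock entries follow the same `provider:model-id` convention as the OpenAI entries, while bare Gemini IDs carry no prefix. A hypothetical helper (not part of `run_multi_evals.py`) showing how a candidate list in this format could be grouped by provider before a run:

```python
from collections import defaultdict

CANDIDATE_MODELS = [
    "gemini-2.5-flash",
    "bedrock:us.amazon.nova-pro-v1:0",
    "bedrock:us.amazon.nova-lite-v1:0",
    "openai:o4-mini",
    "openai:gpt-4.1",
]

def by_provider(models: list[str]) -> dict[str, list[str]]:
    """Group model IDs by their provider prefix; unprefixed IDs are treated as Gemini."""
    grouped: dict[str, list[str]] = defaultdict(list)
    for model in models:
        provider = model.split(":", 1)[0] if ":" in model else "gemini"
        grouped[provider].append(model)
    return dict(grouped)

print(by_provider(CANDIDATE_MODELS))
# {'gemini': ['gemini-2.5-flash'], 'bedrock': [...], 'openai': [...]}
```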
agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py
@@ -2,13 +2,16 @@
import pandas as pd
import json
import sys
import argparse
from datetime import datetime
from pathlib import Path

# Add parent directory to path to import modules
sys.path.append(str(Path(__file__).parent.parent))

from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import DEFAULT_CONFIG
from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig
from agents_mcp_usage.utils import get_project_root

def parse_metric_details(metric_details_str):
"""Safely parse JSON string from Metric_details column."""
@@ -79,10 +82,14 @@ def process_csv_for_static_site(csv_path):
def extract_provider(model_name):
if model_name.startswith("gemini-"):
return "Google"
elif "nova" in model_name.lower():
return "Amazon"
elif "claude" in model_name.lower():
return "Anthropic"
elif "gpt" in model_name.lower():
return "OpenAI"
elif model_name.startswith("o"):
return "OpenAI"
else:
return "Other"

@@ -158,8 +165,29 @@ def extract_provider(model_name):
return output_data

def main():
csv_path = "/home/ubuntu/projects/agents-mcp-usage/mermaid_eval_results/Jun_gemini_results.csv"
output_path = "/home/ubuntu/projects/agents-mcp-usage/agents_mcp_usage/evaluations/mermaid_evals/results/Jun_gemini_results_processed.json"
parser = argparse.ArgumentParser(description="Process CSV evaluation results for static site")
parser.add_argument("-i", "--input_csv", nargs="?", help="Path to input CSV file", default=None)
parser.add_argument("-o", "--output_json", nargs="?", help="Path to output JSON file", default=None)

args = parser.parse_args()

project_root = get_project_root()
current_month = datetime.now().strftime("%b").lower()

# Set default paths if not provided
if args.input_csv:
csv_path = Path(args.input_csv)
if not csv_path.is_absolute():
csv_path = project_root / csv_path
else:
csv_path = project_root / "mermaid_eval_results" / "latest_combined_results.csv"

if args.output_json:
output_path = Path(args.output_json)
if not output_path.is_absolute():
output_path = project_root / output_path
else:
output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_results_processed.json"

print(f"Processing {csv_path}...")
data = process_csv_for_static_site(csv_path)
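The default-path handling added to `main()` reduces to one pattern: resolve any user-supplied path against the project root, otherwise fall back to a conventional location. A condensed sketch of that pattern using a hypothetical `resolve_path` helper; `get_project_root` is the repo utility imported earlier in this diff and is assumed to return the repository root as a `Path`:

```python
from datetime import datetime
from pathlib import Path

from agents_mcp_usage.utils import get_project_root

def resolve_path(user_value: str | None, default_relative: Path) -> Path:
    """Resolve a CLI-supplied path against the project root, or fall back to a default."""
    root = get_project_root()
    if user_value:
        path = Path(user_value)
        return path if path.is_absolute() else root / path
    return root / default_relative

month = datetime.now().strftime("%b").lower()  # e.g. "jun"
csv_path = resolve_path(None, Path("mermaid_eval_results/latest_combined_results.csv"))
output_path = resolve_path(
    None,
    Path("agents_mcp_usage/evaluations/mermaid_evals/results") / f"{month}_results_processed.json",
)
print(csv_path, output_path, sep="\n")
```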