Skip to content

Commit a40fe34

Browse files
committed
chore: Changes to copy, models, and provider attribution
1 parent 5b83398 commit a40fe34

File tree

3 files changed

+15
-8
lines changed

3 files changed

+15
-8
lines changed

agents_mcp_usage/evaluations/mermaid_evals/dashboard_config.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,11 @@
2121
# --- General Dashboard Settings ---
2222
"title": "🧜‍♀️ Merbench - LLM Evaluation ",
2323
"description": (
24-
"Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
25-
"\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
26-
"and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
27-
"and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
28-
"\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
29-
"\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
24+
"Getting LLMs to consistently nail the Mermaid diagram syntax can be... an adventure. "
25+
"\n\nMerbench evaluates an LLM's ability to autonomously write and debug Mermaid syntax. The agent can access "
26+
"an MCP server that validates its code and provides error feedback, guiding it towards a correct solution."
27+
"\n\nEach model is tested across three difficulty levels, with a limit of five attempts per test case. "
28+
"Performance is measured by the final success rate, averaged over complete runs, **reflecting both an understanding of Mermaid syntax and effective tool usage.**"
3029
),
3130
"icon": "🧜‍♀️", # Emoji for the browser tab
3231
# --- Primary Metric Configuration ---

agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@
4949
# "gemini-2.5-pro-preview-03-25",
5050
# "gemini-2.0-flash",
5151
"gemini-2.5-flash",
52+
# "bedrock:us.amazon.nova-pro-v1:0",
53+
# "bedrock:us.amazon.nova-lite-v1:0",
54+
# "bedrock:us.amazon.nova-micro-v1:0",
55+
# "bedrock:us.amazon.nova-premier-v1:0",
5256
# "openai:o4-mini",
5357
# "openai:gpt-4.1",
5458
# "openai:gpt-4.1-mini",

agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,14 @@ def process_csv_for_static_site(csv_path):
8282
def extract_provider(model_name):
8383
if model_name.startswith("gemini-"):
8484
return "Google"
85+
elif "nova" in model_name.lower():
86+
return "Amazon"
8587
elif "claude" in model_name.lower():
8688
return "Anthropic"
8789
elif "gpt" in model_name.lower():
8890
return "OpenAI"
91+
elif model_name.startswith("o"):
92+
return "OpenAI"
8993
else:
9094
return "Other"
9195

@@ -176,14 +180,14 @@ def main():
176180
if not csv_path.is_absolute():
177181
csv_path = project_root / csv_path
178182
else:
179-
csv_path = project_root / "mermaid_eval_results" / f"{current_month}_gemini_results.csv"
183+
csv_path = project_root / "mermaid_eval_results" / f"latest_combined_results.csv"
180184

181185
if args.output_json:
182186
output_path = Path(args.output_json)
183187
if not output_path.is_absolute():
184188
output_path = project_root / output_path
185189
else:
186-
output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_gemini_results_processed.json"
190+
output_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "results" / f"{current_month}_results_processed.json"
187191

188192
print(f"Processing {csv_path}...")
189193
data = process_csv_for_static_site(csv_path)

0 commit comments

Comments
 (0)