Skip to content

Commit fc556f5

Browse files
authored
Merge pull request #6 from andrewginns/initial-merbench-release
Initial merbench release
2 parents 77725bb + 3694b37 commit fc556f5

File tree

14 files changed

+508
-207
lines changed

14 files changed

+508
-207
lines changed

.env.example

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
GEMINI_API_KEY=
22
OPENAI_API_KEY=
3-
LOGFIRE_TOKEN=
3+
LOGFIRE_TOKEN=
4+
AWS_REGION=
5+
AWS_PROFILE=

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,7 @@ install:
33
npm install -g @mermaid-js/mermaid-cli
44

55
lint:
6-
uv run ruff check .
6+
uv run ruff check .
7+
8+
leaderboard:
9+
uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

README.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,6 @@ This project aims to teach:
6060
- `oai-agent_mcp.py` - Example of using MCP with OpenAI Agents
6161
- `pydantic_mcp.py` - Example of using MCP with Pydantic-AI
6262

63-
- **eval_basic_mcp_use/** - Contains evaluation examples for single MCP usage:
64-
- `evals_adk_mcp.py` - Evaluation of MCP with Google's ADK
65-
- `evals_langchain_mcp.py` - Evaluation of MCP with LangGraph
66-
- `evals_pydantic_mcp.py` - Evaluation of MCP with Pydantic-AI
6763

6864
- **[agents_mcp_usage/multi_mcp/](agents_mcp_usage/multi_mcp/)** - Advanced multi-MCP server integration examples
6965
- **multi_mcp_use/** - Contains examples of using multiple MCP servers simultaneously:

agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,20 @@ async def main(query: str = "Greet Andrew and give him the current time") -> Non
4141
# Create the agent
4242
root_agent = LlmAgent(
4343
model="gemini-2.5-pro-preview-03-25",
44-
name="mcp_pydantic_assistant",
44+
name="mcp_adk_assistant",
4545
tools=tools,
4646
)
4747

4848
# Set up session
4949
session_service = InMemorySessionService()
5050
session = session_service.create_session(
51-
app_name="mcp_pydantic_app",
51+
app_name="mcp_adk_app",
5252
user_id="aginns",
5353
)
5454

5555
# Create the runner
5656
runner = Runner(
57-
app_name="mcp_pydantic_app",
57+
app_name="mcp_adk_app",
5858
agent=root_agent,
5959
session_service=session_service,
6060
)

agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv

Lines changed: 0 additions & 103 deletions
This file was deleted.
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
{
2+
"_comment": "Prices are per 1 million tokens",
3+
"_sources": {
4+
"gemini": "https://ai.google.dev/gemini-api/docs/pricing",
5+
"openai": "https://openai.com/api/pricing/"
6+
},
7+
"model_costs": {
8+
"gemini-2.5-pro-preview-03-25": {
9+
"friendly_name": "Gemini 2.5 Pro Preview (Mar)",
10+
"input": [
11+
{"up_to": 200000, "price": 1.25},
12+
{"up_to": "inf", "price": 2.50}
13+
],
14+
"output": {
15+
"default": [
16+
{"up_to": 200000, "price": 10.00},
17+
{"up_to": "inf", "price": 15.00}
18+
]
19+
}
20+
},
21+
"gemini-2.5-pro-preview-05-06": {
22+
"friendly_name": "Gemini 2.5 Pro Preview (May)",
23+
"input": [
24+
{"up_to": 200000, "price": 1.25},
25+
{"up_to": "inf", "price": 2.50}
26+
],
27+
"output": {
28+
"default": [
29+
{"up_to": 200000, "price": 10.00},
30+
{"up_to": "inf", "price": 15.00}
31+
]
32+
}
33+
},
34+
"gemini-2.5-pro-preview-06-05": {
35+
"friendly_name": "Gemini 2.5 Pro Preview (Jun)",
36+
"input": [
37+
{"up_to": 200000, "price": 1.25},
38+
{"up_to": "inf", "price": 2.50}
39+
],
40+
"output": {
41+
"default": [
42+
{"up_to": 200000, "price": 10.00},
43+
{"up_to": "inf", "price": 15.00}
44+
]
45+
}
46+
},
47+
"gemini-2.5-pro-preview": {
48+
"friendly_name": "Gemini 2.5 Pro Preview",
49+
"input": [
50+
{"up_to": 200000, "price": 1.25},
51+
{"up_to": "inf", "price": 2.50}
52+
],
53+
"output": {
54+
"default": [
55+
{"up_to": 200000, "price": 10.00},
56+
{"up_to": "inf", "price": 15.00}
57+
]
58+
}
59+
},
60+
"gemini-1.5-pro": {
61+
"friendly_name": "Gemini 1.5 Pro",
62+
"input": [
63+
{"up_to": 128000, "price": 1.25},
64+
{"up_to": "inf", "price": 2.50}
65+
],
66+
"output": {
67+
"default": [
68+
{"up_to": 128000, "price": 5.00},
69+
{"up_to": "inf", "price": 10.00}
70+
]
71+
}
72+
},
73+
"gemini-1.5-flash": {
74+
"friendly_name": "Gemini 1.5 Flash",
75+
"input": [
76+
{"up_to": 128000, "price": 0.075},
77+
{"up_to": "inf", "price": 0.15}
78+
],
79+
"output": {
80+
"default": [
81+
{"up_to": 128000, "price": 0.30},
82+
{"up_to": "inf", "price": 0.60}
83+
]
84+
}
85+
},
86+
"gemini-2.0-flash": {
87+
"friendly_name": "Gemini 2.0 Flash",
88+
"input": [{"up_to": "inf", "price": 0.10}],
89+
"output": {"default": [{"up_to": "inf", "price": 0.40}]}
90+
},
91+
"gemini-2.5-flash-preview-04-17": {
92+
"friendly_name": "Gemini 2.5 Flash Preview (Apr)",
93+
"input": [{"up_to": "inf", "price": 0.15}],
94+
"output": {
95+
"non_thinking": [{"up_to": "inf", "price": 0.60}],
96+
"thinking": [{"up_to": "inf", "price": 3.50}]
97+
}
98+
},
99+
"gemini-2.5-flash-preview": {
100+
"friendly_name": "Gemini 2.5 Flash Preview",
101+
"input": [{"up_to": "inf", "price": 0.15}],
102+
"output": {
103+
"non_thinking": [{"up_to": "inf", "price": 0.60}],
104+
"thinking": [{"up_to": "inf", "price": 3.50}]
105+
}
106+
},
107+
"openai:o4-mini": {
108+
"friendly_name": "OpenAI o4-mini",
109+
"input": [{"up_to": "inf", "price": 1.10}],
110+
"output": {"default": [{"up_to": "inf", "price": 4.40}]}
111+
},
112+
"openai:o3": {
113+
"friendly_name": "OpenAI o3",
114+
"input": [{"up_to": "inf", "price": 10.00}],
115+
"output": {"default": [{"up_to": "inf", "price": 40.00}]}
116+
},
117+
"openai:gpt-4.1": {
118+
"friendly_name": "GPT-4.1",
119+
"input": [{"up_to": "inf", "price": 2.00}],
120+
"output": {"default": [{"up_to": "inf", "price": 8.00}]}
121+
},
122+
"openai:gpt-4.1-mini": {
123+
"friendly_name": "GPT-4.1 Mini",
124+
"input": [{"up_to": "inf", "price": 0.40}],
125+
"output": {"default": [{"up_to": "inf", "price": 1.60}]}
126+
},
127+
"openai:gpt-4.1-nano": {
128+
"friendly_name": "GPT-4.1 Nano",
129+
"input": [{"up_to": "inf", "price": 0.10}],
130+
"output": {"default": [{"up_to": "inf", "price": 0.40}]}
131+
},
132+
"bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0": {
133+
"friendly_name": "Claude 4 Sonnet",
134+
"input": [{"up_to": "inf", "price": 3.00}],
135+
"output": {"default": [{"up_to": "inf", "price": 15.00}]}
136+
},
137+
"bedrock:us.anthropic.claude-opus-4-20250514-v1:0": {
138+
"friendly_name": "Claude 4 Opus",
139+
"input": [{"up_to": "inf", "price": 15.00}],
140+
"output": {"default": [{"up_to": "inf", "price": 75.00}]}
141+
},
142+
"bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0": {
143+
"friendly_name": "Claude 3.7 Sonnet",
144+
"input": [{"up_to": "inf", "price": 3.00}],
145+
"output": {"default": [{"up_to": "inf", "price": 15.00}]}
146+
},
147+
"bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
148+
"friendly_name": "Claude 3.5 Sonnet",
149+
"input": [{"up_to": "inf", "price": 3.00}],
150+
"output": {"default": [{"up_to": "inf", "price": 15.00}]}
151+
},
152+
"bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0": {
153+
"friendly_name": "Claude 3.5 Haiku",
154+
"input": [{"up_to": "inf", "price": 1.00}],
155+
"output": {"default": [{"up_to": "inf", "price": 4.00}]}
156+
}
157+
}
158+
}

agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,16 @@
1919

2020
MERBENCH_CONFIG = {
2121
# --- General Dashboard Settings ---
22-
"title": "Merbench - LLM Evaluation Benchmark",
23-
"icon": "🏆", # Emoji for the browser tab
22+
"title": "🧜‍♀️ Merbench - LLM Evaluation ",
23+
"description": (
24+
"Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
25+
"\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
26+
"and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
27+
"and the LLM is given a fixed number of attempts to fix the diagram; if this is exceeded, the test case is considered failed. "
28+
"\n\n **Performance is a measure of both tool usage and Mermaid syntax understanding.**"
29+
"\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
30+
),
31+
"icon": "🧜‍♀️", # Emoji for the browser tab
2432
# --- Primary Metric Configuration ---
2533
# The primary metric is the main score used for the leaderboard and
2634
# the y-axis of the Pareto frontier plot.
@@ -50,6 +58,7 @@
5058
"x_axis_options": {
5159
"cost": {"column": "total_cost", "label": "Cost"},
5260
"tokens": {"column": "total_response_tokens", "label": "Tokens"},
61+
"duration": {"column": "Duration", "label": "Duration"},
5362
},
5463
"color_axis": "Duration", # Column to use for the color scale
5564
},

0 commit comments

Comments
 (0)