Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
GEMINI_API_KEY=
OPENAI_API_KEY=
LOGFIRE_TOKEN=
LOGFIRE_TOKEN=
AWS_REGION=
AWS_PROFILE=
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@ install:
npm install -g @mermaid-js/mermaid-cli

lint:
uv run ruff check .
uv run ruff check .

leaderboard:
uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,6 @@ This project aims to teach:
- `oai-agent_mcp.py` - Example of using MCP with OpenAI Agents
- `pydantic_mcp.py` - Example of using MCP with Pydantic-AI

- **eval_basic_mcp_use/** - Contains evaluation examples for single MCP usage:
- `evals_adk_mcp.py` - Evaluation of MCP with Google's ADK
- `evals_langchain_mcp.py` - Evaluation of MCP with LangGraph
- `evals_pydantic_mcp.py` - Evaluation of MCP with Pydantic-AI

- **[agents_mcp_usage/multi_mcp/](agents_mcp_usage/multi_mcp/)** - Advanced multi-MCP server integration examples
- **multi_mcp_use/** - Contains examples of using multiple MCP servers simultaneously:
Expand Down
6 changes: 3 additions & 3 deletions agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,20 @@ async def main(query: str = "Greet Andrew and give him the current time") -> Non
# Create the agent
root_agent = LlmAgent(
model="gemini-2.5-pro-preview-03-25",
name="mcp_pydantic_assistant",
name="mcp_adk_assistant",
tools=tools,
)

# Set up session
session_service = InMemorySessionService()
session = session_service.create_session(
app_name="mcp_pydantic_app",
app_name="mcp_adk_app",
user_id="aginns",
)

# Create the runner
runner = Runner(
app_name="mcp_pydantic_app",
app_name="mcp_adk_app",
agent=root_agent,
session_service=session_service,
)
Expand Down
103 changes: 0 additions & 103 deletions agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv

This file was deleted.

158 changes: 158 additions & 0 deletions agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
{
"_comment": "Prices are in USD per 1 million tokens",
"_sources": {
"gemini": "https://ai.google.dev/gemini-api/docs/pricing",
"openai": "https://openai.com/api/pricing/"
},
"model_costs": {
"gemini-2.5-pro-preview-03-25": {
"friendly_name": "Gemini 2.5 Pro Preview (Mar)",
"input": [
{"up_to": 200000, "price": 1.25},
{"up_to": "inf", "price": 2.50}
],
"output": {
"default": [
{"up_to": 200000, "price": 10.00},
{"up_to": "inf", "price": 15.00}
]
}
},
"gemini-2.5-pro-preview-05-06": {
"friendly_name": "Gemini 2.5 Pro Preview (May)",
"input": [
{"up_to": 200000, "price": 1.25},
{"up_to": "inf", "price": 2.50}
],
"output": {
"default": [
{"up_to": 200000, "price": 10.00},
{"up_to": "inf", "price": 15.00}
]
}
},
"gemini-2.5-pro-preview-06-05": {
"friendly_name": "Gemini 2.5 Pro Preview (Jun)",
"input": [
{"up_to": 200000, "price": 1.25},
{"up_to": "inf", "price": 2.50}
],
"output": {
"default": [
{"up_to": 200000, "price": 10.00},
{"up_to": "inf", "price": 15.00}
]
}
},
"gemini-2.5-pro-preview": {
"friendly_name": "Gemini 2.5 Pro Preview",
"input": [
{"up_to": 200000, "price": 1.25},
{"up_to": "inf", "price": 2.50}
],
"output": {
"default": [
{"up_to": 200000, "price": 10.00},
{"up_to": "inf", "price": 15.00}
]
}
},
"gemini-1.5-pro": {
"friendly_name": "Gemini 1.5 Pro",
"input": [
{"up_to": 128000, "price": 1.25},
{"up_to": "inf", "price": 2.50}
],
"output": {
"default": [
{"up_to": 128000, "price": 5.00},
{"up_to": "inf", "price": 10.00}
]
}
},
"gemini-1.5-flash": {
"friendly_name": "Gemini 1.5 Flash",
"input": [
{"up_to": 128000, "price": 0.075},
{"up_to": "inf", "price": 0.15}
],
"output": {
"default": [
{"up_to": 128000, "price": 0.30},
{"up_to": "inf", "price": 0.60}
]
}
},
"gemini-2.0-flash": {
"friendly_name": "Gemini 2.0 Flash",
"input": [{"up_to": "inf", "price": 0.10}],
"output": {"default": [{"up_to": "inf", "price": 0.40}]}
},
"gemini-2.5-flash-preview-04-17": {
"friendly_name": "Gemini 2.5 Flash Preview (Apr)",
"input": [{"up_to": "inf", "price": 0.15}],
"output": {
"non_thinking": [{"up_to": "inf", "price": 0.60}],
"thinking": [{"up_to": "inf", "price": 3.50}]
}
},
"gemini-2.5-flash-preview": {
"friendly_name": "Gemini 2.5 Flash Preview",
"input": [{"up_to": "inf", "price": 0.15}],
"output": {
"non_thinking": [{"up_to": "inf", "price": 0.60}],
"thinking": [{"up_to": "inf", "price": 3.50}]
}
},
"openai:o4-mini": {
"friendly_name": "OpenAI o4-mini",
"input": [{"up_to": "inf", "price": 1.10}],
"output": {"default": [{"up_to": "inf", "price": 4.40}]}
},
"openai:o3": {
"friendly_name": "OpenAI o3",
"input": [{"up_to": "inf", "price": 10.00}],
"output": {"default": [{"up_to": "inf", "price": 40.00}]}
},
"openai:gpt-4.1": {
"friendly_name": "GPT-4.1",
"input": [{"up_to": "inf", "price": 2.00}],
"output": {"default": [{"up_to": "inf", "price": 8.00}]}
},
"openai:gpt-4.1-mini": {
"friendly_name": "GPT-4.1 Mini",
"input": [{"up_to": "inf", "price": 0.40}],
"output": {"default": [{"up_to": "inf", "price": 1.60}]}
},
"openai:gpt-4.1-nano": {
"friendly_name": "GPT-4.1 Nano",
"input": [{"up_to": "inf", "price": 0.10}],
"output": {"default": [{"up_to": "inf", "price": 0.40}]}
},
"bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0": {
"friendly_name": "Claude 4 Sonnet",
"input": [{"up_to": "inf", "price": 3.00}],
"output": {"default": [{"up_to": "inf", "price": 15.00}]}
},
"bedrock:us.anthropic.claude-opus-4-20250514-v1:0": {
"friendly_name": "Claude 4 Opus",
"input": [{"up_to": "inf", "price": 15.00}],
"output": {"default": [{"up_to": "inf", "price": 75.00}]}
},
"bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0": {
"friendly_name": "Claude 3.7 Sonnet",
"input": [{"up_to": "inf", "price": 3.00}],
"output": {"default": [{"up_to": "inf", "price": 15.00}]}
},
"bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"friendly_name": "Claude 3.5 Sonnet",
"input": [{"up_to": "inf", "price": 3.00}],
"output": {"default": [{"up_to": "inf", "price": 15.00}]}
},
"bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0": {
"friendly_name": "Claude 3.5 Haiku",
"input": [{"up_to": "inf", "price": 1.00}],
"output": {"default": [{"up_to": "inf", "price": 4.00}]}
}
}
}
13 changes: 11 additions & 2 deletions agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,16 @@

MERBENCH_CONFIG = {
# --- General Dashboard Settings ---
"title": "Merbench - LLM Evaluation Benchmark",
"icon": "🏆", # Emoji for the browser tab
"title": "🧜‍♀️ Merbench - LLM Evaluation ",
"description": (
"Getting LLMs to consistently nail the Mermaid diagram syntax can be... an adventure. "
"\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
"and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
"and the LLM is given a fixed number of attempts to fix the diagram; if that limit is exceeded, the test case is considered failed. "
"\n\n **Performance is a measure of both tool usage and Mermaid syntax understanding.**"
"\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
),
"icon": "🧜‍♀️", # Emoji for the browser tab
# --- Primary Metric Configuration ---
# The primary metric is the main score used for the leaderboard and
# the y-axis of the Pareto frontier plot.
Expand Down Expand Up @@ -50,6 +58,7 @@
"x_axis_options": {
"cost": {"column": "total_cost", "label": "Cost"},
"tokens": {"column": "total_response_tokens", "label": "Tokens"},
"duration": {"column": "Duration", "label": "Duration"},
},
"color_axis": "Duration", # Column to use for the color scale
},
Expand Down
Loading