Commit 4996326

chore: Update eval cases, prices, default models, and refactor default models
1 parent b97b853 commit 4996326

6 files changed (+48, -24 lines)

Makefile

Lines changed: 4 additions & 1 deletion
@@ -3,4 +3,7 @@ install:
 	npm install -g @mermaid-js/mermaid-cli
 
 lint:
-	uv run ruff check .
+	uv run ruff check .
+
+leaderboard:
+	uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv

Lines changed: 12 additions & 0 deletions
@@ -2,6 +2,18 @@
 # Gemini prices from https://ai.google.dev/gemini-api/docs/pricing
 # OpenAI prices from https://openai.com/api/pricing/
 MODEL_COSTS = {
+    "gemini-2.5-pro-preview-03-25": {
+        "input": [
+            {"up_to": 200000, "price": 1.25},
+            {"up_to": float('inf'), "price": 2.50},
+        ],
+        "output": {
+            "default": [
+                {"up_to": 200000, "price": 10.00},
+                {"up_to": float('inf'), "price": 15.00},
+            ]
+        },
+    },
     "gemini-2.5-pro-preview-05-06": {
         "input": [
             {"up_to": 200000, "price": 1.25},

agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py

Lines changed: 3 additions & 8 deletions
@@ -49,12 +49,7 @@
 logfire.instrument_pydantic_ai()
 
 # Default model configurations
-DEFAULT_MODEL = "gemini-2.5-pro-preview-05-06"
-DEFAULT_MODELS = [
-    "gemini-2.5-pro-preview-06-05",
-    "gemini-2.0-flash",
-    "gemini-2.5-flash-preview-04-17",
-]
+DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05"
 
 # Retry configuration
 RETRYABLE_HTTP_STATUS_CODES = {429, 500, 502, 503, 504}
@@ -765,9 +760,9 @@ async def fix_with_model(inputs: MermaidInput) -> MermaidOutput:
 if __name__ == "__main__":
     # You can use different models for the agent and the judge
     # agent_model = os.getenv("AGENT_MODEL", DEFAULT_MODEL)
-    agent_model = "gemini-2.5-pro-preview-06-05"
+    # agent_model = "gemini-2.5-pro-preview-06-05"
     # agent_model = "openai:o4-mini"
-    # agent_model = "gemini-2.5-flash-preview-04-17"
+    agent_model = "gemini-2.5-flash-preview-04-17"
     judge_model = os.getenv("JUDGE_MODEL", DEFAULT_MODEL)
 
     async def run_all():
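A small sketch of the environment-driven override that the comments above point at; AGENT_MODEL mirrors the commented-out line, JUDGE_MODEL is the variable the code already reads, and the fallbacks are the defaults from this commit.

```python
import os

DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05"

# Fall back to the new defaults when the env vars are unset.
agent_model = os.getenv("AGENT_MODEL", "gemini-2.5-flash-preview-04-17")
judge_model = os.getenv("JUDGE_MODEL", DEFAULT_MODEL)
```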

agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

Lines changed: 1 addition & 1 deletion
@@ -781,7 +781,7 @@ def main() -> None:
     cols[3].metric("Files Loaded", len(selected_files))
 
     st.info(
-        f"**Showing results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}"
+        f"**Showing averaged results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}"
     )
 
     # --- Leaderboard & Pareto ---

agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py

Lines changed: 7 additions & 2 deletions
@@ -34,7 +34,6 @@
 
 # Import shared functionality from the improved evals module
 from agents_mcp_usage.multi_mcp.eval_multi_mcp.evals_pydantic_mcp import (
-    DEFAULT_MODELS,
     MermaidInput,
     MermaidOutput,
     fix_mermaid_diagram,
@@ -44,6 +43,12 @@
 
 load_dotenv()
 
+DEFAULT_MODELS = [
+    # "gemini-2.5-pro-preview-06-05",
+    "gemini-2.0-flash",
+    "gemini-2.5-flash-preview-04-17",
+]
+
 logfire.configure(
     send_to_logfire="if-token-present", service_name="multi-model-mermaid-evals"
 )
@@ -496,7 +501,7 @@ async def main() -> None:
     parser.add_argument(
         "--judge-model",
         type=str,
-        default="gemini-2.5-pro-preview-03-25",
+        default="gemini-2.5-pro-preview-06-05",
         help="Model to use for LLM judging",
     )
     parser.add_argument(
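For illustration, one way the relocated DEFAULT_MODELS list might be fanned out across evaluation runs; run_evals_for_model is a hypothetical stand-in, not the module's actual entry point.

```python
import asyncio

DEFAULT_MODELS = [
    # "gemini-2.5-pro-preview-06-05",
    "gemini-2.0-flash",
    "gemini-2.5-flash-preview-04-17",
]


async def run_evals_for_model(model: str) -> None:
    """Hypothetical stand-in for the real per-model evaluation coroutine."""
    print(f"running mermaid evals with {model}")


async def run_all_models() -> None:
    # Evaluate every configured model concurrently.
    await asyncio.gather(*(run_evals_for_model(m) for m in DEFAULT_MODELS))


if __name__ == "__main__":
    asyncio.run(run_all_models())
```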

agents_mcp_usage/multi_mcp/mermaid_diagrams.py

Lines changed: 21 additions & 12 deletions
@@ -5,6 +5,7 @@
 
     # Agent Frameworks
     subgraph "Agent"
+        direction TD
         Agent[Agent]
         ADK["Google ADK<br>(adk_mcp.py)"]
         LG["LangGraph<br>(langgraph_mcp.py)"]
@@ -21,7 +22,7 @@
     subgraph "MCP"
         direction TD
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
         MCP --- Tools
         MCP --- Resources
@@ -38,7 +39,7 @@
     Logfire[("Logfire<br>Tracing")]
 
     ADK --> MCP
-    LG --> MCP
+    LG -- > MCP
     OAI --> MCP
     PYD --> MCP
 
@@ -47,7 +48,7 @@
     MCP --> OTHER
 
     ADK --> Logfire
-    LG --> Logfire
+    LG -- > Logfire
     OAI --> Logfire
     PYD --> Logfire
 
@@ -63,6 +64,7 @@
 ```
 """
 
+# 7 syntax errors
 invalid_mermaid_diagram_medium = """
 ```mermaid
 graph LR
@@ -87,13 +89,15 @@
     subgraph "MCP"
         direction TB
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
         MCP --- Tools
         MCP --- Resources
     end
 
+    # LLM Providers
     subgraph "LLM Providers"
+        direction TB
         OAI_LLM["OpenAI Models"]
         GEM["Google Gemini Models"]
         OTHER["Other LLM Providers..."]
@@ -102,7 +106,7 @@
     Logfire[("Logfire<br>Tracing")]
 
     ADK --> MCP
-    LG --> MCP
+    LG -- > MCP
     OAI --> MCP
     PYD --> MCP
 
@@ -111,7 +115,7 @@
     MCP --> OTHER
 
     ADK --> Logfire
-    LG --> Logfire
+    LG -- > Logfire
     OAI --> Logfire
     PYD --> Logfire
 
@@ -127,6 +131,7 @@
 ```
 """
 
+# 2 syntax errors
 invalid_mermaid_diagram_easy = """
 ```mermaid
 graph LR
@@ -148,16 +153,18 @@
     end
 
     %% MCP Server
-    subgraph "MCP Server"
+    subgraph "MCP"
         direction TB
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
-        MCPs --- Tools
-        MCPs --- Resources
+        MCP --- Tools
+        MCP --- Resources
     end
 
+    %% LLM Providers
     subgraph "LLM Providers"
+        direction TB
         OAI_LLM["OpenAI Models"]
         GEM["Google Gemini Models"]
         OTHER["Other LLM Providers..."]
@@ -171,7 +178,7 @@
     PYD --> MCP
 
     MCP --> OAI_LLM
-    MCP --> GEM
+    MCP --> GEMINI
     MCP --> OTHER
 
     ADK --> Logfire
@@ -215,13 +222,15 @@
     subgraph "MCP Server"
         direction TB
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
         MCP --- Tools
         MCP --- Resources
     end
 
+    %% LLM Providers
     subgraph "LLM Providers"
+        direction TB
         OAI_LLM["OpenAI Models"]
         GEM["Google Gemini Models"]
         OTHER["Other LLM Providers..."]
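These diagrams are eval fixtures with deliberately seeded syntax errors (the broken `-- >` arrows, the undefined `GEMINI` node, `#` comments where mermaid expects `%%`). A rough sketch of checking whether a candidate fix parses, using the mermaid-cli binary the Makefile installs; the `mmdc` flags are assumed from its standard usage, and this is not the repo's actual checker.

```python
import subprocess
import tempfile
from pathlib import Path


def mermaid_renders(diagram: str) -> bool:
    """Return True if mmdc renders the diagram without error (a proxy for validity)."""
    body = diagram.strip().removeprefix("```mermaid").removesuffix("```")
    with tempfile.TemporaryDirectory() as tmp:
        src = Path(tmp) / "diagram.mmd"
        src.write_text(body)
        result = subprocess.run(
            ["mmdc", "-i", str(src), "-o", str(Path(tmp) / "out.svg")],
            capture_output=True,
        )
        return result.returncode == 0
```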
