Commit 4996326

chore: Update eval cases, prices, default models, and refactor default models
1 parent b97b853 commit 4996326

6 files changed (+48, -24 lines)

Makefile

Lines changed: 4 additions & 1 deletion
@@ -3,4 +3,7 @@ install:
 	npm install -g @mermaid-js/mermaid-cli
 
 lint:
-	uv run ruff check .
+	uv run ruff check .
+
+leaderboard:
+	uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv

Lines changed: 12 additions & 0 deletions
@@ -2,6 +2,18 @@
 # Gemini prices from https://ai.google.dev/gemini-api/docs/pricing
 # OpenAI prices from https://openai.com/api/pricing/
 MODEL_COSTS = {
+    "gemini-2.5-pro-preview-03-25": {
+        "input": [
+            {"up_to": 200000, "price": 1.25},
+            {"up_to": float('inf'), "price": 2.50},
+        ],
+        "output": {
+            "default": [
+                {"up_to": 200000, "price": 10.00},
+                {"up_to": float('inf'), "price": 15.00},
+            ]
+        },
+    },
     "gemini-2.5-pro-preview-05-06": {
         "input": [
             {"up_to": 200000, "price": 1.25},

agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py

Lines changed: 3 additions & 8 deletions
@@ -49,12 +49,7 @@
 logfire.instrument_pydantic_ai()
 
 # Default model configurations
-DEFAULT_MODEL = "gemini-2.5-pro-preview-05-06"
-DEFAULT_MODELS = [
-    "gemini-2.5-pro-preview-06-05",
-    "gemini-2.0-flash",
-    "gemini-2.5-flash-preview-04-17",
-]
+DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05"
 
 # Retry configuration
 RETRYABLE_HTTP_STATUS_CODES = {429, 500, 502, 503, 504}
@@ -765,9 +760,9 @@ async def fix_with_model(inputs: MermaidInput) -> MermaidOutput:
 if __name__ == "__main__":
     # You can use different models for the agent and the judge
     # agent_model = os.getenv("AGENT_MODEL", DEFAULT_MODEL)
-    agent_model = "gemini-2.5-pro-preview-06-05"
+    # agent_model = "gemini-2.5-pro-preview-06-05"
     # agent_model = "openai:o4-mini"
-    # agent_model = "gemini-2.5-flash-preview-04-17"
+    agent_model = "gemini-2.5-flash-preview-04-17"
     judge_model = os.getenv("JUDGE_MODEL", DEFAULT_MODEL)
 
     async def run_all():
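A small sketch of the environment-driven override that the comments above point at; AGENT_MODEL mirrors the commented-out line, JUDGE_MODEL is the variable the code already reads, and the fallbacks are the defaults from this commit.

```python
import os

DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05"

# Fall back to the new defaults when the env vars are unset.
agent_model = os.getenv("AGENT_MODEL", "gemini-2.5-flash-preview-04-17")
judge_model = os.getenv("JUDGE_MODEL", DEFAULT_MODEL)
```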

agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

Lines changed: 1 addition & 1 deletion
@@ -781,7 +781,7 @@ def main() -> None:
     cols[3].metric("Files Loaded", len(selected_files))
 
     st.info(
-        f"**Showing results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}"
+        f"**Showing averaged results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}"
     )
 
     # --- Leaderboard & Pareto ---

agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py

Lines changed: 7 additions & 2 deletions
@@ -34,7 +34,6 @@
 
 # Import shared functionality from the improved evals module
 from agents_mcp_usage.multi_mcp.eval_multi_mcp.evals_pydantic_mcp import (
-    DEFAULT_MODELS,
     MermaidInput,
     MermaidOutput,
     fix_mermaid_diagram,
@@ -44,6 +43,12 @@
 
 load_dotenv()
 
+DEFAULT_MODELS = [
+    # "gemini-2.5-pro-preview-06-05",
+    "gemini-2.0-flash",
+    "gemini-2.5-flash-preview-04-17",
+]
+
 logfire.configure(
     send_to_logfire="if-token-present", service_name="multi-model-mermaid-evals"
 )
@@ -496,7 +501,7 @@ async def main() -> None:
     parser.add_argument(
         "--judge-model",
         type=str,
-        default="gemini-2.5-pro-preview-03-25",
+        default="gemini-2.5-pro-preview-06-05",
         help="Model to use for LLM judging",
     )
     parser.add_argument(
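For illustration, one way the relocated DEFAULT_MODELS list might be fanned out across evaluation runs; run_evals_for_model is a hypothetical stand-in, not the module's actual entry point.

```python
import asyncio

DEFAULT_MODELS = [
    # "gemini-2.5-pro-preview-06-05",
    "gemini-2.0-flash",
    "gemini-2.5-flash-preview-04-17",
]


async def run_evals_for_model(model: str) -> None:
    """Hypothetical stand-in for the real per-model evaluation coroutine."""
    print(f"running mermaid evals with {model}")


async def run_all_models() -> None:
    # Evaluate every configured model concurrently.
    await asyncio.gather(*(run_evals_for_model(m) for m in DEFAULT_MODELS))


if __name__ == "__main__":
    asyncio.run(run_all_models())
```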

agents_mcp_usage/multi_mcp/mermaid_diagrams.py

Lines changed: 21 additions & 12 deletions
@@ -5,6 +5,7 @@
 
     # Agent Frameworks
     subgraph "Agent"
+        direction TD
         Agent[Agent]
         ADK["Google ADK<br>(adk_mcp.py)"]
         LG["LangGraph<br>(langgraph_mcp.py)"]
@@ -21,7 +22,7 @@
     subgraph "MCP"
         direction TD
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
         MCP --- Tools
         MCP --- Resources
@@ -38,7 +39,7 @@
     Logfire[("Logfire<br>Tracing")]
 
     ADK --> MCP
-    LG --> MCP
+    LG -- > MCP
     OAI --> MCP
     PYD --> MCP
 
@@ -47,7 +48,7 @@
     MCP --> OTHER
 
     ADK --> Logfire
-    LG --> Logfire
+    LG -- > Logfire
     OAI --> Logfire
     PYD --> Logfire
 
@@ -63,6 +64,7 @@
 ```
 """
 
+# 7 syntax errors
 invalid_mermaid_diagram_medium = """
 ```mermaid
 graph LR
@@ -87,13 +89,15 @@
     subgraph "MCP"
         direction TB
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
         MCP --- Tools
         MCP --- Resources
     end
 
+    # LLM Providers
     subgraph "LLM Providers"
+        direction TB
         OAI_LLM["OpenAI Models"]
         GEM["Google Gemini Models"]
         OTHER["Other LLM Providers..."]
@@ -102,7 +106,7 @@
     Logfire[("Logfire<br>Tracing")]
 
     ADK --> MCP
-    LG --> MCP
+    LG -- > MCP
     OAI --> MCP
     PYD --> MCP
 
@@ -111,7 +115,7 @@
     MCP --> OTHER
 
     ADK --> Logfire
-    LG --> Logfire
+    LG -- > Logfire
     OAI --> Logfire
     PYD --> Logfire
 
@@ -127,6 +131,7 @@
 ```
 """
 
+# 2 syntax errors
 invalid_mermaid_diagram_easy = """
 ```mermaid
 graph LR
@@ -148,16 +153,18 @@
     end
 
     %% MCP Server
-    subgraph "MCP Server"
+    subgraph "MCP"
         direction TB
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
-        MCPs --- Tools
-        MCPs --- Resources
+        MCP --- Tools
+        MCP --- Resources
     end
 
+    %% LLM Providers
     subgraph "LLM Providers"
+        direction TB
         OAI_LLM["OpenAI Models"]
         GEM["Google Gemini Models"]
         OTHER["Other LLM Providers..."]
@@ -171,7 +178,7 @@
     PYD --> MCP
 
     MCP --> OAI_LLM
-    MCP --> GEM
+    MCP --> GEMINI
     MCP --> OTHER
 
     ADK --> Logfire
@@ -215,13 +222,15 @@
     subgraph "MCP Server"
         direction TB
         MCP["Model Context Protocol Server<br>(mcp_servers/example_server.py)"]
-        Tools["Tools<br>- add(a, b)<br>- get_current_time() e.g. {current_time}"]
+        Tools["Tools<br>- add(a, b)<br>- get_current_time() {current_time}"]
         Resources["Resources<br>- greeting://{{name}}"]
         MCP --- Tools
         MCP --- Resources
     end
 
+    %% LLM Providers
     subgraph "LLM Providers"
+        direction TB
         OAI_LLM["OpenAI Models"]
         GEM["Google Gemini Models"]
         OTHER["Other LLM Providers..."]
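These diagrams are eval fixtures with deliberately seeded syntax errors (the broken `-- >` arrows, the undefined `GEMINI` node, `#` comments where mermaid expects `%%`). A rough sketch of checking whether a candidate fix parses, using the mermaid-cli binary the Makefile installs; the `mmdc` flags are assumed from its standard usage, and this is not the repo's actual checker.

```python
import subprocess
import tempfile
from pathlib import Path


def mermaid_renders(diagram: str) -> bool:
    """Return True if mmdc renders the diagram without error (a proxy for validity)."""
    body = diagram.strip().removeprefix("```mermaid").removesuffix("```")
    with tempfile.TemporaryDirectory() as tmp:
        src = Path(tmp) / "diagram.mmd"
        src.write_text(body)
        result = subprocess.run(
            ["mmdc", "-i", str(src), "-o", str(Path(tmp) / "out.svg")],
            capture_output=True,
        )
        return result.returncode == 0
```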
