andrewginns · andrewginns · Jun 16, 2025 · Jun 11, 2025 · Jun 11, 2025 · Jun 11, 2025
diff --git a/.env.example b/.env.example
@@ -1,3 +1,5 @@
 GEMINI_API_KEY=
 OPENAI_API_KEY=
-LOGFIRE_TOKEN=
+LOGFIRE_TOKEN=
+AWS_REGION=
+AWS_PROFILE=
diff --git a/Makefile b/Makefile
@@ -3,4 +3,7 @@ install:
 	npm install -g @mermaid-js/mermaid-cli
 
 lint:
-	uv run ruff check .
+	uv run ruff check .
+
+leaderboard:
+	uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
diff --git a/README.md b/README.md
@@ -60,10 +60,6 @@ This project aims to teach:
     - `oai-agent_mcp.py` - Example of using MCP with OpenAI Agents
     - `pydantic_mcp.py` - Example of using MCP with Pydantic-AI
 
-  - **eval_basic_mcp_use/** - Contains evaluation examples for single MCP usage:
-    - `evals_adk_mcp.py` - Evaluation of MCP with Google's ADK
-    - `evals_langchain_mcp.py` - Evaluation of MCP with LangGraph
-    - `evals_pydantic_mcp.py` - Evaluation of MCP with Pydantic-AI
 
 - **[agents_mcp_usage/multi_mcp/](agents_mcp_usage/multi_mcp/)** - Advanced multi-MCP server integration examples
   - **multi_mcp_use/** - Contains examples of using multiple MCP servers simultaneously:

diff --git a/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py b/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py
@@ -41,20 +41,20 @@ async def main(query: str = "Greet Andrew and give him the current time") -> Non
     # Create the agent
     root_agent = LlmAgent(
         model="gemini-2.5-pro-preview-03-25",
-        name="mcp_pydantic_assistant",
+        name="mcp_adk_assistant",
         tools=tools,
     )
 
     # Set up session
     session_service = InMemorySessionService()
     session = session_service.create_session(
-        app_name="mcp_pydantic_app",
+        app_name="mcp_adk_app",
         user_id="aginns",
     )
 
     # Create the runner
     runner = Runner(
-        app_name="mcp_pydantic_app",
+        app_name="mcp_adk_app",
         agent=root_agent,
         session_service=session_service,
     )

diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json
@@ -0,0 +1,158 @@
+{
+  "_comment": "Prices are in USD per 1 million tokens",
+  "_sources": {
+    "gemini": "https://ai.google.dev/gemini-api/docs/pricing",
+    "openai": "https://openai.com/api/pricing/"
+  },
+  "model_costs": {
+    "gemini-2.5-pro-preview-03-25": {
+      "friendly_name": "Gemini 2.5 Pro Preview (Mar)",
+      "input": [
+        {"up_to": 200000, "price": 1.25},
+        {"up_to": "inf", "price": 2.50}
+      ],
+      "output": {
+        "default": [
+          {"up_to": 200000, "price": 10.00},
+          {"up_to": "inf", "price": 15.00}
+        ]
+      }
+    },
+    "gemini-2.5-pro-preview-05-06": {
+      "friendly_name": "Gemini 2.5 Pro Preview (May)",
+      "input": [
+        {"up_to": 200000, "price": 1.25},
+        {"up_to": "inf", "price": 2.50}
+      ],
+      "output": {
+        "default": [
+          {"up_to": 200000, "price": 10.00},
+          {"up_to": "inf", "price": 15.00}
+        ]
+      }
+    },
+    "gemini-2.5-pro-preview-06-05": {
+      "friendly_name": "Gemini 2.5 Pro Preview (Jun)",
+      "input": [
+        {"up_to": 200000, "price": 1.25},
+        {"up_to": "inf", "price": 2.50}
+      ],
+      "output": {
+        "default": [
+          {"up_to": 200000, "price": 10.00},
+          {"up_to": "inf", "price": 15.00}
+        ]
+      }
+    },
+    "gemini-2.5-pro-preview": {
+      "friendly_name": "Gemini 2.5 Pro Preview",
+      "input": [
+        {"up_to": 200000, "price": 1.25},
+        {"up_to": "inf", "price": 2.50}
+      ],
+      "output": {
+        "default": [
+          {"up_to": 200000, "price": 10.00},
+          {"up_to": "inf", "price": 15.00}
+        ]
+      }
+    },
+    "gemini-1.5-pro": {
+      "friendly_name": "Gemini 1.5 Pro",
+      "input": [
+        {"up_to": 128000, "price": 1.25},
+        {"up_to": "inf", "price": 2.50}
+      ],
+      "output": {
+        "default": [
+          {"up_to": 128000, "price": 5.00},
+          {"up_to": "inf", "price": 10.00}
+        ]
+      }
+    },
+    "gemini-1.5-flash": {
+      "friendly_name": "Gemini 1.5 Flash",
+      "input": [
+        {"up_to": 128000, "price": 0.075},
+        {"up_to": "inf", "price": 0.15}
+      ],
+      "output": {
+        "default": [
+          {"up_to": 128000, "price": 0.30},
+          {"up_to": "inf", "price": 0.60}
+        ]
+      }
+    },
+    "gemini-2.0-flash": {
+      "friendly_name": "Gemini 2.0 Flash",
+      "input": [{"up_to": "inf", "price": 0.10}],
+      "output": {"default": [{"up_to": "inf", "price": 0.40}]}
+    },
+    "gemini-2.5-flash-preview-04-17": {
+      "friendly_name": "Gemini 2.5 Flash Preview (Apr)",
+      "input": [{"up_to": "inf", "price": 0.15}],
+      "output": {
+        "non_thinking": [{"up_to": "inf", "price": 0.60}],
+        "thinking": [{"up_to": "inf", "price": 3.50}]
+      }
+    },
+    "gemini-2.5-flash-preview": {
+      "friendly_name": "Gemini 2.5 Flash Preview",
+      "input": [{"up_to": "inf", "price": 0.15}],
+      "output": {
+        "non_thinking": [{"up_to": "inf", "price": 0.60}],
+        "thinking": [{"up_to": "inf", "price": 3.50}]
+      }
+    },
+    "openai:o4-mini": {
+      "friendly_name": "OpenAI o4-mini",
+      "input": [{"up_to": "inf", "price": 1.10}],
+      "output": {"default": [{"up_to": "inf", "price": 4.40}]}
+    },
+    "openai:o3": {
+      "friendly_name": "OpenAI o3",
+      "input": [{"up_to": "inf", "price": 10.00}],
+      "output": {"default": [{"up_to": "inf", "price": 40.00}]}
+    },
+    "openai:gpt-4.1": {
+      "friendly_name": "GPT-4.1",
+      "input": [{"up_to": "inf", "price": 2.00}],
+      "output": {"default": [{"up_to": "inf", "price": 8.00}]}
+    },
+    "openai:gpt-4.1-mini": {
+      "friendly_name": "GPT-4.1 Mini",
+      "input": [{"up_to": "inf", "price": 0.40}],
+      "output": {"default": [{"up_to": "inf", "price": 1.60}]}
+    },
+    "openai:gpt-4.1-nano": {
+      "friendly_name": "GPT-4.1 Nano",
+      "input": [{"up_to": "inf", "price": 0.10}],
+      "output": {"default": [{"up_to": "inf", "price": 0.40}]}
+    },
+    "bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0": {
+      "friendly_name": "Claude 4 Sonnet",
+      "input": [{"up_to": "inf", "price": 3.00}],
+      "output": {"default": [{"up_to": "inf", "price": 15.00}]}
+    },
+    "bedrock:us.anthropic.claude-opus-4-20250514-v1:0": {
+      "friendly_name": "Claude 4 Opus",
+      "input": [{"up_to": "inf", "price": 15.00}],
+      "output": {"default": [{"up_to": "inf", "price": 75.00}]}
+    },
+    "bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0": {
+      "friendly_name": "Claude 3.7 Sonnet",
+      "input": [{"up_to": "inf", "price": 3.00}],
+      "output": {"default": [{"up_to": "inf", "price": 15.00}]}
+    },
+    "bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
+      "friendly_name": "Claude 3.5 Sonnet",
+      "input": [{"up_to": "inf", "price": 3.00}],
+      "output": {"default": [{"up_to": "inf", "price": 15.00}]}
+    },
+    "bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0": {
+      "friendly_name": "Claude 3.5 Haiku",
+      "input": [{"up_to": "inf", "price": 1.00}],
+      "output": {"default": [{"up_to": "inf", "price": 4.00}]}
+    }
+  }
+}
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py
@@ -19,8 +19,16 @@
 
 MERBENCH_CONFIG = {
     # --- General Dashboard Settings ---
-    "title": "Merbench - LLM Evaluation Benchmark",
-    "icon": "🏆",  # Emoji for the browser tab
+    "title": "🧜‍♀️ Merbench - LLM Evaluation ",
+    "description": (
+        "Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
+        "\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
+        "and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
+        "and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
+        "\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
+        "\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
+    ),
+    "icon": "🧜‍♀️",  # Emoji for the browser tab
     # --- Primary Metric Configuration ---
     # The primary metric is the main score used for the leaderboard and
     # the y-axis of the Pareto frontier plot.
@@ -50,6 +58,7 @@
             "x_axis_options": {
                 "cost": {"column": "total_cost", "label": "Cost"},
                 "tokens": {"column": "total_response_tokens", "label": "Tokens"},
+                "duration": {"column": "Duration", "label": "Duration"},
             },
             "color_axis": "Duration",  # Column to use for the color scale
         },