diff --git a/.env.example b/.env.example
index ecc0442..46f9522 100644
--- a/.env.example
+++ b/.env.example
@@ -1,3 +1,5 @@
GEMINI_API_KEY=
OPENAI_API_KEY=
-LOGFIRE_TOKEN=
\ No newline at end of file
+LOGFIRE_TOKEN=
+AWS_REGION=
+AWS_PROFILE=
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 8319601..d4b393f 100644
--- a/Makefile
+++ b/Makefile
@@ -3,4 +3,7 @@ install:
npm install -g @mermaid-js/mermaid-cli
lint:
- uv run ruff check .
\ No newline at end of file
+ uv run ruff check .
+
+leaderboard:
+ uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
\ No newline at end of file
diff --git a/README.md b/README.md
index e2181e5..f5427e4 100644
--- a/README.md
+++ b/README.md
@@ -60,10 +60,6 @@ This project aims to teach:
- `oai-agent_mcp.py` - Example of using MCP with OpenAI Agents
- `pydantic_mcp.py` - Example of using MCP with Pydantic-AI
- - **eval_basic_mcp_use/** - Contains evaluation examples for single MCP usage:
- - `evals_adk_mcp.py` - Evaluation of MCP with Google's ADK
- - `evals_langchain_mcp.py` - Evaluation of MCP with LangGraph
- - `evals_pydantic_mcp.py` - Evaluation of MCP with Pydantic-AI
- **[agents_mcp_usage/multi_mcp/](agents_mcp_usage/multi_mcp/)** - Advanced multi-MCP server integration examples
- **multi_mcp_use/** - Contains examples of using multiple MCP servers simultaneously:
diff --git a/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py b/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py
index 2848f51..d15ec9d 100644
--- a/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py
+++ b/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py
@@ -41,20 +41,20 @@ async def main(query: str = "Greet Andrew and give him the current time") -> Non
# Create the agent
root_agent = LlmAgent(
model="gemini-2.5-pro-preview-03-25",
- name="mcp_pydantic_assistant",
+ name="mcp_adk_assistant",
tools=tools,
)
# Set up session
session_service = InMemorySessionService()
session = session_service.create_session(
- app_name="mcp_pydantic_app",
+ app_name="mcp_adk_app",
user_id="aginns",
)
# Create the runner
runner = Runner(
- app_name="mcp_pydantic_app",
+ app_name="mcp_adk_app",
agent=root_agent,
session_service=session_service,
)
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv
deleted file mode 100644
index acafc79..0000000
--- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv
+++ /dev/null
@@ -1,103 +0,0 @@
-# Prices are per 1 million tokens
-# Gemini prices from https://ai.google.dev/gemini-api/docs/pricing
-# OpenAI prices from https://openai.com/api/pricing/
-MODEL_COSTS = {
- "gemini-2.5-pro-preview-05-06": {
- "input": [
- {"up_to": 200000, "price": 1.25},
- {"up_to": float('inf'), "price": 2.50},
- ],
- "output": {
- "default": [
- {"up_to": 200000, "price": 10.00},
- {"up_to": float('inf'), "price": 15.00},
- ]
- },
- },
- "gemini-2.5-pro-preview-06-05": {
- "input": [
- {"up_to": 200000, "price": 1.25},
- {"up_to": float('inf'), "price": 2.50},
- ],
- "output": {
- "default": [
- {"up_to": 200000, "price": 10.00},
- {"up_to": float('inf'), "price": 15.00},
- ]
- },
- },
- "gemini-2.5-pro-preview": {
- "input": [
- {"up_to": 200000, "price": 1.25},
- {"up_to": float('inf'), "price": 2.50},
- ],
- "output": {
- "default": [
- {"up_to": 200000, "price": 10.00},
- {"up_to": float('inf'), "price": 15.00},
- ]
- },
- },
- "gemini-1.5-pro": {
- "input": [
- {"up_to": 128000, "price": 1.25},
- {"up_to": float('inf'), "price": 2.50},
- ],
- "output": {
- "default": [
- {"up_to": 128000, "price": 5.00},
- {"up_to": float('inf'), "price": 10.00},
- ]
- },
- },
- "gemini-1.5-flash": {
- "input": [
- {"up_to": 128000, "price": 0.075},
- {"up_to": float('inf'), "price": 0.15},
- ],
- "output": {
- "default": [
- {"up_to": 128000, "price": 0.30},
- {"up_to": float('inf'), "price": 0.60},
- ]
- },
- },
- "gemini-2.0-flash": {
- "input": [{"up_to": float('inf'), "price": 0.10}],
- "output": {"default": [{"up_to": float('inf'), "price": 0.40}]},
- },
- "gemini-2.5-flash-preview-04-17": {
- "input": [{"up_to": float('inf'), "price": 0.15}],
- "output": {
- "non_thinking": [{"up_to": float('inf'), "price": 0.60}],
- "thinking": [{"up_to": float('inf'), "price": 3.50}],
- },
- },
- "gemini-2.5-flash-preview": {
- "input": [{"up_to": float('inf'), "price": 0.15}],
- "output": {
- "non_thinking": [{"up_to": float('inf'), "price": 0.60}],
- "thinking": [{"up_to": float('inf'), "price": 3.50}],
- },
- },
- "openai:o4-mini": {
- "input": [{"up_to": float('inf'), "price": 1.10}],
- "output": {"default": [{"up_to": float('inf'), "price": 4.40}]},
- },
- "openai:o3": {
- "input": [{"up_to": float('inf'), "price": 10.00}],
- "output": {"default": [{"up_to": float('inf'), "price": 40.00}]},
- },
- "openai:gpt-4.1": {
- "input": [{"up_to": float('inf'), "price": 2.00}],
- "output": {"default": [{"up_to": float('inf'), "price": 8.00}]},
- },
- "openai:gpt-4.1-mini": {
- "input": [{"up_to": float('inf'), "price": 0.40}],
- "output": {"default": [{"up_to": float('inf'), "price": 1.60}]},
- },
- "openai:gpt-4.1-nano": {
- "input": [{"up_to": float('inf'), "price": 0.10}],
- "output": {"default": [{"up_to": float('inf'), "price": 0.40}]},
- },
-}
\ No newline at end of file
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json
new file mode 100644
index 0000000..df86d38
--- /dev/null
+++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json
@@ -0,0 +1,158 @@
+{
+ "_comment": "Prices are per 1 million tokens",
+ "_sources": {
+ "gemini": "https://ai.google.dev/gemini-api/docs/pricing",
+ "openai": "https://openai.com/api/pricing/"
+ },
+ "model_costs": {
+ "gemini-2.5-pro-preview-03-25": {
+ "friendly_name": "Gemini 2.5 Pro Preview (Mar)",
+ "input": [
+ {"up_to": 200000, "price": 1.25},
+ {"up_to": "inf", "price": 2.50}
+ ],
+ "output": {
+ "default": [
+ {"up_to": 200000, "price": 10.00},
+ {"up_to": "inf", "price": 15.00}
+ ]
+ }
+ },
+ "gemini-2.5-pro-preview-05-06": {
+ "friendly_name": "Gemini 2.5 Pro Preview (May)",
+ "input": [
+ {"up_to": 200000, "price": 1.25},
+ {"up_to": "inf", "price": 2.50}
+ ],
+ "output": {
+ "default": [
+ {"up_to": 200000, "price": 10.00},
+ {"up_to": "inf", "price": 15.00}
+ ]
+ }
+ },
+ "gemini-2.5-pro-preview-06-05": {
+ "friendly_name": "Gemini 2.5 Pro Preview (Jun)",
+ "input": [
+ {"up_to": 200000, "price": 1.25},
+ {"up_to": "inf", "price": 2.50}
+ ],
+ "output": {
+ "default": [
+ {"up_to": 200000, "price": 10.00},
+ {"up_to": "inf", "price": 15.00}
+ ]
+ }
+ },
+ "gemini-2.5-pro-preview": {
+ "friendly_name": "Gemini 2.5 Pro Preview",
+ "input": [
+ {"up_to": 200000, "price": 1.25},
+ {"up_to": "inf", "price": 2.50}
+ ],
+ "output": {
+ "default": [
+ {"up_to": 200000, "price": 10.00},
+ {"up_to": "inf", "price": 15.00}
+ ]
+ }
+ },
+ "gemini-1.5-pro": {
+ "friendly_name": "Gemini 1.5 Pro",
+ "input": [
+ {"up_to": 128000, "price": 1.25},
+ {"up_to": "inf", "price": 2.50}
+ ],
+ "output": {
+ "default": [
+ {"up_to": 128000, "price": 5.00},
+ {"up_to": "inf", "price": 10.00}
+ ]
+ }
+ },
+ "gemini-1.5-flash": {
+ "friendly_name": "Gemini 1.5 Flash",
+ "input": [
+ {"up_to": 128000, "price": 0.075},
+ {"up_to": "inf", "price": 0.15}
+ ],
+ "output": {
+ "default": [
+ {"up_to": 128000, "price": 0.30},
+ {"up_to": "inf", "price": 0.60}
+ ]
+ }
+ },
+ "gemini-2.0-flash": {
+ "friendly_name": "Gemini 2.0 Flash",
+ "input": [{"up_to": "inf", "price": 0.10}],
+ "output": {"default": [{"up_to": "inf", "price": 0.40}]}
+ },
+ "gemini-2.5-flash-preview-04-17": {
+ "friendly_name": "Gemini 2.5 Flash Preview (Apr)",
+ "input": [{"up_to": "inf", "price": 0.15}],
+ "output": {
+ "non_thinking": [{"up_to": "inf", "price": 0.60}],
+ "thinking": [{"up_to": "inf", "price": 3.50}]
+ }
+ },
+ "gemini-2.5-flash-preview": {
+ "friendly_name": "Gemini 2.5 Flash Preview",
+ "input": [{"up_to": "inf", "price": 0.15}],
+ "output": {
+ "non_thinking": [{"up_to": "inf", "price": 0.60}],
+ "thinking": [{"up_to": "inf", "price": 3.50}]
+ }
+ },
+ "openai:o4-mini": {
+ "friendly_name": "OpenAI o4-mini",
+ "input": [{"up_to": "inf", "price": 1.10}],
+ "output": {"default": [{"up_to": "inf", "price": 4.40}]}
+ },
+ "openai:o3": {
+ "friendly_name": "OpenAI o3",
+ "input": [{"up_to": "inf", "price": 10.00}],
+ "output": {"default": [{"up_to": "inf", "price": 40.00}]}
+ },
+ "openai:gpt-4.1": {
+ "friendly_name": "GPT-4.1",
+ "input": [{"up_to": "inf", "price": 2.00}],
+ "output": {"default": [{"up_to": "inf", "price": 8.00}]}
+ },
+ "openai:gpt-4.1-mini": {
+ "friendly_name": "GPT-4.1 Mini",
+ "input": [{"up_to": "inf", "price": 0.40}],
+ "output": {"default": [{"up_to": "inf", "price": 1.60}]}
+ },
+ "openai:gpt-4.1-nano": {
+ "friendly_name": "GPT-4.1 Nano",
+ "input": [{"up_to": "inf", "price": 0.10}],
+ "output": {"default": [{"up_to": "inf", "price": 0.40}]}
+ },
+ "bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0": {
+ "friendly_name": "Claude 4 Sonnet",
+ "input": [{"up_to": "inf", "price": 3.00}],
+ "output": {"default": [{"up_to": "inf", "price": 15.00}]}
+ },
+ "bedrock:us.anthropic.claude-opus-4-20250514-v1:0": {
+ "friendly_name": "Claude 4 Opus",
+ "input": [{"up_to": "inf", "price": 15.00}],
+ "output": {"default": [{"up_to": "inf", "price": 75.00}]}
+ },
+ "bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0": {
+ "friendly_name": "Claude 3.7 Sonnet",
+ "input": [{"up_to": "inf", "price": 3.00}],
+ "output": {"default": [{"up_to": "inf", "price": 15.00}]}
+ },
+ "bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
+ "friendly_name": "Claude 3.5 Sonnet",
+ "input": [{"up_to": "inf", "price": 3.00}],
+ "output": {"default": [{"up_to": "inf", "price": 15.00}]}
+ },
+ "bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0": {
+ "friendly_name": "Claude 3.5 Haiku",
+ "input": [{"up_to": "inf", "price": 1.00}],
+ "output": {"default": [{"up_to": "inf", "price": 4.00}]}
+ }
+ }
+}
\ No newline at end of file
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py
index 8167d96..24cf37d 100644
--- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py
+++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py
@@ -19,8 +19,16 @@
MERBENCH_CONFIG = {
# --- General Dashboard Settings ---
- "title": "Merbench - LLM Evaluation Benchmark",
- "icon": "🏆", # Emoji for the browser tab
+ "title": "🧜♀️ Merbench - LLM Evaluation",
+ "description": (
+ "Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
+ "\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
+ "and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
+ "and the LLM is given a fixed number of attempts to fix the diagram; if this is exceeded, the test case is considered failed. "
+ "\n\n **Performance is a measure of both tool usage, and Mermaid syntax understanding.**"
+ "\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*."
+ ),
+ "icon": "🧜♀️", # Emoji for the browser tab
# --- Primary Metric Configuration ---
# The primary metric is the main score used for the leaderboard and
# the y-axis of the Pareto frontier plot.
@@ -50,6 +58,7 @@
"x_axis_options": {
"cost": {"column": "total_cost", "label": "Cost"},
"tokens": {"column": "total_response_tokens", "label": "Tokens"},
+ "duration": {"column": "Duration", "label": "Duration"},
},
"color_axis": "Duration", # Column to use for the color scale
},
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
index d1a9338..c777290 100644
--- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
+++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
@@ -49,12 +49,7 @@
logfire.instrument_pydantic_ai()
# Default model configurations
-DEFAULT_MODEL = "gemini-2.5-pro-preview-05-06"
-DEFAULT_MODELS = [
- "gemini-2.5-pro-preview-06-05",
- "gemini-2.0-flash",
- "gemini-2.5-flash-preview-04-17",
-]
+DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05"
# Retry configuration
RETRYABLE_HTTP_STATUS_CODES = {429, 500, 502, 503, 504}
@@ -214,6 +209,30 @@ def create_agent(
if model_settings is None:
model_settings = {}
+ # Handle Bedrock models specifically
+ if model.startswith("bedrock:"):
+ from pydantic_ai.models.bedrock import BedrockConverseModel
+ from pydantic_ai.providers.bedrock import BedrockProvider
+
+ # Extract the model name (remove "bedrock:" prefix)
+ model_name = model.replace("bedrock:", "")
+
+ # Create BedrockConverseModel with proper region and profile configuration
+ bedrock_model = BedrockConverseModel(
+ model_name,
+ provider=BedrockProvider(
+ region_name=os.getenv("AWS_REGION", "us-east-1"),
+ profile_name=os.getenv("AWS_PROFILE", "my-aws-profile"),
+ ),
+ )
+
+ return Agent(
+ bedrock_model,
+ mcp_servers=get_mcp_servers(),
+ model_settings=model_settings,
+ )
+
+ # For non-Bedrock models, use the original approach
return Agent(
model,
mcp_servers=get_mcp_servers(),
@@ -765,9 +784,9 @@ async def fix_with_model(inputs: MermaidInput) -> MermaidOutput:
if __name__ == "__main__":
# You can use different models for the agent and the judge
# agent_model = os.getenv("AGENT_MODEL", DEFAULT_MODEL)
- agent_model = "gemini-2.5-pro-preview-06-05"
+ # agent_model = "gemini-2.5-pro-preview-06-05"
# agent_model = "openai:o4-mini"
- # agent_model = "gemini-2.5-flash-preview-04-17"
+ agent_model = "gemini-2.5-flash-preview-04-17"
judge_model = os.getenv("JUDGE_MODEL", DEFAULT_MODEL)
async def run_all():
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
index 4d2240d..20e2864 100644
--- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
+++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
@@ -29,40 +29,65 @@
# --- Cost Loading ---
-def load_model_costs(file_path: str) -> Dict:
- """Loads model costs from a CSV file and returns a structured dictionary.
+def load_model_costs(file_path: str) -> tuple[Dict, Dict]:
+ """Loads model costs and friendly names from a JSON file and returns structured dictionaries.
Args:
- file_path: The path to the cost file.
+ file_path: The path to the cost file (JSON or CSV).
Returns:
- A dictionary containing the model costs.
+ A tuple containing (model_costs_dict, friendly_names_dict).
"""
+ import json
+
+ # Try JSON first (new format), then fall back to CSV (old format)
+ json_path = file_path.replace('.csv', '.json')
+
try:
- with open(file_path, "r", encoding="utf-8") as f:
- # Read lines, skipping comments and empty lines
- lines = [
- line for line in f if not line.strip().startswith("#") and line.strip()
- ]
-
- # Find the start of the dictionary-like definition
- dict_str = "".join(lines)
- match = re.search(r"MODEL_COSTS\s*=\s*({.*})", dict_str, re.DOTALL)
- if not match:
- st.error(f"Could not find 'MODEL_COSTS' dictionary in {file_path}")
- return {}
-
- # Safely evaluate the dictionary string
- model_costs_raw = eval(match.group(1), {"float": float})
-
- return model_costs_raw
-
+ # Try to load JSON format first
+ with open(json_path, "r", encoding="utf-8") as f:
+ data = json.load(f)
+ model_costs_raw = data["model_costs"]
+
+ # Extract friendly names and clean cost data
+ friendly_names = {}
+ model_costs_clean = {}
+
+ for model_id, model_data in model_costs_raw.items():
+ # Extract friendly name if it exists
+ if isinstance(model_data, dict) and "friendly_name" in model_data:
+ friendly_names[model_id] = model_data["friendly_name"]
+ # Create a clean copy without the friendly_name for cost calculations
+ model_costs_clean[model_id] = {
+ key: _convert_inf_strings(value) if key in ["input", "output"] else value
+ for key, value in model_data.items()
+ if key != "friendly_name"
+ }
+ else:
+ # No friendly name, use model_id as fallback
+ friendly_names[model_id] = model_id
+ model_costs_clean[model_id] = _convert_inf_strings(model_data)
+
+ return model_costs_clean, friendly_names
+
except FileNotFoundError:
- st.warning(f"Cost file not found at {file_path}. Using empty cost config.")
- return {}
- except (SyntaxError, NameError, Exception) as e:
- st.error(f"Error parsing cost file {file_path}: {e}")
- return {}
+ st.warning(f"Cost file not found at {json_path}. Using empty cost config.")
+ return {}, {}
+ except (json.JSONDecodeError, KeyError, Exception) as e:
+ st.error(f"Error parsing JSON cost file {json_path}: {e}")
+ return {}, {}
+
+
+def _convert_inf_strings(data):
+ """Recursively convert 'inf' strings to float('inf') in nested data structures."""
+ if isinstance(data, dict):
+ return {key: _convert_inf_strings(value) for key, value in data.items()}
+ elif isinstance(data, list):
+ return [_convert_inf_strings(item) for item in data]
+ elif data == "inf":
+ return float('inf')
+ else:
+ return data
# --- Data Loading and Processing ---
@@ -388,8 +413,76 @@ def create_leaderboard(
return leaderboard.sort_values("Correct", ascending=sort_ascending)
+def _calculate_smart_label_positions(
+ x_data, y_data, labels, min_distance_threshold=0.1
+):
+ """Calculate optimal label positions to avoid overlaps.
+
+ Args:
+ x_data: Array of x coordinates (normalized to 0-1 range for distance calc)
+ y_data: Array of y coordinates (normalized to 0-1 range for distance calc)
+ labels: Array of label strings
+ min_distance_threshold: Minimum distance threshold for considering overlap
+
+ Returns:
+ List of textposition strings for each point
+ """
+ import numpy as np
+
+ # Normalize coordinates to 0-1 range for distance calculations
+ x_norm = (
+ (x_data - x_data.min()) / (x_data.max() - x_data.min())
+ if x_data.max() != x_data.min()
+ else x_data * 0
+ )
+ y_norm = (
+ (y_data - y_data.min()) / (y_data.max() - y_data.min())
+ if y_data.max() != y_data.min()
+ else y_data * 0
+ )
+
+ positions = ["top center"] * len(x_data)
+ position_options = [
+ "top center",
+ "bottom center",
+ "middle left",
+ "middle right",
+ "top left",
+ "top right",
+ "bottom left",
+ "bottom right",
+ ]
+
+ # Calculate distances between all pairs of points
+ for i in range(len(x_data)):
+ for j in range(i + 1, len(x_data)):
+ distance = np.sqrt(
+ (x_norm[i] - x_norm[j]) ** 2 + (y_norm[i] - y_norm[j]) ** 2
+ )
+
+ if distance < min_distance_threshold:
+ # Points are close, try different positions
+ for pos_idx, position in enumerate(position_options):
+ if positions[i] == "top center":
+ positions[i] = position_options[pos_idx % len(position_options)]
+ break
+
+ for pos_idx, position in enumerate(position_options):
+ if positions[j] == "top center" or positions[j] == positions[i]:
+ positions[j] = position_options[
+ (pos_idx + 1) % len(position_options)
+ ]
+ break
+
+ return positions
+
+
def create_pareto_frontier_plot(
- df: pd.DataFrame, selected_groups: List[str], x_axis_mode: str, config: Dict
+ df: pd.DataFrame,
+ selected_groups: List[str],
+ x_axis_mode: str,
+ config: Dict,
+ friendly_names: Dict = None,
) -> go.Figure:
"""Visualizes the trade-off between model performance and cost/token usage.
@@ -419,6 +512,7 @@ def create_pareto_frontier_plot(
y_axis=(primary_metric_name, "mean"),
total_cost=("total_cost", "mean"),
total_response_tokens=("total_response_tokens", "mean"),
+ Duration=("Duration", "mean"),
color_axis=(plot_config["color_axis"], "mean"),
)
.reset_index()
@@ -428,31 +522,71 @@ def create_pareto_frontier_plot(
x_data = model_metrics[x_axis_config["column"]]
x_title = x_axis_config["label"]
hover_label = x_axis_config["label"]
- hover_format = ":.4f" if x_axis_mode == "cost" else ":.0f"
-
- fig.add_trace(
- go.Scatter(
- x=x_data,
- y=model_metrics["y_axis"],
- mode="markers+text",
- marker=dict(
- size=18,
- color=model_metrics["color_axis"],
- colorscale="RdYlGn_r",
- showscale=True,
- colorbar=dict(title=f"Avg {plot_config['color_axis']} (s)"),
- ),
- text=model_metrics["Model"],
- textposition="top center",
- hovertemplate=(
- "%{text}
"
- f"{y_axis_label}: %{{y:.1f}}%
"
- f"{hover_label}: %{{x{hover_format}}}
"
- f"Avg {plot_config['color_axis']}: %{{marker.color:.1f}}s"
- ),
- )
+ if x_axis_mode == "cost":
+ hover_format = ":.4f"
+ elif x_axis_mode == "duration":
+ hover_format = ":.2f"
+ else:
+ hover_format = ":.0f"
+
+ # Calculate smart label positions to avoid overlaps
+ label_positions = _calculate_smart_label_positions(
+ x_data.values, model_metrics["y_axis"].values, model_metrics["Model"].values
)
+ # Group data by text position to create separate traces
+ from collections import defaultdict
+
+ position_groups = defaultdict(list)
+
+ for i, position in enumerate(label_positions):
+ position_groups[position].append(i)
+
+ # Create a trace for each text position group
+ first_trace = True
+ for position, indices in position_groups.items():
+ x_vals = [x_data.iloc[i] for i in indices]
+ y_vals = [model_metrics["y_axis"].iloc[i] for i in indices]
+ colors = [model_metrics["color_axis"].iloc[i] for i in indices]
+
+ # Get model names for this position group
+ original_names = [model_metrics["Model"].iloc[i] for i in indices]
+
+ # Use friendly names for display if available, otherwise use original names
+ if friendly_names:
+ display_texts = [friendly_names.get(name, name) for name in original_names]
+ else:
+ display_texts = original_names
+
+ fig.add_trace(
+ go.Scatter(
+ x=x_vals,
+ y=y_vals,
+ mode="markers+text",
+ marker=dict(
+ size=18,
+ color=colors,
+ colorscale="RdYlGn_r",
+ showscale=first_trace, # Show colorbar only on first trace
+ colorbar=dict(title=f"Avg {plot_config['color_axis']} (s)")
+ if first_trace
+ else None,
+ ),
+ text=display_texts, # Use friendly names for display
+ textposition=position,
+ customdata=original_names, # Store original names for hover
+ hovertemplate=(
+ "%{text}
" # Friendly name as title
+ "API Name: %{customdata}
" # Original API name
+ f"{y_axis_label}: %{{y:.1f}}%
"
+ f"{hover_label}: %{{x{hover_format}}}
"
+ f"Avg {plot_config['color_axis']}: %{{marker.color:.1f}}s"
+ ),
+ showlegend=False, # Don't show legend for individual position groups
+ )
+ )
+ first_trace = False
+
fig.update_layout(
title=plot_config["title"].format(x_axis_label=x_title),
xaxis_title=f"Average {x_title}",
@@ -653,7 +787,7 @@ def main() -> None:
eval_config = EVAL_CONFIG # Use the validated config
st.title(eval_config.title)
- st.subheader("LLM Evaluation Benchmark Dashboard")
+ st.markdown(eval_config.description)
# --- Sidebar Setup ---
st.sidebar.header("⚙️ Data Configuration")
@@ -708,7 +842,7 @@ def main() -> None:
# Cost configuration in sidebar
st.sidebar.subheader("💰 Cost Configuration")
cost_file_path = os.path.join(os.path.dirname(__file__), "costs.csv")
- model_costs = load_model_costs(cost_file_path)
+ model_costs, friendly_names = load_model_costs(cost_file_path)
available_models = sorted(df_initial["Model"].unique())
cost_config = {}
@@ -781,7 +915,7 @@ def main() -> None:
cols[3].metric("Files Loaded", len(selected_files))
st.info(
- f"**Showing results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}"
+ f"**Showing averaged results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}"
)
# --- Leaderboard & Pareto ---
@@ -817,7 +951,7 @@ def main() -> None:
else:
st.warning("No data available for the current filter selection.")
- st.header("📈 Pareto Frontier Analysis")
+ st.header("📈 Pareto Frontier")
pareto_config = eval_config.plots.pareto
x_axis_mode = st.radio(
"Compare performance against:",
@@ -827,7 +961,7 @@ def main() -> None:
)
st.plotly_chart(
create_pareto_frontier_plot(
- df, selected_groups, x_axis_mode, eval_config.model_dump()
+ df, selected_groups, x_axis_mode, eval_config.model_dump(), friendly_names
),
use_container_width=True,
)
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py
index 8f56ba1..29516b7 100644
--- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py
+++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py
@@ -34,7 +34,6 @@
# Import shared functionality from the improved evals module
from agents_mcp_usage.multi_mcp.eval_multi_mcp.evals_pydantic_mcp import (
- DEFAULT_MODELS,
MermaidInput,
MermaidOutput,
fix_mermaid_diagram,
@@ -44,6 +43,23 @@
load_dotenv()
+DEFAULT_MODELS = [
+ # "gemini-2.5-pro-preview-06-05",
+ # "gemini-2.5-pro-preview-05-06",
+ # "gemini-2.5-pro-preview-03-25",
+ "gemini-2.0-flash",
+ "gemini-2.5-flash-preview-04-17",
+ # "openai:o4-mini",
+ # "openai:gpt-4.1",
+ # "openai:gpt-4.1-mini",
+ # "openai:gpt-4.1-nano",
+ # "bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0",
+ # "bedrock:us.anthropic.claude-opus-4-20250514-v1:0",
+ # "bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+ # "bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0",
+ # "bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0",
+]
+
logfire.configure(
send_to_logfire="if-token-present", service_name="multi-model-mermaid-evals"
)
@@ -496,13 +512,13 @@ async def main() -> None:
parser.add_argument(
"--judge-model",
type=str,
- default="gemini-2.5-pro-preview-03-25",
+ default="gemini-2.5-pro-preview-06-05",
help="Model to use for LLM judging",
)
parser.add_argument(
"--parallel",
action="store_true",
- default=True,
+ default=False,
help="Run evaluations in parallel",
)
parser.add_argument(
diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py
index dc8059b..cbc2f1c 100644
--- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py
+++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py
@@ -2,8 +2,8 @@
Pydantic Schemas for Dashboard Configuration Validation
"""
-from pydantic import BaseModel, Field, validator
-from typing import List, Dict, Optional, Any
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing import List, Dict, Optional
class PrimaryMetricConfig(BaseModel):
@@ -15,7 +15,8 @@ class PrimaryMetricConfig(BaseModel):
description="Optional source column to calculate the primary metric from if it doesn't exist.",
)
- @validator("goal")
+ @field_validator("goal")
+ @classmethod
def goal_must_be_max_or_min(cls, v: str) -> str:
"""Validates that the goal is either 'maximize' or 'minimize'."""
if v not in ["maximize", "minimize"]:
@@ -65,17 +66,15 @@ class BarPlotConfig(BaseModel):
y_columns: Optional[List[str]] = None
series: Optional[List[StackedBarPlotSeries]] = None
- @validator("y_columns", always=True)
- def check_prefix_or_columns(
- cls, v: Optional[List[str]], values: Dict[str, Any]
- ) -> Optional[List[str]]:
+ @model_validator(mode="after")
+ def check_prefix_or_columns(self) -> "BarPlotConfig":
"""Validates that either 'y_prefix' or 'y_columns' is provided for grouped_bar plots."""
- if not values.get("y_prefix") and not v:
- if values.get("type") == "grouped_bar":
+ if not self.y_prefix and not self.y_columns:
+ if self.type == "grouped_bar":
raise ValueError(
"Either 'y_prefix' or 'y_columns' must be provided for grouped_bar plots."
)
- return v
+ return self
class PlotConfig(BaseModel):
@@ -93,6 +92,7 @@ class CostCalculationConfig(BaseModel):
class DashboardConfig(BaseModel):
title: str
+ description: str
icon: str
primary_metric: PrimaryMetricConfig
grouping: GroupingConfig
diff --git a/agents_mcp_usage/multi_mcp/mermaid_diagrams.py b/agents_mcp_usage/multi_mcp/mermaid_diagrams.py
index 485c993..7d14012 100644
--- a/agents_mcp_usage/multi_mcp/mermaid_diagrams.py
+++ b/agents_mcp_usage/multi_mcp/mermaid_diagrams.py
@@ -5,6 +5,7 @@
# Agent Frameworks
subgraph "Agent"
+ direction TD
Agent[Agent]
ADK["Google ADK
(adk_mcp.py)"]
LG["LangGraph
(langgraph_mcp.py)"]
@@ -21,7 +22,7 @@
subgraph "MCP"
direction TD
MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"]
- Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"]
+ Tools["Tools
- add(a, b)
- get_current_time() {current_time}"]
Resources["Resources
- greeting://{{name}}"]
MCP --- Tools
MCP --- Resources
@@ -38,7 +39,7 @@
Logfire[("Logfire
Tracing")]
ADK --> MCP
- LG --> MCP
+ LG -- > MCP
OAI --> MCP
PYD --> MCP
@@ -47,7 +48,7 @@
MCP --> OTHER
ADK --> Logfire
- LG --> Logfire
+ LG -- > Logfire
OAI --> Logfire
PYD --> Logfire
@@ -63,6 +64,7 @@
```
"""
+# 7 syntax errors
invalid_mermaid_diagram_medium = """
```mermaid
graph LR
@@ -87,13 +89,15 @@
subgraph "MCP"
direction TB
MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"]
- Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"]
+ Tools["Tools
- add(a, b)
- get_current_time() {current_time}"]
Resources["Resources
- greeting://{{name}}"]
MCP --- Tools
MCP --- Resources
end
+ # LLM Providers
subgraph "LLM Providers"
+ direction TB
OAI_LLM["OpenAI Models"]
GEM["Google Gemini Models"]
OTHER["Other LLM Providers..."]
@@ -102,7 +106,7 @@
Logfire[("Logfire
Tracing")]
ADK --> MCP
- LG --> MCP
+ LG -- > MCP
OAI --> MCP
PYD --> MCP
@@ -111,7 +115,7 @@
MCP --> OTHER
ADK --> Logfire
- LG --> Logfire
+ LG -- > Logfire
OAI --> Logfire
PYD --> Logfire
@@ -127,6 +131,7 @@
```
"""
+# 2 syntax errors
invalid_mermaid_diagram_easy = """
```mermaid
graph LR
@@ -148,16 +153,18 @@
end
%% MCP Server
- subgraph "MCP Server"
+ subgraph "MCP"
direction TB
MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"]
- Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"]
+ Tools["Tools
- add(a, b)
- get_current_time() {current_time}"]
Resources["Resources
- greeting://{{name}}"]
- MCPs --- Tools
- MCPs --- Resources
+ MCP --- Tools
+ MCP --- Resources
end
+ %% LLM Providers
subgraph "LLM Providers"
+ direction TB
OAI_LLM["OpenAI Models"]
GEM["Google Gemini Models"]
OTHER["Other LLM Providers..."]
@@ -171,7 +178,7 @@
PYD --> MCP
MCP --> OAI_LLM
- MCP --> GEM
+ MCP --> GEMINI
MCP --> OTHER
ADK --> Logfire
@@ -191,7 +198,7 @@
```
"""
-valid_mermaid_diagram = """`
+valid_mermaid_diagram = """
```mermaid
graph LR
User((User)) --> |"Run script
(e.g., pydantic_mcp.py)"| Agent
@@ -215,13 +222,15 @@
subgraph "MCP Server"
direction TB
MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"]
- Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"]
+ Tools["Tools
- add(a, b)
- get_current_time() {current_time}"]
Resources["Resources
- greeting://{{name}}"]
MCP --- Tools
MCP --- Resources
end
+ %% LLM Providers
subgraph "LLM Providers"
+ direction TB
OAI_LLM["OpenAI Models"]
GEM["Google Gemini Models"]
OTHER["Other LLM Providers..."]
diff --git a/pyproject.toml b/pyproject.toml
index 23be2b7..600a8df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
"openai-agents>=0.0.12",
"pandas>=2.3.0",
"plotly>=6.1.2",
- "pydantic-ai-slim[mcp]>=0.2.15",
+ "pydantic-ai-slim[bedrock,mcp]>=0.2.15",
"pydantic-evals[logfire]>=0.2.15",
"python-dotenv>=1.1.0",
"ruff>=0.11.10",
diff --git a/uv.lock b/uv.lock
index 5c3c391..100ec3c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -125,6 +125,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
]
+[[package]]
+name = "boto3"
+version = "1.38.34"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "botocore" },
+ { name = "jmespath" },
+ { name = "s3transfer" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cb/2f/4547f4b3cc7b63ec1266edaf923a675f3eae2057299aba8ecfe46f013c1a/boto3-1.38.34.tar.gz", hash = "sha256:25e76b9fec8db8e21adaf84df0de5c58fa779be121bc327e07e920c7c0870394", size = 111793, upload-time = "2025-06-10T19:26:47.176Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/84/9b/5fe964a2e11f5b884d40fd5d90421faef04521f7866f6e473e3e28df583b/boto3-1.38.34-py3-none-any.whl", hash = "sha256:7d9409be63a11c1684427a9b06d6820ec72785cb275b56affe437f3709a80eb3", size = 139938, upload-time = "2025-06-10T19:26:43.781Z" },
+]
+
+[[package]]
+name = "botocore"
+version = "1.38.34"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "jmespath" },
+ { name = "python-dateutil" },
+ { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/08/27/34dfe90dd51d16473985198e056b82d74afd355dc47daa10af2a4f117072/botocore-1.38.34.tar.gz", hash = "sha256:a105f4d941f329aa72c43ddf42371ec4bee50ab3619fc1ef35d0005520219612", size = 13953563, upload-time = "2025-06-10T19:26:35.379Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c9/17/d9611cf624ec274ed83139cf4446f631dbbb51cbfe51336db7b27a724dd7/botocore-1.38.34-py3-none-any.whl", hash = "sha256:95ff2c4819498e94b321c9b5ac65d02267df93ff7ce7617323b19f19ea7cb545", size = 13614470, upload-time = "2025-06-10T19:26:31.484Z" },
+]
+
[[package]]
name = "cachetools"
version = "5.5.2"
@@ -472,6 +500,9 @@ dependencies = [
{ name = "uritemplate" },
]
sdist = { url = "https://files.pythonhosted.org/packages/35/99/237cd2510aecca9fabb54007e58553274cc43cb3c18512ee1ea574d11b87/google_api_python_client-2.171.0.tar.gz", hash = "sha256:057a5c08d28463c6b9eb89746355de5f14b7ed27a65c11fdbf1d06c66bb66b23", size = 13028937, upload-time = "2025-06-03T18:57:38.732Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/79/db/c397e3eb3ea18f423855479d0a5852bdc9c3f644e3d4194931fa664a70b4/google_api_python_client-2.171.0-py3-none-any.whl", hash = "sha256:c9c9b76f561e9d9ac14e54a9e2c0842876201d5b96e69e48f967373f0784cbe9", size = 13547393, upload-time = "2025-06-10T02:14:38.225Z" },
+]
[[package]]
name = "google-auth"
@@ -643,6 +674,9 @@ dependencies = [
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/58/7a/2fa6735ec693d822fe08a76709c4d95d9b5b4c02e83e720497355039d2ee/google_cloud_secret_manager-2.24.0.tar.gz", hash = "sha256:ce573d40ffc2fb7d01719243a94ee17aa243ea642a6ae6c337501e58fbf642b5", size = 269516, upload-time = "2025-06-05T22:22:22.965Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/be/af/db1217cae1809e69a4527ee6293b82a9af2a1fb2313ad110c775e8f3c820/google_cloud_secret_manager-2.24.0-py3-none-any.whl", hash = "sha256:9bea1254827ecc14874bc86c63b899489f8f50bfe1442bfb2517530b30b3a89b", size = 218050, upload-time = "2025-06-10T02:02:19.88Z" },
+]
[[package]]
name = "google-cloud-speech"
@@ -971,6 +1005,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },
]
+[[package]]
+name = "jmespath"
+version = "1.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
+]
+
[[package]]
name = "jsonpatch"
version = "1.33"
@@ -1842,7 +1885,7 @@ dependencies = [
{ name = "openai-agents" },
{ name = "pandas" },
{ name = "plotly" },
- { name = "pydantic-ai-slim", extra = ["mcp"] },
+ { name = "pydantic-ai-slim", extra = ["bedrock", "mcp"] },
{ name = "pydantic-evals", extra = ["logfire"] },
{ name = "python-dotenv" },
{ name = "ruff" },
@@ -1864,7 +1907,7 @@ requires-dist = [
{ name = "openai-agents", specifier = ">=0.0.12" },
{ name = "pandas", specifier = ">=2.3.0" },
{ name = "plotly", specifier = ">=6.1.2" },
- { name = "pydantic-ai-slim", extras = ["mcp"], specifier = ">=0.2.15" },
+ { name = "pydantic-ai-slim", extras = ["bedrock", "mcp"], specifier = ">=0.2.15" },
{ name = "pydantic-evals", extras = ["logfire"], specifier = ">=0.2.15" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "ruff", specifier = ">=0.11.10" },
@@ -1890,6 +1933,9 @@ wheels = [
]
[package.optional-dependencies]
+bedrock = [
+ { name = "boto3" },
+]
mcp = [
{ name = "mcp" },
]
@@ -2185,6 +2231,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/bf/b273dd11673fed8a6bd46032c0ea2a04b2ac9bfa9c628756a5856ba113b0/ruff-0.11.13-py3-none-win_arm64.whl", hash = "sha256:b4385285e9179d608ff1d2fb9922062663c658605819a6876d8beef0c30b7f3b", size = 10683928, upload-time = "2025-06-05T21:00:13.758Z" },
]
+[[package]]
+name = "s3transfer"
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "botocore" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ed/5d/9dcc100abc6711e8247af5aa561fc07c4a046f72f659c3adea9a449e191a/s3transfer-0.13.0.tar.gz", hash = "sha256:f5e6db74eb7776a37208001113ea7aa97695368242b364d73e91c981ac522177", size = 150232, upload-time = "2025-05-22T19:24:50.245Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/18/17/22bf8155aa0ea2305eefa3a6402e040df7ebe512d1310165eda1e233c3f8/s3transfer-0.13.0-py3-none-any.whl", hash = "sha256:0148ef34d6dd964d0d8cf4311b2b21c474693e57c2e069ec708ce043d2b527be", size = 85152, upload-time = "2025-05-22T19:24:48.703Z" },
+]
+
[[package]]
name = "shapely"
version = "2.1.1"