diff --git a/.env.example b/.env.example index ecc0442..46f9522 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,5 @@ GEMINI_API_KEY= OPENAI_API_KEY= -LOGFIRE_TOKEN= \ No newline at end of file +LOGFIRE_TOKEN= +AWS_REGION= +AWS_PROFILE= \ No newline at end of file diff --git a/Makefile b/Makefile index 8319601..d4b393f 100644 --- a/Makefile +++ b/Makefile @@ -3,4 +3,7 @@ install: npm install -g @mermaid-js/mermaid-cli lint: - uv run ruff check . \ No newline at end of file + uv run ruff check . + +leaderboard: + uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py \ No newline at end of file diff --git a/README.md b/README.md index e2181e5..f5427e4 100644 --- a/README.md +++ b/README.md @@ -60,10 +60,6 @@ This project aims to teach: - `oai-agent_mcp.py` - Example of using MCP with OpenAI Agents - `pydantic_mcp.py` - Example of using MCP with Pydantic-AI - - **eval_basic_mcp_use/** - Contains evaluation examples for single MCP usage: - - `evals_adk_mcp.py` - Evaluation of MCP with Google's ADK - - `evals_langchain_mcp.py` - Evaluation of MCP with LangGraph - - `evals_pydantic_mcp.py` - Evaluation of MCP with Pydantic-AI - **[agents_mcp_usage/multi_mcp/](agents_mcp_usage/multi_mcp/)** - Advanced multi-MCP server integration examples - **multi_mcp_use/** - Contains examples of using multiple MCP servers simultaneously: diff --git a/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py b/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py index 2848f51..d15ec9d 100644 --- a/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py +++ b/agents_mcp_usage/basic_mcp/basic_mcp_use/adk_mcp.py @@ -41,20 +41,20 @@ async def main(query: str = "Greet Andrew and give him the current time") -> Non # Create the agent root_agent = LlmAgent( model="gemini-2.5-pro-preview-03-25", - name="mcp_pydantic_assistant", + name="mcp_adk_assistant", tools=tools, ) # Set up session session_service = InMemorySessionService() session = session_service.create_session( - 
app_name="mcp_pydantic_app", + app_name="mcp_adk_app", user_id="aginns", ) # Create the runner runner = Runner( - app_name="mcp_pydantic_app", + app_name="mcp_adk_app", agent=root_agent, session_service=session_service, ) diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv deleted file mode 100644 index acafc79..0000000 --- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv +++ /dev/null @@ -1,103 +0,0 @@ -# Prices are per 1 million tokens -# Gemini prices from https://ai.google.dev/gemini-api/docs/pricing -# OpenAI prices from https://openai.com/api/pricing/ -MODEL_COSTS = { - "gemini-2.5-pro-preview-05-06": { - "input": [ - {"up_to": 200000, "price": 1.25}, - {"up_to": float('inf'), "price": 2.50}, - ], - "output": { - "default": [ - {"up_to": 200000, "price": 10.00}, - {"up_to": float('inf'), "price": 15.00}, - ] - }, - }, - "gemini-2.5-pro-preview-06-05": { - "input": [ - {"up_to": 200000, "price": 1.25}, - {"up_to": float('inf'), "price": 2.50}, - ], - "output": { - "default": [ - {"up_to": 200000, "price": 10.00}, - {"up_to": float('inf'), "price": 15.00}, - ] - }, - }, - "gemini-2.5-pro-preview": { - "input": [ - {"up_to": 200000, "price": 1.25}, - {"up_to": float('inf'), "price": 2.50}, - ], - "output": { - "default": [ - {"up_to": 200000, "price": 10.00}, - {"up_to": float('inf'), "price": 15.00}, - ] - }, - }, - "gemini-1.5-pro": { - "input": [ - {"up_to": 128000, "price": 1.25}, - {"up_to": float('inf'), "price": 2.50}, - ], - "output": { - "default": [ - {"up_to": 128000, "price": 5.00}, - {"up_to": float('inf'), "price": 10.00}, - ] - }, - }, - "gemini-1.5-flash": { - "input": [ - {"up_to": 128000, "price": 0.075}, - {"up_to": float('inf'), "price": 0.15}, - ], - "output": { - "default": [ - {"up_to": 128000, "price": 0.30}, - {"up_to": float('inf'), "price": 0.60}, - ] - }, - }, - "gemini-2.0-flash": { - "input": [{"up_to": float('inf'), "price": 0.10}], - "output": {"default": 
[{"up_to": float('inf'), "price": 0.40}]}, - }, - "gemini-2.5-flash-preview-04-17": { - "input": [{"up_to": float('inf'), "price": 0.15}], - "output": { - "non_thinking": [{"up_to": float('inf'), "price": 0.60}], - "thinking": [{"up_to": float('inf'), "price": 3.50}], - }, - }, - "gemini-2.5-flash-preview": { - "input": [{"up_to": float('inf'), "price": 0.15}], - "output": { - "non_thinking": [{"up_to": float('inf'), "price": 0.60}], - "thinking": [{"up_to": float('inf'), "price": 3.50}], - }, - }, - "openai:o4-mini": { - "input": [{"up_to": float('inf'), "price": 1.10}], - "output": {"default": [{"up_to": float('inf'), "price": 4.40}]}, - }, - "openai:o3": { - "input": [{"up_to": float('inf'), "price": 10.00}], - "output": {"default": [{"up_to": float('inf'), "price": 40.00}]}, - }, - "openai:gpt-4.1": { - "input": [{"up_to": float('inf'), "price": 2.00}], - "output": {"default": [{"up_to": float('inf'), "price": 8.00}]}, - }, - "openai:gpt-4.1-mini": { - "input": [{"up_to": float('inf'), "price": 0.40}], - "output": {"default": [{"up_to": float('inf'), "price": 1.60}]}, - }, - "openai:gpt-4.1-nano": { - "input": [{"up_to": float('inf'), "price": 0.10}], - "output": {"default": [{"up_to": float('inf'), "price": 0.40}]}, - }, -} \ No newline at end of file diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json new file mode 100644 index 0000000..df86d38 --- /dev/null +++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.json @@ -0,0 +1,158 @@ +{ + "_comment": "Prices are per 1 million tokens", + "_sources": { + "gemini": "https://ai.google.dev/gemini-api/docs/pricing", + "openai": "https://openai.com/api/pricing/" + }, + "model_costs": { + "gemini-2.5-pro-preview-03-25": { + "friendly_name": "Gemini 2.5 Pro Preview (Mar)", + "input": [ + {"up_to": 200000, "price": 1.25}, + {"up_to": "inf", "price": 2.50} + ], + "output": { + "default": [ + {"up_to": 200000, "price": 10.00}, + {"up_to": "inf", 
"price": 15.00} + ] + } + }, + "gemini-2.5-pro-preview-05-06": { + "friendly_name": "Gemini 2.5 Pro Preview (May)", + "input": [ + {"up_to": 200000, "price": 1.25}, + {"up_to": "inf", "price": 2.50} + ], + "output": { + "default": [ + {"up_to": 200000, "price": 10.00}, + {"up_to": "inf", "price": 15.00} + ] + } + }, + "gemini-2.5-pro-preview-06-05": { + "friendly_name": "Gemini 2.5 Pro Preview (Jun)", + "input": [ + {"up_to": 200000, "price": 1.25}, + {"up_to": "inf", "price": 2.50} + ], + "output": { + "default": [ + {"up_to": 200000, "price": 10.00}, + {"up_to": "inf", "price": 15.00} + ] + } + }, + "gemini-2.5-pro-preview": { + "friendly_name": "Gemini 2.5 Pro Preview", + "input": [ + {"up_to": 200000, "price": 1.25}, + {"up_to": "inf", "price": 2.50} + ], + "output": { + "default": [ + {"up_to": 200000, "price": 10.00}, + {"up_to": "inf", "price": 15.00} + ] + } + }, + "gemini-1.5-pro": { + "friendly_name": "Gemini 1.5 Pro", + "input": [ + {"up_to": 128000, "price": 1.25}, + {"up_to": "inf", "price": 2.50} + ], + "output": { + "default": [ + {"up_to": 128000, "price": 5.00}, + {"up_to": "inf", "price": 10.00} + ] + } + }, + "gemini-1.5-flash": { + "friendly_name": "Gemini 1.5 Flash", + "input": [ + {"up_to": 128000, "price": 0.075}, + {"up_to": "inf", "price": 0.15} + ], + "output": { + "default": [ + {"up_to": 128000, "price": 0.30}, + {"up_to": "inf", "price": 0.60} + ] + } + }, + "gemini-2.0-flash": { + "friendly_name": "Gemini 2.0 Flash", + "input": [{"up_to": "inf", "price": 0.10}], + "output": {"default": [{"up_to": "inf", "price": 0.40}]} + }, + "gemini-2.5-flash-preview-04-17": { + "friendly_name": "Gemini 2.5 Flash Preview (Apr)", + "input": [{"up_to": "inf", "price": 0.15}], + "output": { + "non_thinking": [{"up_to": "inf", "price": 0.60}], + "thinking": [{"up_to": "inf", "price": 3.50}] + } + }, + "gemini-2.5-flash-preview": { + "friendly_name": "Gemini 2.5 Flash Preview", + "input": [{"up_to": "inf", "price": 0.15}], + "output": { + "non_thinking": 
[{"up_to": "inf", "price": 0.60}], + "thinking": [{"up_to": "inf", "price": 3.50}] + } + }, + "openai:o4-mini": { + "friendly_name": "OpenAI o4-mini", + "input": [{"up_to": "inf", "price": 1.10}], + "output": {"default": [{"up_to": "inf", "price": 4.40}]} + }, + "openai:o3": { + "friendly_name": "OpenAI o3", + "input": [{"up_to": "inf", "price": 10.00}], + "output": {"default": [{"up_to": "inf", "price": 40.00}]} + }, + "openai:gpt-4.1": { + "friendly_name": "GPT-4.1", + "input": [{"up_to": "inf", "price": 2.00}], + "output": {"default": [{"up_to": "inf", "price": 8.00}]} + }, + "openai:gpt-4.1-mini": { + "friendly_name": "GPT-4.1 Mini", + "input": [{"up_to": "inf", "price": 0.40}], + "output": {"default": [{"up_to": "inf", "price": 1.60}]} + }, + "openai:gpt-4.1-nano": { + "friendly_name": "GPT-4.1 Nano", + "input": [{"up_to": "inf", "price": 0.10}], + "output": {"default": [{"up_to": "inf", "price": 0.40}]} + }, + "bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0": { + "friendly_name": "Claude 4 Sonnet", + "input": [{"up_to": "inf", "price": 3.00}], + "output": {"default": [{"up_to": "inf", "price": 15.00}]} + }, + "bedrock:us.anthropic.claude-opus-4-20250514-v1:0": { + "friendly_name": "Claude 4 Opus", + "input": [{"up_to": "inf", "price": 15.00}], + "output": {"default": [{"up_to": "inf", "price": 75.00}]} + }, + "bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0": { + "friendly_name": "Claude 3.7 Sonnet", + "input": [{"up_to": "inf", "price": 3.00}], + "output": {"default": [{"up_to": "inf", "price": 15.00}]} + }, + "bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0": { + "friendly_name": "Claude 3.5 Sonnet", + "input": [{"up_to": "inf", "price": 3.00}], + "output": {"default": [{"up_to": "inf", "price": 15.00}]} + }, + "bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0": { + "friendly_name": "Claude 3.5 Haiku", + "input": [{"up_to": "inf", "price": 1.00}], + "output": {"default": [{"up_to": "inf", "price": 4.00}]} + } + } +} \ No newline at end of 
file diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py index 8167d96..24cf37d 100644 --- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py +++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/dashboard_config.py @@ -19,8 +19,16 @@ MERBENCH_CONFIG = { # --- General Dashboard Settings --- - "title": "Merbench - LLM Evaluation Benchmark", - "icon": "🏆", # Emoji for the browser tab + "title": "🧜‍♀️ Merbench - LLM Evaluation ", + "description": ( + "Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. " + "\n\nMerbench tests this ability by providing an LLM Agent access to an MCP server that both validates " + "and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), " + "and the LLM is given a fixed number of attempts to fix the diagram; if this is exceeded, the test case is considered failed. " + "\n\n **Performance is a measure of both tool usage and Mermaid syntax understanding.**" + "\n\nThe leaderboard shows the average success rate across all selected models and difficulty levels over *n runs*." + ), + "icon": "🧜‍♀️", # Emoji for the browser tab # --- Primary Metric Configuration --- # The primary metric is the main score used for the leaderboard and # the y-axis of the Pareto frontier plot. 
@@ -50,6 +58,7 @@ "x_axis_options": { "cost": {"column": "total_cost", "label": "Cost"}, "tokens": {"column": "total_response_tokens", "label": "Tokens"}, + "duration": {"column": "Duration", "label": "Duration"}, }, "color_axis": "Duration", # Column to use for the color scale }, diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py index d1a9338..c777290 100644 --- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py +++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py @@ -49,12 +49,7 @@ logfire.instrument_pydantic_ai() # Default model configurations -DEFAULT_MODEL = "gemini-2.5-pro-preview-05-06" -DEFAULT_MODELS = [ - "gemini-2.5-pro-preview-06-05", - "gemini-2.0-flash", - "gemini-2.5-flash-preview-04-17", -] +DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05" # Retry configuration RETRYABLE_HTTP_STATUS_CODES = {429, 500, 502, 503, 504} @@ -214,6 +209,30 @@ def create_agent( if model_settings is None: model_settings = {} + # Handle Bedrock models specifically + if model.startswith("bedrock:"): + from pydantic_ai.models.bedrock import BedrockConverseModel + from pydantic_ai.providers.bedrock import BedrockProvider + + # Extract the model name (remove "bedrock:" prefix) + model_name = model.replace("bedrock:", "") + + # Create BedrockConverseModel with proper region and profile configuration + bedrock_model = BedrockConverseModel( + model_name, + provider=BedrockProvider( + region_name=os.getenv("AWS_REGION", "us-east-1"), + profile_name=os.getenv("AWS_PROFILE", "my-aws-profile"), + ), + ) + + return Agent( + bedrock_model, + mcp_servers=get_mcp_servers(), + model_settings=model_settings, + ) + + # For non-Bedrock models, use the original approach return Agent( model, mcp_servers=get_mcp_servers(), @@ -765,9 +784,9 @@ async def fix_with_model(inputs: MermaidInput) -> MermaidOutput: if __name__ == "__main__": # You can use different models for the 
agent and the judge # agent_model = os.getenv("AGENT_MODEL", DEFAULT_MODEL) - agent_model = "gemini-2.5-pro-preview-06-05" + # agent_model = "gemini-2.5-pro-preview-06-05" # agent_model = "openai:o4-mini" - # agent_model = "gemini-2.5-flash-preview-04-17" + agent_model = "gemini-2.5-flash-preview-04-17" judge_model = os.getenv("JUDGE_MODEL", DEFAULT_MODEL) async def run_all(): diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py index 4d2240d..20e2864 100644 --- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py +++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py @@ -29,40 +29,65 @@ # --- Cost Loading --- -def load_model_costs(file_path: str) -> Dict: - """Loads model costs from a CSV file and returns a structured dictionary. +def load_model_costs(file_path: str) -> tuple[Dict, Dict]: + """Loads model costs and friendly names from a JSON file and returns structured dictionaries. Args: - file_path: The path to the cost file. + file_path: The path to the cost file (JSON or CSV). Returns: - A dictionary containing the model costs. + A tuple containing (model_costs_dict, friendly_names_dict). 
""" + import json + + # Try JSON first (new format), then fall back to CSV (old format) + json_path = file_path.replace('.csv', '.json') + try: - with open(file_path, "r", encoding="utf-8") as f: - # Read lines, skipping comments and empty lines - lines = [ - line for line in f if not line.strip().startswith("#") and line.strip() - ] - - # Find the start of the dictionary-like definition - dict_str = "".join(lines) - match = re.search(r"MODEL_COSTS\s*=\s*({.*})", dict_str, re.DOTALL) - if not match: - st.error(f"Could not find 'MODEL_COSTS' dictionary in {file_path}") - return {} - - # Safely evaluate the dictionary string - model_costs_raw = eval(match.group(1), {"float": float}) - - return model_costs_raw - + # Try to load JSON format first + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + model_costs_raw = data["model_costs"] + + # Extract friendly names and clean cost data + friendly_names = {} + model_costs_clean = {} + + for model_id, model_data in model_costs_raw.items(): + # Extract friendly name if it exists + if isinstance(model_data, dict) and "friendly_name" in model_data: + friendly_names[model_id] = model_data["friendly_name"] + # Create a clean copy without the friendly_name for cost calculations + model_costs_clean[model_id] = { + key: _convert_inf_strings(value) if key in ["input", "output"] else value + for key, value in model_data.items() + if key != "friendly_name" + } + else: + # No friendly name, use model_id as fallback + friendly_names[model_id] = model_id + model_costs_clean[model_id] = _convert_inf_strings(model_data) + + return model_costs_clean, friendly_names + except FileNotFoundError: - st.warning(f"Cost file not found at {file_path}. Using empty cost config.") - return {} - except (SyntaxError, NameError, Exception) as e: - st.error(f"Error parsing cost file {file_path}: {e}") - return {} + st.warning(f"Cost file not found at {json_path}. 
Using empty cost config.") + return {}, {} + except (json.JSONDecodeError, KeyError, Exception) as e: + st.error(f"Error parsing JSON cost file {json_path}: {e}") + return {}, {} + + +def _convert_inf_strings(data): + """Recursively convert 'inf' strings to float('inf') in nested data structures.""" + if isinstance(data, dict): + return {key: _convert_inf_strings(value) for key, value in data.items()} + elif isinstance(data, list): + return [_convert_inf_strings(item) for item in data] + elif data == "inf": + return float('inf') + else: + return data # --- Data Loading and Processing --- @@ -388,8 +413,76 @@ def create_leaderboard( return leaderboard.sort_values("Correct", ascending=sort_ascending) +def _calculate_smart_label_positions( + x_data, y_data, labels, min_distance_threshold=0.1 +): + """Calculate optimal label positions to avoid overlaps. + + Args: + x_data: Array of x coordinates (normalized to 0-1 range for distance calc) + y_data: Array of y coordinates (normalized to 0-1 range for distance calc) + labels: Array of label strings + min_distance_threshold: Minimum distance threshold for considering overlap + + Returns: + List of textposition strings for each point + """ + import numpy as np + + # Normalize coordinates to 0-1 range for distance calculations + x_norm = ( + (x_data - x_data.min()) / (x_data.max() - x_data.min()) + if x_data.max() != x_data.min() + else x_data * 0 + ) + y_norm = ( + (y_data - y_data.min()) / (y_data.max() - y_data.min()) + if y_data.max() != y_data.min() + else y_data * 0 + ) + + positions = ["top center"] * len(x_data) + position_options = [ + "top center", + "bottom center", + "middle left", + "middle right", + "top left", + "top right", + "bottom left", + "bottom right", + ] + + # Calculate distances between all pairs of points + for i in range(len(x_data)): + for j in range(i + 1, len(x_data)): + distance = np.sqrt( + (x_norm[i] - x_norm[j]) ** 2 + (y_norm[i] - y_norm[j]) ** 2 + ) + + if distance < 
min_distance_threshold: + # Points are close, try different positions + for pos_idx, position in enumerate(position_options): + if positions[i] == "top center": + positions[i] = position_options[pos_idx % len(position_options)] + break + + for pos_idx, position in enumerate(position_options): + if positions[j] == "top center" or positions[j] == positions[i]: + positions[j] = position_options[ + (pos_idx + 1) % len(position_options) + ] + break + + return positions + + def create_pareto_frontier_plot( - df: pd.DataFrame, selected_groups: List[str], x_axis_mode: str, config: Dict + df: pd.DataFrame, + selected_groups: List[str], + x_axis_mode: str, + config: Dict, + friendly_names: Dict = None, ) -> go.Figure: """Visualizes the trade-off between model performance and cost/token usage. @@ -419,6 +512,7 @@ def create_pareto_frontier_plot( y_axis=(primary_metric_name, "mean"), total_cost=("total_cost", "mean"), total_response_tokens=("total_response_tokens", "mean"), + Duration=("Duration", "mean"), color_axis=(plot_config["color_axis"], "mean"), ) .reset_index() @@ -428,31 +522,71 @@ def create_pareto_frontier_plot( x_data = model_metrics[x_axis_config["column"]] x_title = x_axis_config["label"] hover_label = x_axis_config["label"] - hover_format = ":.4f" if x_axis_mode == "cost" else ":.0f" - - fig.add_trace( - go.Scatter( - x=x_data, - y=model_metrics["y_axis"], - mode="markers+text", - marker=dict( - size=18, - color=model_metrics["color_axis"], - colorscale="RdYlGn_r", - showscale=True, - colorbar=dict(title=f"Avg {plot_config['color_axis']} (s)"), - ), - text=model_metrics["Model"], - textposition="top center", - hovertemplate=( - "%{text}
" - f"{y_axis_label}: %{{y:.1f}}%
" - f"{hover_label}: %{{x{hover_format}}}
" - f"Avg {plot_config['color_axis']}: %{{marker.color:.1f}}s" - ), - ) + if x_axis_mode == "cost": + hover_format = ":.4f" + elif x_axis_mode == "duration": + hover_format = ":.2f" + else: + hover_format = ":.0f" + + # Calculate smart label positions to avoid overlaps + label_positions = _calculate_smart_label_positions( + x_data.values, model_metrics["y_axis"].values, model_metrics["Model"].values ) + # Group data by text position to create separate traces + from collections import defaultdict + + position_groups = defaultdict(list) + + for i, position in enumerate(label_positions): + position_groups[position].append(i) + + # Create a trace for each text position group + first_trace = True + for position, indices in position_groups.items(): + x_vals = [x_data.iloc[i] for i in indices] + y_vals = [model_metrics["y_axis"].iloc[i] for i in indices] + colors = [model_metrics["color_axis"].iloc[i] for i in indices] + + # Get model names for this position group + original_names = [model_metrics["Model"].iloc[i] for i in indices] + + # Use friendly names for display if available, otherwise use original names + if friendly_names: + display_texts = [friendly_names.get(name, name) for name in original_names] + else: + display_texts = original_names + + fig.add_trace( + go.Scatter( + x=x_vals, + y=y_vals, + mode="markers+text", + marker=dict( + size=18, + color=colors, + colorscale="RdYlGn_r", + showscale=first_trace, # Show colorbar only on first trace + colorbar=dict(title=f"Avg {plot_config['color_axis']} (s)") + if first_trace + else None, + ), + text=display_texts, # Use friendly names for display + textposition=position, + customdata=original_names, # Store original names for hover + hovertemplate=( + "%{text}
" # Friendly name as title + "API Name: %{customdata}
" # Original API name + f"{y_axis_label}: %{{y:.1f}}%
" + f"{hover_label}: %{{x{hover_format}}}
" + f"Avg {plot_config['color_axis']}: %{{marker.color:.1f}}s" + ), + showlegend=False, # Don't show legend for individual position groups + ) + ) + first_trace = False + fig.update_layout( title=plot_config["title"].format(x_axis_label=x_title), xaxis_title=f"Average {x_title}", @@ -653,7 +787,7 @@ def main() -> None: eval_config = EVAL_CONFIG # Use the validated config st.title(eval_config.title) - st.subheader("LLM Evaluation Benchmark Dashboard") + st.markdown(eval_config.description) # --- Sidebar Setup --- st.sidebar.header("⚙️ Data Configuration") @@ -708,7 +842,7 @@ def main() -> None: # Cost configuration in sidebar st.sidebar.subheader("💰 Cost Configuration") cost_file_path = os.path.join(os.path.dirname(__file__), "costs.csv") - model_costs = load_model_costs(cost_file_path) + model_costs, friendly_names = load_model_costs(cost_file_path) available_models = sorted(df_initial["Model"].unique()) cost_config = {} @@ -781,7 +915,7 @@ def main() -> None: cols[3].metric("Files Loaded", len(selected_files)) st.info( - f"**Showing results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}" + f"**Showing averaged results for {grouping_config.label.lower()}:** {', '.join(selected_groups) if selected_groups else 'None'}" ) # --- Leaderboard & Pareto --- @@ -817,7 +951,7 @@ def main() -> None: else: st.warning("No data available for the current filter selection.") - st.header("📈 Pareto Frontier Analysis") + st.header("📈 Pareto Frontier") pareto_config = eval_config.plots.pareto x_axis_mode = st.radio( "Compare performance against:", @@ -827,7 +961,7 @@ def main() -> None: ) st.plotly_chart( create_pareto_frontier_plot( - df, selected_groups, x_axis_mode, eval_config.model_dump() + df, selected_groups, x_axis_mode, eval_config.model_dump(), friendly_names ), use_container_width=True, ) diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py 
index 8f56ba1..29516b7 100644 --- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py +++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py @@ -34,7 +34,6 @@ # Import shared functionality from the improved evals module from agents_mcp_usage.multi_mcp.eval_multi_mcp.evals_pydantic_mcp import ( - DEFAULT_MODELS, MermaidInput, MermaidOutput, fix_mermaid_diagram, @@ -44,6 +43,23 @@ load_dotenv() +DEFAULT_MODELS = [ + # "gemini-2.5-pro-preview-06-05", + # "gemini-2.5-pro-preview-05-06", + # "gemini-2.5-pro-preview-03-25", + "gemini-2.0-flash", + "gemini-2.5-flash-preview-04-17", + # "openai:o4-mini", + # "openai:gpt-4.1", + # "openai:gpt-4.1-mini", + # "openai:gpt-4.1-nano", + # "bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0", + # "bedrock:us.anthropic.claude-opus-4-20250514-v1:0", + # "bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0", + # "bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0", + # "bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0", +] + logfire.configure( send_to_logfire="if-token-present", service_name="multi-model-mermaid-evals" ) @@ -496,13 +512,13 @@ async def main() -> None: parser.add_argument( "--judge-model", type=str, - default="gemini-2.5-pro-preview-03-25", + default="gemini-2.5-pro-preview-06-05", help="Model to use for LLM judging", ) parser.add_argument( "--parallel", action="store_true", - default=True, + default=False, help="Run evaluations in parallel", ) parser.add_argument( diff --git a/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py b/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py index dc8059b..cbc2f1c 100644 --- a/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py +++ b/agents_mcp_usage/multi_mcp/eval_multi_mcp/schemas.py @@ -2,8 +2,8 @@ Pydantic Schemas for Dashboard Configuration Validation """ -from pydantic import BaseModel, Field, validator -from typing import List, Dict, Optional, Any +from pydantic import BaseModel, Field, field_validator, model_validator +from typing import 
List, Dict, Optional class PrimaryMetricConfig(BaseModel): @@ -15,7 +15,8 @@ class PrimaryMetricConfig(BaseModel): description="Optional source column to calculate the primary metric from if it doesn't exist.", ) - @validator("goal") + @field_validator("goal") + @classmethod def goal_must_be_max_or_min(cls, v: str) -> str: """Validates that the goal is either 'maximize' or 'minimize'.""" if v not in ["maximize", "minimize"]: @@ -65,17 +66,15 @@ class BarPlotConfig(BaseModel): y_columns: Optional[List[str]] = None series: Optional[List[StackedBarPlotSeries]] = None - @validator("y_columns", always=True) - def check_prefix_or_columns( - cls, v: Optional[List[str]], values: Dict[str, Any] - ) -> Optional[List[str]]: + @model_validator(mode="after") + def check_prefix_or_columns(self) -> "BarPlotConfig": """Validates that either 'y_prefix' or 'y_columns' is provided for grouped_bar plots.""" - if not values.get("y_prefix") and not v: - if values.get("type") == "grouped_bar": + if not self.y_prefix and not self.y_columns: + if self.type == "grouped_bar": raise ValueError( "Either 'y_prefix' or 'y_columns' must be provided for grouped_bar plots." ) - return v + return self class PlotConfig(BaseModel): @@ -93,6 +92,7 @@ class CostCalculationConfig(BaseModel): class DashboardConfig(BaseModel): title: str + description: str icon: str primary_metric: PrimaryMetricConfig grouping: GroupingConfig diff --git a/agents_mcp_usage/multi_mcp/mermaid_diagrams.py b/agents_mcp_usage/multi_mcp/mermaid_diagrams.py index 485c993..7d14012 100644 --- a/agents_mcp_usage/multi_mcp/mermaid_diagrams.py +++ b/agents_mcp_usage/multi_mcp/mermaid_diagrams.py @@ -5,6 +5,7 @@ # Agent Frameworks subgraph "Agent" + direction TD Agent[Agent] ADK["Google ADK
(adk_mcp.py)"] LG["LangGraph
(langgraph_mcp.py)"] @@ -21,7 +22,7 @@ subgraph "MCP" direction TD MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"] - Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"] + Tools["Tools
- add(a, b)
- get_current_time() {current_time}"] Resources["Resources
- greeting://{{name}}"] MCP --- Tools MCP --- Resources @@ -38,7 +39,7 @@ Logfire[("Logfire
Tracing")] ADK --> MCP - LG --> MCP + LG -- > MCP OAI --> MCP PYD --> MCP @@ -47,7 +48,7 @@ MCP --> OTHER ADK --> Logfire - LG --> Logfire + LG -- > Logfire OAI --> Logfire PYD --> Logfire @@ -63,6 +64,7 @@ ``` """ +# 7 syntax errors invalid_mermaid_diagram_medium = """ ```mermaid graph LR @@ -87,13 +89,15 @@ subgraph "MCP" direction TB MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"] - Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"] + Tools["Tools
- add(a, b)
- get_current_time() {current_time}"] Resources["Resources
- greeting://{{name}}"] MCP --- Tools MCP --- Resources end + # LLM Providers subgraph "LLM Providers" + direction TB OAI_LLM["OpenAI Models"] GEM["Google Gemini Models"] OTHER["Other LLM Providers..."] @@ -102,7 +106,7 @@ Logfire[("Logfire
Tracing")] ADK --> MCP - LG --> MCP + LG -- > MCP OAI --> MCP PYD --> MCP @@ -111,7 +115,7 @@ MCP --> OTHER ADK --> Logfire - LG --> Logfire + LG -- > Logfire OAI --> Logfire PYD --> Logfire @@ -127,6 +131,7 @@ ``` """ +# 2 syntax errors invalid_mermaid_diagram_easy = """ ```mermaid graph LR @@ -148,16 +153,18 @@ end %% MCP Server - subgraph "MCP Server" + subgraph "MCP" direction TB MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"] - Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"] + Tools["Tools
- add(a, b)
- get_current_time() {current_time}"] Resources["Resources
- greeting://{{name}}"] - MCPs --- Tools - MCPs --- Resources + MCP --- Tools + MCP --- Resources end + %% LLM Providers subgraph "LLM Providers" + direction TB OAI_LLM["OpenAI Models"] GEM["Google Gemini Models"] OTHER["Other LLM Providers..."] @@ -171,7 +178,7 @@ PYD --> MCP MCP --> OAI_LLM - MCP --> GEM + MCP --> GEMINI MCP --> OTHER ADK --> Logfire @@ -191,7 +198,7 @@ ``` """ -valid_mermaid_diagram = """` +valid_mermaid_diagram = """ ```mermaid graph LR User((User)) --> |"Run script
(e.g., pydantic_mcp.py)"| Agent @@ -215,13 +222,15 @@ subgraph "MCP Server" direction TB MCP["Model Context Protocol Server
(mcp_servers/example_server.py)"] - Tools["Tools
- add(a, b)
- get_current_time() e.g. {current_time}"] + Tools["Tools
- add(a, b)
- get_current_time() {current_time}"] Resources["Resources
- greeting://{{name}}"] MCP --- Tools MCP --- Resources end + %% LLM Providers subgraph "LLM Providers" + direction TB OAI_LLM["OpenAI Models"] GEM["Google Gemini Models"] OTHER["Other LLM Providers..."] diff --git a/pyproject.toml b/pyproject.toml index 23be2b7..600a8df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "openai-agents>=0.0.12", "pandas>=2.3.0", "plotly>=6.1.2", - "pydantic-ai-slim[mcp]>=0.2.15", + "pydantic-ai-slim[bedrock,mcp]>=0.2.15", "pydantic-evals[logfire]>=0.2.15", "python-dotenv>=1.1.0", "ruff>=0.11.10", diff --git a/uv.lock b/uv.lock index 5c3c391..100ec3c 100644 --- a/uv.lock +++ b/uv.lock @@ -125,6 +125,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, ] +[[package]] +name = "boto3" +version = "1.38.34" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cb/2f/4547f4b3cc7b63ec1266edaf923a675f3eae2057299aba8ecfe46f013c1a/boto3-1.38.34.tar.gz", hash = "sha256:25e76b9fec8db8e21adaf84df0de5c58fa779be121bc327e07e920c7c0870394", size = 111793, upload-time = "2025-06-10T19:26:47.176Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/9b/5fe964a2e11f5b884d40fd5d90421faef04521f7866f6e473e3e28df583b/boto3-1.38.34-py3-none-any.whl", hash = "sha256:7d9409be63a11c1684427a9b06d6820ec72785cb275b56affe437f3709a80eb3", size = 139938, upload-time = "2025-06-10T19:26:43.781Z" }, +] + +[[package]] +name = "botocore" +version = "1.38.34" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/08/27/34dfe90dd51d16473985198e056b82d74afd355dc47daa10af2a4f117072/botocore-1.38.34.tar.gz", hash = "sha256:a105f4d941f329aa72c43ddf42371ec4bee50ab3619fc1ef35d0005520219612", size = 13953563, upload-time = "2025-06-10T19:26:35.379Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/17/d9611cf624ec274ed83139cf4446f631dbbb51cbfe51336db7b27a724dd7/botocore-1.38.34-py3-none-any.whl", hash = "sha256:95ff2c4819498e94b321c9b5ac65d02267df93ff7ce7617323b19f19ea7cb545", size = 13614470, upload-time = "2025-06-10T19:26:31.484Z" }, +] + [[package]] name = "cachetools" version = "5.5.2" @@ -472,6 +500,9 @@ dependencies = [ { name = "uritemplate" }, ] sdist = { url = "https://files.pythonhosted.org/packages/35/99/237cd2510aecca9fabb54007e58553274cc43cb3c18512ee1ea574d11b87/google_api_python_client-2.171.0.tar.gz", hash = "sha256:057a5c08d28463c6b9eb89746355de5f14b7ed27a65c11fdbf1d06c66bb66b23", size = 13028937, upload-time = "2025-06-03T18:57:38.732Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/79/db/c397e3eb3ea18f423855479d0a5852bdc9c3f644e3d4194931fa664a70b4/google_api_python_client-2.171.0-py3-none-any.whl", hash = "sha256:c9c9b76f561e9d9ac14e54a9e2c0842876201d5b96e69e48f967373f0784cbe9", size = 13547393, upload-time = "2025-06-10T02:14:38.225Z" }, +] [[package]] name = "google-auth" @@ -643,6 +674,9 @@ dependencies = [ { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/58/7a/2fa6735ec693d822fe08a76709c4d95d9b5b4c02e83e720497355039d2ee/google_cloud_secret_manager-2.24.0.tar.gz", hash = "sha256:ce573d40ffc2fb7d01719243a94ee17aa243ea642a6ae6c337501e58fbf642b5", size = 269516, upload-time = "2025-06-05T22:22:22.965Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/af/db1217cae1809e69a4527ee6293b82a9af2a1fb2313ad110c775e8f3c820/google_cloud_secret_manager-2.24.0-py3-none-any.whl", hash = 
"sha256:9bea1254827ecc14874bc86c63b899489f8f50bfe1442bfb2517530b30b3a89b", size = 218050, upload-time = "2025-06-10T02:02:19.88Z" }, +] [[package]] name = "google-cloud-speech" @@ -971,6 +1005,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" }, ] +[[package]] +name = "jmespath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, +] + [[package]] name = "jsonpatch" version = "1.33" @@ -1842,7 +1885,7 @@ dependencies = [ { name = "openai-agents" }, { name = "pandas" }, { name = "plotly" }, - { name = "pydantic-ai-slim", extra = ["mcp"] }, + { name = "pydantic-ai-slim", extra = ["bedrock", "mcp"] }, { name = "pydantic-evals", extra = ["logfire"] }, { name = "python-dotenv" }, { name = "ruff" }, @@ -1864,7 +1907,7 @@ requires-dist = [ { name = "openai-agents", specifier = ">=0.0.12" }, { name = "pandas", specifier = ">=2.3.0" }, { name = "plotly", specifier = ">=6.1.2" }, - { name = "pydantic-ai-slim", extras = ["mcp"], specifier = ">=0.2.15" }, + { name = "pydantic-ai-slim", extras = ["bedrock", "mcp"], specifier = ">=0.2.15" }, { name = "pydantic-evals", extras = ["logfire"], 
specifier = ">=0.2.15" }, { name = "python-dotenv", specifier = ">=1.1.0" }, { name = "ruff", specifier = ">=0.11.10" }, @@ -1890,6 +1933,9 @@ wheels = [ ] [package.optional-dependencies] +bedrock = [ + { name = "boto3" }, +] mcp = [ { name = "mcp" }, ] @@ -2185,6 +2231,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/bf/b273dd11673fed8a6bd46032c0ea2a04b2ac9bfa9c628756a5856ba113b0/ruff-0.11.13-py3-none-win_arm64.whl", hash = "sha256:b4385285e9179d608ff1d2fb9922062663c658605819a6876d8beef0c30b7f3b", size = 10683928, upload-time = "2025-06-05T21:00:13.758Z" }, ] +[[package]] +name = "s3transfer" +version = "0.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/5d/9dcc100abc6711e8247af5aa561fc07c4a046f72f659c3adea9a449e191a/s3transfer-0.13.0.tar.gz", hash = "sha256:f5e6db74eb7776a37208001113ea7aa97695368242b364d73e91c981ac522177", size = 150232, upload-time = "2025-05-22T19:24:50.245Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/17/22bf8155aa0ea2305eefa3a6402e040df7ebe512d1310165eda1e233c3f8/s3transfer-0.13.0-py3-none-any.whl", hash = "sha256:0148ef34d6dd964d0d8cf4311b2b21c474693e57c2e069ec708ce043d2b527be", size = 85152, upload-time = "2025-05-22T19:24:48.703Z" }, +] + [[package]] name = "shapely" version = "2.1.1"