2 changes: 1 addition & 1 deletion Makefile
@@ -6,7 +6,7 @@ lint:
uv run ruff check .

leaderboard:
-uv run -- streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
+uv run -- streamlit run agents_mcp_usage/evaluations/mermaid_evals/merbench_ui.py

adk_basic_ui:
uv run adk web agents_mcp_usage/basic_mcp
23 changes: 14 additions & 9 deletions README.md
@@ -9,7 +9,8 @@ This repository demonstrates LLM Agents using tools from Model Context Protocol
## Repository Structure

- [Agent with a single MCP server](agents_mcp_usage/basic_mcp/README.md) - Learning examples and basic patterns
-- [Agent with multiple MCP servers](agents_mcp_usage/multi_mcp/README.md) - Advanced usage with comprehensive evaluation suite
+- [Agent with multiple MCP servers](agents_mcp_usage/multi_mcp/README.md) - Advanced usage with MCP server coordination
+- [Evaluation suite](agents_mcp_usage/evaluations/mermaid_evals/README.md) - Comprehensive benchmarking tools
- **Evaluation Dashboard**: Interactive Streamlit UI for model comparison
- **Multi-Model Benchmarking**: Parallel/sequential evaluation across multiple LLMs
- **Rich Metrics**: Usage analysis, cost comparison, and performance leaderboards
@@ -67,8 +68,12 @@ This project aims to teach:
- **[agents_mcp_usage/multi_mcp/](agents_mcp_usage/multi_mcp/)** - Advanced multi-MCP server integration examples
- **multi_mcp_use/** - Contains examples of using multiple MCP servers simultaneously:
- `pydantic_mcp.py` - Example of using multiple MCP servers with Pydantic-AI Agent
-- **eval_multi_mcp/** - Contains evaluation examples for multi-MCP usage:
-- `evals_pydantic_mcp.py` - Example of evaluating the use of multiple MCP servers with Pydantic-AI

+- **[agents_mcp_usage/evaluations/](agents_mcp_usage/evaluations/)** - Evaluation modules for benchmarking
+- **mermaid_evals/** - Comprehensive evaluation suite for mermaid diagram fixing tasks
+- `evals_pydantic_mcp.py` - Core evaluation module for single-model testing
+- `run_multi_evals.py` - Multi-model benchmarking with parallel execution
+- `merbench_ui.py` - Interactive dashboard for result visualization

- **Demo Python MCP Servers**
- `mcp_servers/example_server.py` - Simple MCP server that runs locally, implemented in Python
@@ -221,13 +226,13 @@ graph LR
uv run agents_mcp_usage/multi_mcp/multi_mcp_use/pydantic_mcp.py

# Run the multi-MCP evaluation
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
+uv run agents_mcp_usage/evaluations/mermaid_evals/evals_pydantic_mcp.py

# Run multi-model benchmarking
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py --models "gemini-2.5-pro-preview-06-05,gemini-2.0-flash" --runs 5 --parallel
+uv run agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py --models "gemini-2.5-pro-preview-06-05,gemini-2.0-flash" --runs 5 --parallel

# Launch the evaluation dashboard
-uv run streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
+uv run streamlit run agents_mcp_usage/evaluations/mermaid_evals/merbench_ui.py
```

More details on multi-MCP implementation can be found in the [multi_mcp README](agents_mcp_usage/multi_mcp/README.md).
@@ -260,17 +265,17 @@ The included Streamlit dashboard (`merbench_ui.py`) provides:

```bash
# Single model evaluation
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
+uv run agents_mcp_usage/evaluations/mermaid_evals/evals_pydantic_mcp.py

# Multi-model parallel benchmarking
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py \
+uv run agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py \
--models "gemini-2.5-pro-preview-06-05,gemini-2.0-flash,gemini-2.5-flash" \
--runs 5 \
--parallel \
--output-dir ./results

# Launch interactive dashboard
-uv run streamlit run agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py
+uv run streamlit run agents_mcp_usage/evaluations/mermaid_evals/merbench_ui.py
```
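For a quick look at the exported results outside the dashboard, a minimal sketch along these lines can be used. It assumes the CSVs written to `--output-dir` carry a per-run model column; the exact column names are not shown in this PR, so inspect `df.columns` against the real exports first.

```python
# Minimal sketch: aggregate the CSVs produced by run_multi_evals.py.
# Column names here are assumptions, not the project's documented schema.
from pathlib import Path

import pandas as pd

paths = sorted(Path("./results").glob("*.csv"))  # the --output-dir used above
if not paths:
    raise SystemExit("No result CSVs found in ./results")

df = pd.concat((pd.read_csv(p) for p in paths), ignore_index=True)

print(df.columns.tolist())         # discover the actual schema first
print(df.groupby("Model").size())  # assumed "Model" column, as used by the dashboard
```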

The evaluation system enables robust, repeatable benchmarking across LLM models and agent frameworks, supporting both research and production model selection decisions.
Empty file.
@@ -1,4 +1,4 @@
-# Multi-MCP Mermaid Diagram Evaluation System
+# Mermaid Diagram Evaluation System

This directory contains evaluation modules for testing LLM agents on mermaid diagram fixing tasks using multiple MCP (Model Context Protocol) servers. The system evaluates how well language models can fix invalid mermaid diagrams while utilizing multiple external tools.

@@ -21,7 +21,7 @@ The system tests LLM agents on their ability to:

The evaluation includes three test cases of increasing difficulty:
1. **Easy** - Simple syntax errors in mermaid diagrams
-2. **Medium** - More complex structural issues
+2. **Medium** - More complex structural issues
3. **Hard** - Advanced mermaid syntax problems
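For orientation, here is a minimal sketch of how these three cases could be wired into a `pydantic-evals` dataset using the invalid diagram fixtures imported by `evals_pydantic_mcp.py` (visible in a later hunk). The actual case construction, inputs, and judge rubric live in that module and may differ.

```python
# Sketch only: the real dataset, inputs, and rubric are defined in evals_pydantic_mcp.py.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

from agents_mcp_usage.evaluations.mermaid_evals.mermaid_diagrams import (
    invalid_mermaid_diagram_easy,
    invalid_mermaid_diagram_medium,
    invalid_mermaid_diagram_hard,
)

dataset = Dataset(
    cases=[
        Case(name="easy", inputs=invalid_mermaid_diagram_easy),
        Case(name="medium", inputs=invalid_mermaid_diagram_medium),
        Case(name="hard", inputs=invalid_mermaid_diagram_hard),
    ],
    evaluators=[LLMJudge(rubric="The output must be a syntactically valid mermaid diagram.")],
)
```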

## Output Schema
@@ -164,26 +164,26 @@ Results are exported to CSV files with the following columns:

```bash
# Run evaluation with default model
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
+uv run agents_mcp_usage/evaluations/mermaid_evals/evals_pydantic_mcp.py

# Customize model and judge
AGENT_MODEL="gemini-2.5-pro-preview-06-05" JUDGE_MODEL="gemini-2.0-flash" \
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/evals_pydantic_mcp.py
+uv run agents_mcp_usage/evaluations/mermaid_evals/evals_pydantic_mcp.py
```

### Multi-Model Evaluation

```bash
# Run evaluation across multiple models
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py \
+uv run agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py \
--models "gemini-2.5-pro-preview-06-05,gemini-2.0-flash" \
--runs 5 \
--parallel \
--timeout 600 \
--output-dir ./results

# Sequential execution with custom judge
-uv run agents_mcp_usage/multi_mcp/eval_multi_mcp/run_multi_evals.py \
+uv run agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py \
--models "gemini-2.5-pro-preview-06-05,claude-3-opus" \
--runs 3 \
--sequential \
@@ -247,4 +247,4 @@ The system implements robust error handling:
- **pydantic-evals** - Evaluation framework and metrics
- **logfire** - Logging and monitoring
- **rich** - Console output and progress bars
-- **asyncio** - Asynchronous evaluation execution
+- **asyncio** - Asynchronous evaluation execution
Empty file.
@@ -126,4 +126,4 @@

# The default configuration to use when the dashboard starts.
# You can change this to point to a different configuration.
-DEFAULT_CONFIG = MERBENCH_CONFIG
+DEFAULT_CONFIG = MERBENCH_CONFIG
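Since the dashboard validates its configuration against the `DashboardConfig` schema (see the imports in the `merbench_ui.py` hunks further down), pointing it at a different configuration could look roughly like this. `MY_CONFIG` is hypothetical, and the required fields are defined in `schemas.py`, which is not shown in this PR.

```python
# Hypothetical alternative config, validated the same way merbench_ui.py validates DEFAULT_CONFIG.
from pydantic import ValidationError

from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig

MY_CONFIG: dict = {}  # placeholder; see schemas.py for the real required fields

try:
    config = DashboardConfig.model_validate(MY_CONFIG)
except ValidationError as exc:
    print(f"Invalid dashboard config: {exc}")
```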
@@ -31,7 +31,7 @@
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, LLMJudge
from pydantic_evals.reporting import EvaluationReport

-from agents_mcp_usage.multi_mcp.mermaid_diagrams import (
+from agents_mcp_usage.evaluations.mermaid_evals.mermaid_diagrams import (
invalid_mermaid_diagram_easy,
invalid_mermaid_diagram_medium,
invalid_mermaid_diagram_hard,
@@ -646,7 +646,7 @@ def get_timestamp_prefix() -> str:


def write_mermaid_results_to_csv(
-report: EvaluationReport, model: str, output_dir: str = "./mermaid_results"
+report: EvaluationReport, model: str, output_dir: str = "./mermaid_eval_results"
) -> str:
"""Writes mermaid evaluation results with metrics to a CSV file.

@@ -750,7 +750,7 @@ async def run_evaluations(
model: str = DEFAULT_MODEL,
judge_model: str = DEFAULT_MODEL,
export_csv: bool = True,
-output_dir: str = "./mermaid_results",
+output_dir: str = "./mermaid_eval_results",
) -> EvaluationReport:
"""Runs the evaluations on the mermaid diagram fixing task.

@@ -804,4 +804,4 @@ async def run_all():
model=agent_model, judge_model=judge_model, export_csv=True
)

-asyncio.run(run_all())
+asyncio.run(run_all())
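Beyond the CLI entry point, the evaluation can also be driven programmatically. A minimal sketch, assuming the keyword arguments visible in the hunks above (`model`, `judge_model`, `export_csv`, `output_dir`) are sufficient and that the return value is a `pydantic-evals` `EvaluationReport`:

```python
# Minimal sketch: call run_evaluations() directly instead of via `uv run`.
import asyncio

from agents_mcp_usage.evaluations.mermaid_evals.evals_pydantic_mcp import run_evaluations


async def main() -> None:
    report = await run_evaluations(
        model="gemini-2.5-pro-preview-06-05",  # agent under test
        judge_model="gemini-2.0-flash",        # LLM judge
        export_csv=True,                        # also write a timestamped results CSV
        output_dir="./mermaid_eval_results",    # new default shown in this diff
    )
    report.print()  # assumes pydantic-evals' EvaluationReport.print() table rendering


asyncio.run(main())
```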
@@ -8,10 +8,10 @@
import re
from pydantic import ValidationError

-from agents_mcp_usage.multi_mcp.eval_multi_mcp.dashboard_config import (
+from agents_mcp_usage.evaluations.mermaid_evals.dashboard_config import (
DEFAULT_CONFIG,
)
-from agents_mcp_usage.multi_mcp.eval_multi_mcp.schemas import DashboardConfig
+from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig

# Load and validate the configuration
try:
@@ -841,7 +841,7 @@ def main() -> None:

# Cost configuration in sidebar
st.sidebar.subheader("💰 Cost Configuration")
-cost_file_path = os.path.join(os.path.dirname(__file__), "costs.csv")
+cost_file_path = os.path.join(os.path.dirname(__file__), "costs.json")
model_costs, friendly_names = load_model_costs(cost_file_path)
available_models = sorted(df_initial["Model"].unique())

@@ -1033,4 +1033,4 @@ def main() -> None:


if __name__ == "__main__":
-main()
+main()
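The sidebar cost configuration above now reads `costs.json` instead of `costs.csv`, via `load_model_costs()`, which returns per-model costs plus friendly display names. The file's actual schema is not part of this PR; the snippet below is a purely illustrative reader for such a file, with all field names hypothetical.

```python
# Illustrative only: the field names ("input_cost", "output_cost", "friendly_name")
# are assumptions, not the project's real costs.json schema.
import json


def read_costs_sketch(path: str) -> tuple[dict[str, dict[str, float]], dict[str, str]]:
    """Return (model_costs, friendly_names), mirroring load_model_costs()'s two return values."""
    with open(path) as f:
        data = json.load(f)
    model_costs = {
        model: {"input": entry["input_cost"], "output": entry["output_cost"]}
        for model, entry in data.items()
    }
    friendly_names = {model: entry.get("friendly_name", model) for model, entry in data.items()}
    return model_costs, friendly_names
```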