Commit 5637414

refact: Create top level module
1 parent 01b12b5 commit 5637414

11 files changed, +390 -128 lines changed

agents_mcp_usage/__init__.py

Whitespace-only changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# MCP evaluation examples package

"""
Evaluation modules for MCP (Model Context Protocol) based agents.

This package contains various evaluation modules for different frameworks:
- evals_pydantic_mcp: Evaluations using Pydantic AI
- evals_langchain_mcp: Evaluations using Langchain
- evals_adk_mcp: Evaluations using Google ADK
"""
Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
import os
from typing import Any

import logfire
from dotenv import load_dotenv
from google.adk.agents.llm_agent import LlmAgent
from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset, StdioServerParameters
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.genai import types
from pydantic import BaseModel
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

load_dotenv()

# Configure logging to logfire if LOGFIRE_TOKEN is set in environment
logfire.configure(
    send_to_logfire="if-token-present",
    service_name="evals",
)
logfire.instrument_mcp()

# Set API key for Google AI API from environment variable
os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")

# Grant the MCP server access to only a specific directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
test_dir = os.path.join(parent_dir, "mcp_allowed_dir")

# Model to use for the agent
model_name = "gemini-2.5-pro-preview-03-25"


# Define the input prompt schema
class InputPrompt(BaseModel):
    question: str


# Define the output response schema
class OutputResponse(BaseModel):
    output: str


async def find_target_quote(inputs: InputPrompt) -> OutputResponse:
    """Find information in files using the agent with a fresh MCP server for each evaluation.

    Args:
        inputs: The input prompt containing the question

    Returns:
        An OutputResponse with the agent's answer
    """
    # Create a new server instance for each evaluation
    server_params = StdioServerParameters(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", test_dir],
    )

    tools, exit_stack = await MCPToolset.from_server(connection_params=server_params)

    try:
        # Create the agent
        root_agent = LlmAgent(
            model=model_name,
            name="mcp_adk_assistant",
            tools=tools,
        )

        # Set up session
        session_service = InMemorySessionService()
        session = session_service.create_session(
            app_name="mcp_adk_app",
            user_id="user",
        )

        # Create the runner
        runner = Runner(
            app_name="mcp_adk_app",
            agent=root_agent,
            session_service=session_service,
        )

        # Run the agent with the query
        content = types.Content(role="user", parts=[types.Part(text=inputs.question)])

        events_async = runner.run_async(
            session_id=session.id, user_id=session.user_id, new_message=content
        )

        # Extract all text responses from the agent
        # (default to an empty string so the return below cannot raise NameError
        # if the agent produces no text parts)
        result = ""
        async for event in events_async:
            if hasattr(event, "content") and event.content:
                # Extract text content from the response
                for part in event.content.parts:
                    if hasattr(part, "text") and part.text:
                        # Store only the last text response
                        result = part.text

        return OutputResponse(output=result)
    finally:
        # Ensure MCP server connection is properly closed
        await exit_stack.aclose()


# Define the dataset of cases to evaluate
quote_dataset = Dataset[InputPrompt, OutputResponse, Any](
    cases=[
        Case(
            name="2024_quote",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the line that the famous personality says in Motorway's 2024 advertising sponsorship feature. Give me just the line used."
            ),
            expected_output=OutputResponse(output="Oh yeah he's winning!"),
            metadata={"difficulty": "medium"},
        ),
        Case(
            name="find_reference",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the sub-heading number for 'Cycle lanes and cycle tracks'. Give me just the number."
            ),
            expected_output=OutputResponse(output="140"),
            metadata={"difficulty": "easy"},
        ),
    ],
    evaluators=[
        IsInstance(type_name="OutputResponse"),
        LLMJudge(
            rubric="Output should match expected",
            include_input=True,
            # LLM to use as the judge
            model="gemini-2.5-pro-preview-03-25",
        ),
    ],
)


def main():
    """Main function to run evaluations when module is imported or run directly."""
    # Run evaluations in parallel since each has its own server
    report = quote_dataset.evaluate_sync(
        find_target_quote, name=f"{model_name}-adk-find_target_quote-evals"
    )
    report.print(include_input=False, include_expected_output=True, include_output=True)


if __name__ == "__main__":
    main()
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
import os
from typing import Any

import logfire
from dotenv import load_dotenv
from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_mcp_adapters.tools import load_mcp_tools
from langgraph.prebuilt import create_react_agent
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

load_dotenv()

# Configure logging to logfire if LOGFIRE_TOKEN is set in environment
logfire.configure(
    send_to_logfire="if-token-present",
    service_name="evals",
)
logfire.instrument_mcp()

# Grant the MCP server access to only a specific directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
test_dir = os.path.join(parent_dir, "mcp_allowed_dir")

# Model to use for the agent
model_name = "gemini-2.5-pro-preview-03-25"


# Define the input prompt schema
class InputPrompt(BaseModel):
    question: str


# Define the output response schema
class OutputResponse(BaseModel):
    output: str


async def find_target_quote(inputs: InputPrompt) -> OutputResponse:
    """Find information in files using the agent with a fresh MCP server for each evaluation.

    Args:
        inputs: The input prompt containing the question

    Returns:
        An OutputResponse with the agent's answer
    """
    # Create a new server instance for each evaluation
    server = StdioServerParameters(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", test_dir],
    )

    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro-preview-03-25", google_api_key=os.getenv("GEMINI_API_KEY")
    )
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            # Initialise the connection
            await session.initialize()

            # Get tools
            tools = await load_mcp_tools(session)

            # Create agent
            agent = create_react_agent(model, tools)
            agent_response = await agent.ainvoke({"messages": inputs.question})

            return OutputResponse(output=agent_response["messages"][-1].content)


# Define the dataset of cases to evaluate
quote_dataset = Dataset[InputPrompt, OutputResponse, Any](
    cases=[
        Case(
            name="2024_quote",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the line that the famous personality says in Motorway's 2024 advertising sponsorship feature. Give me just the line used."
            ),
            expected_output=OutputResponse(output="Oh yeah he's winning!"),
            metadata={"difficulty": "medium"},
        ),
        Case(
            name="find_reference",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the sub-heading number for 'Cycle lanes and cycle tracks'. Give me just the number."
            ),
            expected_output=OutputResponse(output="140"),
            metadata={"difficulty": "easy"},
        ),
    ],
    evaluators=[
        IsInstance(type_name="OutputResponse"),
        LLMJudge(
            rubric="Output should match expected",
            include_input=True,
            # LLM to use as the judge
            model="gemini-2.5-pro-preview-03-25",
        ),
    ],
)


def main():
    """Main function to run evaluations when module is imported or run directly."""
    # Run evaluations in parallel since each has its own server
    report = quote_dataset.evaluate_sync(
        find_target_quote, name=f"{model_name}-langchain-find_target_quote-evals"
    )
    report.print(
        include_input=False, include_expected_output=False, include_output=False
    )


if __name__ == "__main__":
    main()
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
import os
from typing import Any

import logfire
from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_ai import Agent
from pydantic_ai.mcp import MCPServerStdio
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

load_dotenv()

# Configure logging to logfire if LOGFIRE_TOKEN is set in environment
logfire.configure(
    send_to_logfire="if-token-present",
    service_name="evals",
)
logfire.instrument_mcp()
logfire.instrument_pydantic_ai()

# Grant the MCP server access to only a specific directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
test_dir = os.path.join(parent_dir, "mcp_allowed_dir")

# Model to use for the agent
model_name = "gemini-2.5-pro-preview-03-25"


# Define the input prompt schema
class InputPrompt(BaseModel):
    question: str


# Define the output response schema
class OutputResponse(BaseModel):
    output: str


async def find_target_quote(inputs: InputPrompt) -> OutputResponse:
    """Find information in files using the agent with a fresh MCP server for each evaluation.

    Args:
        inputs: The input prompt containing the question

    Returns:
        An OutputResponse with the agent's answer
    """
    # Create a new server instance for each evaluation
    server = MCPServerStdio(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", test_dir],
    )

    # Create a new agent with its own MCP server instances
    agent = Agent(model_name, mcp_servers=[server])

    # Use the agent to accomplish the task
    async with agent.run_mcp_servers():
        result = await agent.run(inputs.question)

    return OutputResponse(output=result.output)


# Define the dataset of cases to evaluate
quote_dataset = Dataset[InputPrompt, OutputResponse, Any](
    cases=[
        Case(
            name="2024_quote",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the line that the famous personality says in Motorway's 2024 advertising sponsorship feature. Give me just the line used."
            ),
            expected_output=OutputResponse(output="Oh yeah he's winning!"),
            metadata={"difficulty": "medium"},
        ),
        Case(
            name="find_reference",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the sub-heading number for 'Cycle lanes and cycle tracks'. Give me just the number."
            ),
            expected_output=OutputResponse(output="140"),
            metadata={"difficulty": "easy"},
        ),
    ],
    evaluators=[
        IsInstance(type_name="OutputResponse"),
        LLMJudge(
            rubric="Output should match expected",
            include_input=True,
            # LLM to use as the judge
            model="gemini-2.5-pro-preview-03-25",
        ),
    ],
)


def main():
    """Main function to run evaluations when module is imported or run directly."""
    # Run evaluations in parallel since each has its own server
    report = quote_dataset.evaluate_sync(
        find_target_quote, name=f"{model_name}-pydantic_ai-find_target_quote-evals"
    )
    report.print(
        include_input=False, include_expected_output=False, include_output=False
    )


# This block still allows the script to be run directly
if __name__ == "__main__":
    main()
