Commit 5637414

refact: Create top level module
1 parent 01b12b5 commit 5637414

11 files changed, +390 -128 lines changed

agents_mcp_usage/__init__.py

Whitespace-only changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# MCP evaluation examples package

"""
Evaluation modules for MCP (Model Context Protocol) based agents.

This package contains various evaluation modules for different frameworks:
- evals_pydantic_mcp: Evaluations using Pydantic AI
- evals_langchain_mcp: Evaluations using Langchain
- evals_adk_mcp: Evaluations using Google ADK
"""
Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
import os
from typing import Any

import logfire
from dotenv import load_dotenv
from google.adk.agents.llm_agent import LlmAgent
from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset, StdioServerParameters
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.genai import types
from pydantic import BaseModel
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

load_dotenv()

# Configure logging to logfire if LOGFIRE_TOKEN is set in environment
logfire.configure(
    send_to_logfire="if-token-present",
    service_name="evals",
)
logfire.instrument_mcp()

# Set API key for Google AI API from environment variable
os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")

# Grant the MCP server access to only a specific directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
test_dir = os.path.join(parent_dir, "mcp_allowed_dir")

# Model to use for the agent
model_name = "gemini-2.5-pro-preview-03-25"


# Define the input prompt schema
class InputPrompt(BaseModel):
    question: str


# Define the output response schema
class OutputResponse(BaseModel):
    output: str


async def find_target_quote(inputs: InputPrompt) -> OutputResponse:
    """Find information in files using the agent with a fresh MCP server for each evaluation.

    Args:
        inputs: The input prompt containing the question

    Returns:
        An OutputResponse with the agent's answer
    """
    # Create a new server instance for each evaluation
    server_params = StdioServerParameters(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", test_dir],
    )

    tools, exit_stack = await MCPToolset.from_server(connection_params=server_params)

    try:
        # Create the agent
        root_agent = LlmAgent(
            model=model_name,
            name="mcp_adk_assistant",
            tools=tools,
        )

        # Set up session
        session_service = InMemorySessionService()
        session = session_service.create_session(
            app_name="mcp_adk_app",
            user_id="user",
        )

        # Create the runner
        runner = Runner(
            app_name="mcp_adk_app",
            agent=root_agent,
            session_service=session_service,
        )

        # Run the agent with the query
        content = types.Content(role="user", parts=[types.Part(text=inputs.question)])

        events_async = runner.run_async(
            session_id=session.id, user_id=session.user_id, new_message=content
        )

        # Extract all text responses from the agent
        # (default to an empty string so the return below cannot raise NameError
        # if the agent produces no text parts)
        result = ""
        async for event in events_async:
            if hasattr(event, "content") and event.content:
                # Extract text content from the response
                for part in event.content.parts:
                    if hasattr(part, "text") and part.text:
                        # Store only the last text response
                        result = part.text

        return OutputResponse(output=result)
    finally:
        # Ensure MCP server connection is properly closed
        await exit_stack.aclose()


# Define the dataset of cases to evaluate
quote_dataset = Dataset[InputPrompt, OutputResponse, Any](
    cases=[
        Case(
            name="2024_quote",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the line that the famous personality says in Motorway's 2024 advertising sponsorship feature. Give me just the line used."
            ),
            expected_output=OutputResponse(output="Oh yeah he's winning!"),
            metadata={"difficulty": "medium"},
        ),
        Case(
            name="find_reference",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the sub-heading number for 'Cycle lanes and cycle tracks'. Give me just the number."
            ),
            expected_output=OutputResponse(output="140"),
            metadata={"difficulty": "easy"},
        ),
    ],
    evaluators=[
        IsInstance(type_name="OutputResponse"),
        LLMJudge(
            rubric="Output should match expected",
            include_input=True,
            # LLM to use as the judge
            model="gemini-2.5-pro-preview-03-25",
        ),
    ],
)


def main():
    """Main function to run evaluations when module is imported or run directly."""
    # Run evaluations in parallel since each has its own server
    report = quote_dataset.evaluate_sync(
        find_target_quote, name=f"{model_name}-adk-find_target_quote-evals"
    )
    report.print(include_input=False, include_expected_output=True, include_output=True)


if __name__ == "__main__":
    main()
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
import os
from typing import Any

import logfire
from dotenv import load_dotenv
from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_mcp_adapters.tools import load_mcp_tools
from langgraph.prebuilt import create_react_agent
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

load_dotenv()

# Configure logging to logfire if LOGFIRE_TOKEN is set in environment
logfire.configure(
    send_to_logfire="if-token-present",
    service_name="evals",
)
logfire.instrument_mcp()

# Grant the MCP server access to only a specific directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
test_dir = os.path.join(parent_dir, "mcp_allowed_dir")

# Model to use for the agent
model_name = "gemini-2.5-pro-preview-03-25"


# Define the input prompt schema
class InputPrompt(BaseModel):
    question: str


# Define the output response schema
class OutputResponse(BaseModel):
    output: str


async def find_target_quote(inputs: InputPrompt) -> OutputResponse:
    """Find information in files using the agent with a fresh MCP server for each evaluation.

    Args:
        inputs: The input prompt containing the question

    Returns:
        An OutputResponse with the agent's answer
    """
    # Create a new server instance for each evaluation
    server = StdioServerParameters(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", test_dir],
    )

    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro-preview-03-25", google_api_key=os.getenv("GEMINI_API_KEY")
    )
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            # Initialise the connection
            await session.initialize()

            # Get tools
            tools = await load_mcp_tools(session)

            # Create agent
            agent = create_react_agent(model, tools)
            agent_response = await agent.ainvoke({"messages": inputs.question})

            return OutputResponse(output=agent_response["messages"][-1].content)


# Define the dataset of cases to evaluate
quote_dataset = Dataset[InputPrompt, OutputResponse, Any](
    cases=[
        Case(
            name="2024_quote",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the line that the famous personality says in Motorway's 2024 advertising sponsorship feature. Give me just the line used."
            ),
            expected_output=OutputResponse(output="Oh yeah he's winning!"),
            metadata={"difficulty": "medium"},
        ),
        Case(
            name="find_reference",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the sub-heading number for 'Cycle lanes and cycle tracks'. Give me just the number."
            ),
            expected_output=OutputResponse(output="140"),
            metadata={"difficulty": "easy"},
        ),
    ],
    evaluators=[
        IsInstance(type_name="OutputResponse"),
        LLMJudge(
            rubric="Output should match expected",
            include_input=True,
            # LLM to use as the judge
            model="gemini-2.5-pro-preview-03-25",
        ),
    ],
)


def main():
    """Main function to run evaluations when module is imported or run directly."""
    # Run evaluations in parallel since each has its own server
    report = quote_dataset.evaluate_sync(
        find_target_quote, name=f"{model_name}-langchain-find_target_quote-evals"
    )
    report.print(
        include_input=False, include_expected_output=False, include_output=False
    )


if __name__ == "__main__":
    main()
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
import os
from typing import Any

import logfire
from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_ai import Agent
from pydantic_ai.mcp import MCPServerStdio
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

load_dotenv()

# Configure logging to logfire if LOGFIRE_TOKEN is set in environment
logfire.configure(
    send_to_logfire="if-token-present",
    service_name="evals",
)
logfire.instrument_mcp()
logfire.instrument_pydantic_ai()

# Grant the MCP server access to only a specific directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
test_dir = os.path.join(parent_dir, "mcp_allowed_dir")

# Model to use for the agent
model_name = "gemini-2.5-pro-preview-03-25"


# Define the input prompt schema
class InputPrompt(BaseModel):
    question: str


# Define the output response schema
class OutputResponse(BaseModel):
    output: str


async def find_target_quote(inputs: InputPrompt) -> OutputResponse:
    """Find information in files using the agent with a fresh MCP server for each evaluation.

    Args:
        inputs: The input prompt containing the question

    Returns:
        An OutputResponse with the agent's answer
    """
    # Create a new server instance for each evaluation
    server = MCPServerStdio(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", test_dir],
    )

    # Create a new agent with its own MCP server instances
    agent = Agent(model_name, mcp_servers=[server])

    # Use the agent to accomplish the task
    async with agent.run_mcp_servers():
        result = await agent.run(inputs.question)

    return OutputResponse(output=result.output)


# Define the dataset of cases to evaluate
quote_dataset = Dataset[InputPrompt, OutputResponse, Any](
    cases=[
        Case(
            name="2024_quote",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the line that the famous personality says in Motorway's 2024 advertising sponsorship feature. Give me just the line used."
            ),
            expected_output=OutputResponse(output="Oh yeah he's winning!"),
            metadata={"difficulty": "medium"},
        ),
        Case(
            name="find_reference",
            inputs=InputPrompt(
                question=f"Read the contents of files in {test_dir} to find the sub-heading number for 'Cycle lanes and cycle tracks'. Give me just the number."
            ),
            expected_output=OutputResponse(output="140"),
            metadata={"difficulty": "easy"},
        ),
    ],
    evaluators=[
        IsInstance(type_name="OutputResponse"),
        LLMJudge(
            rubric="Output should match expected",
            include_input=True,
            # LLM to use as the judge
            model="gemini-2.5-pro-preview-03-25",
        ),
    ],
)


def main():
    """Main function to run evaluations when module is imported or run directly."""
    # Run evaluations in parallel since each has its own server
    report = quote_dataset.evaluate_sync(
        find_target_quote, name=f"{model_name}-pydantic_ai-find_target_quote-evals"
    )
    report.print(
        include_input=False, include_expected_output=False, include_output=False
    )


# This block still allows the script to be run directly
if __name__ == "__main__":
    main()
