Merged
922 changes: 922 additions & 0 deletions packages/sample-app/sample_app/agents/travel_agent_example.py

Large diffs are not rendered by default.

@@ -0,0 +1,179 @@
"""
Agent Evaluators Experiment

This example demonstrates Traceloop's agent evaluators:
- Agent Goal Accuracy: Validates agent goal achievement
- Agent Tool Error Detector: Detects errors or failures during tool execution
- Agent Flow Quality: Validates agent trajectories against user-defined natural language tests
- Agent Efficiency: Evaluates agent efficiency by checking for redundant calls and optimal paths
- Agent Goal Completeness: Measures whether the agent successfully accomplished all user goals

These evaluators help ensure your AI agents perform optimally and achieve their objectives.
"""

import asyncio
import os
from openai import AsyncOpenAI
from traceloop.sdk import Traceloop
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop

# Initialize Traceloop
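# Traceloop.init() is assumed to read its API key from the environment (typically TRACELOOP_API_KEY).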
client = Traceloop.init()


async def generate_agent_trace(task_description: str) -> dict:
"""
Simulate an agent execution and generate trace data.
In a real scenario, this would come from your actual agent framework.
"""
openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Simulate agent executing a task
response = await openai_client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful agent that completes tasks step by step."},
{"role": "user", "content": task_description}
],
temperature=0.7,
max_tokens=300,
)

completion = response.choices[0].message.content

# Return trace data (simplified for demo)
# In production, this would be actual trace/span data from your agent
return {
"task": task_description,
"completion": completion,
}


async def agent_evaluators_task(row):
"""
Unified task function for all 5 agent evaluators.

IMPORTANT: Thanks to field synonym mapping, you can use flexible field names!
For example:
- "answer", "response", "text" → all map to "completion"
- "prompt", "instructions" → map to "question"
- "context", "ground_truth" → map to "reference"
- "prompts" → maps to "trajectory_prompts"
- "completions" → maps to "trajectory_completions"

This makes it easier to write tasks without worrying about exact field names.

Required fields for the 5 agent evaluators:
- question (or prompt): The input question or goal (for agent_goal_accuracy)
- completion (or answer, response, text): The agent's completion (for agent_goal_accuracy)
- reference (or ground_truth, context): The reference answer (for agent_goal_accuracy)
- tool_input: The input to tools (for agent_tool_error_detector)
- tool_output: The output from tools (for agent_tool_error_detector)
- trajectory_prompts (or prompts): The agent's prompt trajectory
(for agent_flow_quality, agent_efficiency, agent_goal_completeness)
- trajectory_completions (or completions): The agent's completion trajectory
(for agent_flow_quality, agent_efficiency, agent_goal_completeness)
"""
# Get data from row or use defaults
question = row.get("question", "Book a flight from New York to Paris")
reference = row.get(
"reference",
"Successfully booked flight NYC to Paris, departure 2024-12-15, return 2024-12-22"
)
tool_input = row.get("tool_input", "New York to Paris")
tool_output = row.get(
"tool_output",
"Successfully booked flight NYC to Paris, departure 2024-12-15, return 2024-12-22"
)
trajectory_prompts = row.get("trajectory_prompts", "New York to Paris")
trajectory_completions = row.get(
"trajectory_completions",
"Successfully booked flight NYC to Paris, departure 2024-12-15, return 2024-12-22"
)

# Generate agent trace
trace_data = await generate_agent_trace(question)

# You can use synonyms! These will automatically map to the required fields:
# - Using "answer" instead of "completion" ✓
# - Using "prompt" instead of "question" ✓
# - Using "context" instead of "reference" ✓
return {
"prompt": question, # Maps to "question"
"answer": trace_data["completion"], # Maps to "completion"
"context": reference, # Maps to "reference"
"tool_input": tool_input,
"tool_output": tool_output,
"prompts": trajectory_prompts, # Maps to "trajectory_prompts"
"completions": trajectory_completions, # Maps to "trajectory_completions"
}


async def run_agents_experiment():
"""
Run experiment with all 5 agent evaluators.

This experiment will evaluate agent performance across:
1. Agent Goal Accuracy - Did the agent achieve the stated goal?
2. Agent Tool Error Detector - Were there any tool execution errors?
3. Agent Flow Quality - Did the agent follow the expected trajectory?
4. Agent Efficiency - Was the agent efficient (no redundant calls)?
5. Agent Goal Completeness - Did the agent fully accomplish all goals?
"""

print("\n" + "="*80)
print("AGENT EVALUATORS EXPERIMENT")
print("="*80 + "\n")

print("This experiment will test five agent-specific evaluators:\n")
print("1. Agent Goal Accuracy - Validates goal achievement")
print("2. Agent Tool Error Detector - Detects tool execution errors")
print("3. Agent Flow Quality - Validates expected trajectories")
print("4. Agent Efficiency - Checks for optimal execution paths")
print("5. Agent Goal Completeness - Measures full goal accomplishment")
print("\n" + "-"*80 + "\n")

# Configure agent evaluators
evaluators = [
EvaluatorMadeByTraceloop.agent_goal_accuracy(),
EvaluatorMadeByTraceloop.agent_tool_error_detector(),
EvaluatorMadeByTraceloop.agent_flow_quality(),
EvaluatorMadeByTraceloop.agent_efficiency(),
EvaluatorMadeByTraceloop.agent_goal_completeness(),
]

print("Running experiment with evaluators:")
for evaluator in evaluators:
print(f" - {evaluator.slug}")

print("\n" + "-"*80 + "\n")

# Run the experiment
# Note: You'll need to create a dataset with appropriate test cases for agents
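    # For illustration, a row in that dataset might carry fields like the ones
    # below (hypothetical values; synonyms such as "prompt" or "answer" also work):
    #   {
    #       "question": "Book a flight from New York to Paris",
    #       "reference": "Successfully booked flight NYC to Paris",
    #       "tool_input": "New York to Paris",
    #       "tool_output": "Successfully booked flight NYC to Paris",
    #       "trajectory_prompts": "New York to Paris",
    #       "trajectory_completions": "Successfully booked flight NYC to Paris",
    #   }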
results, errors = await client.experiment.run(
dataset_slug="agents", # Set a dataset slug that exists in the traceloop platform
dataset_version="v1",
task=agent_evaluators_task,
evaluators=evaluators,
experiment_slug="agents-evaluators-exp",
stop_on_error=False,
wait_for_results=True,
)

print("\n" + "="*80)
print("Agent evaluators experiment completed!")
print("="*80 + "\n")

print("Results summary:")
print(f" - Total rows processed: {len(results) if results else 0}")
print(f" - Errors encountered: {len(errors) if errors else 0}")

if errors:
print("\nErrors:")
for error in errors:
print(f" - {error}")

if __name__ == "__main__":
print("\nAgent Evaluators Experiment\n")

asyncio.run(run_agents_experiment())
37 changes: 20 additions & 17 deletions packages/traceloop-sdk/tests/evaluator/test_evaluator.py
@@ -1,5 +1,5 @@
import pytest
from traceloop.sdk.evaluator.evaluator import validate_task_output
from traceloop.sdk.evaluator.evaluator import validate_and_normalize_task_output
from traceloop.sdk.evaluator.config import EvaluatorDetails


@@ -12,7 +12,7 @@ def test_validate_task_output_with_no_evaluators(self):
evaluators = []

# Should not raise any exception
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

def test_validate_task_output_with_evaluators_no_required_fields(self):
"""Test that validation passes when evaluators have no required fields"""
@@ -23,7 +23,7 @@ def test_validate_task_output_with_evaluators_no_required_fields(self):
]

# Should not raise any exception
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

def test_validate_task_output_with_valid_output(self):
"""Test that validation passes when all required fields are present"""
@@ -37,7 +37,7 @@ def test_validate_task_output_with_valid_output(self):
]

# Should not raise any exception
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

def test_validate_task_output_missing_single_field(self):
"""Test that validation fails when a single required field is missing"""
@@ -47,11 +47,12 @@ def test_validate_task_output_missing_single_field(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
assert "Task output missing required fields for evaluators:" in error_message
assert "pii-detector requires: ['text']" in error_message
assert "pii-detector requires:" in error_message
assert "'text'" in error_message
assert "Task output contains: ['prompt']" in error_message
assert (
"Hint: Update your task function to return a dictionary "
@@ -69,7 +70,7 @@ def test_validate_task_output_missing_multiple_fields_single_evaluator(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
assert "relevance-checker requires:" in error_message
@@ -90,7 +91,7 @@ def test_validate_task_output_missing_fields_multiple_evaluators(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
assert "pii-detector requires:" in error_message
@@ -112,7 +113,7 @@ def test_validate_task_output_partial_match(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
# Should only mention the failing evaluator
@@ -127,7 +128,7 @@ def test_validate_task_output_empty_task_output(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
assert "Task output contains: []" in error_message
@@ -146,7 +147,7 @@ def test_validate_task_output_with_extra_fields(self):
]

# Should not raise any exception - extra fields are allowed
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

def test_validate_task_output_case_sensitive_field_names(self):
"""Test that field name matching is case-sensitive"""
@@ -156,10 +157,11 @@ def test_validate_task_output_case_sensitive_field_names(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
assert "pii-detector requires: ['text']" in error_message
assert "pii-detector requires:" in error_message
assert "'text'" in error_message
assert "Task output contains: ['Text']" in error_message

def test_validate_task_output_with_evaluator_config(self):
@@ -175,7 +177,7 @@ def test_validate_task_output_with_evaluator_config(self):
]

# Should not raise any exception - config shouldn't affect validation
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

def test_validate_task_output_mixed_evaluators(self):
"""Test validation with a mix of evaluators with and without required fields"""
@@ -191,11 +193,12 @@ def test_validate_task_output_mixed_evaluators(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
# Should only mention failing evaluator
assert "relevance-checker requires: ['prompt']" in error_message
assert "relevance-checker requires:" in error_message
assert "'prompt'" in error_message
assert "evaluator-no-requirements" not in error_message
assert "pii-detector" not in error_message or "pii-detector requires:" not in error_message

@@ -212,7 +215,7 @@ def test_validate_task_output_duplicate_required_fields(self):
]

with pytest.raises(ValueError) as exc_info:
validate_task_output(task_output, evaluators)
validate_and_normalize_task_output(task_output, evaluators)

error_message = str(exc_info.value)
assert "pii-detector requires:" in error_message
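The tests above now target validate_and_normalize_task_output, which, judging by the rename and the sample app's docstring, validates required fields and normalizes the documented field synonyms (for example "answer" maps to "completion"). Below is a rough, illustrative sketch of what synonym-aware validation can look like; the SYNONYMS table, the attribute names slug and required_fields, and the error wording are assumptions for illustration, not the SDK's actual implementation.

# Conceptual sketch only; not the SDK implementation.
SYNONYMS = {
    "completion": ["answer", "response", "text"],
    "question": ["prompt", "instructions"],
    "reference": ["context", "ground_truth"],
    "trajectory_prompts": ["prompts"],
    "trajectory_completions": ["completions"],
}


def validate_and_normalize(task_output: dict, evaluators: list) -> dict:
    """Validate required fields, filling missing ones from documented synonyms."""
    normalized = dict(task_output)
    problems = []
    for evaluator in evaluators:
        required = getattr(evaluator, "required_fields", None) or []
        missing = []
        for field in required:
            if field in normalized:
                continue
            # Fill a missing canonical field from the first matching synonym, if any.
            synonym = next((s for s in SYNONYMS.get(field, []) if s in normalized), None)
            if synonym is not None:
                normalized[field] = normalized[synonym]
            else:
                missing.append(field)
        if missing:
            problems.append(f"{evaluator.slug} requires: {missing}")
    if problems:
        raise ValueError(
            "Task output missing required fields for evaluators: "
            + "; ".join(problems)
            + f". Task output contains: {sorted(task_output)}"
        )
    return normalized

In this sketch a canonical field is only filled in when it is missing, so explicitly provided fields are never overwritten and key matching stays case-sensitive, in line with the case-sensitivity test above.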