Skip to content

Commit 8e52c97

Browse files
authored
fix(evals): evals API supports input + config, generate mbt functions (#3534)
1 parent a528699 commit 8e52c97

File tree

10 files changed

+1262
-225
lines changed

10 files changed

+1262
-225
lines changed
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
Agent Tool Trajectory Experiment
3+
4+
This example demonstrates Traceloop's agent tool trajectory evaluator:
5+
- Agent Tool Trajectory: Validates the agent tool trajectory
6+
7+
This evaluator helps ensure your AI agents perform optimally and follow the expected tool trajectory.
8+
"""
9+
10+
import asyncio
11+
from traceloop.sdk import Traceloop
12+
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop
13+
14+
# Initialize Traceloop
15+
client = Traceloop.init()  # module-level client; the experiment below is launched via client.experiment.run
16+
17+
18+
def agent_evaluators_task(row):
    """Map one dataset row to the inputs of the agent tool trajectory evaluator.

    Args:
        row: Mapping for a single dataset row. Its ``"actual"`` entry is read
            as the executed tool calls and its optional ``"expected"`` entry
            as the reference trajectory.

    Returns:
        A dict with two keys: ``"executed_tool_calls"`` (``""`` when the row
        has no ``"actual"`` entry) and ``"expected_tool_calls"`` (a built-in
        sample trajectory when the row has no ``"expected"`` entry).
    """
    # Sample trajectory used only when the row does not carry its own.
    default_expected = (
        "[{'name': 'search', 'input': {'query': 'weather'}}, "
        "{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, "
        "{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]"
    )

    return {
        "executed_tool_calls": row.get("actual", ""),
        "expected_tool_calls": row.get("expected", default_expected),
    }
31+
32+
33+
async def run_agent_tool_trajectory_experiment():
    """Run the agent tool trajectory experiment and print a summary.

    Builds a single agent-tool-trajectory evaluator, runs it through
    ``client.experiment.run`` against the ``agent-tool-trajectory`` dataset
    with ``agent_evaluators_task`` as the task, then prints how many results
    and errors came back, followed by each error.
    """
    banner = "=" * 80
    rule = "-" * 80

    print("\n" + banner)
    print("AGENT TOOL TRAJECTORY EXPERIMENT")
    print(banner + "\n")
    print("This experiment will test the agent tool trajectory with the agent tool trajectory evaluator:\n")
    print("1. Agent Tool Trajectory - Validates the agent tool trajectory")
    print("\n" + rule + "\n")

    # Configure agent evaluators
    evaluators = [
        EvaluatorMadeByTraceloop.agent_tool_trajectory(
            input_params_sensitive=True,
            mismatch_sensitive=False,
            order_sensitive=False,
            threshold=0.7,
        ),
    ]

    print("Running experiment with evaluators:")
    for evaluator in evaluators:
        print(f" - {evaluator.slug}")

    print("\n" + rule + "\n")

    # Run the experiment
    # Note: You'll need to create a dataset with appropriate test cases for agents
    results, errors = await client.experiment.run(
        dataset_slug="agent-tool-trajectory",  # Set a dataset slug that exists in the traceloop platform
        dataset_version="v1",
        task=agent_evaluators_task,
        evaluators=evaluators,
        experiment_slug="agent-tool-trajectory-exp",
        stop_on_error=False,
        wait_for_results=True,
    )

    print("\n" + banner)
    print("Agent tool trajectory experiment completed!")
    print(banner + "\n")

    print("Results summary:")
    # Either value may be falsy (e.g. empty), so guard before len().
    print(f" - Total rows processed: {len(results) if results else 0}")
    print(f" - Errors encountered: {len(errors) if errors else 0}")

    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")
82+
# Script entry point: print a heading, then drive the async experiment to completion.
if __name__ == "__main__":
    print("\nAgent Tool Trajectory Experiment\n")

    asyncio.run(run_agent_tool_trajectory_experiment())
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
from .evaluator import Evaluator
22
from .config import EvaluatorDetails
3-
from .evaluators_made_by_traceloop import EvaluatorMadeByTraceloop, create_evaluator
3+
from ..generated.evaluators.definitions import EvaluatorMadeByTraceloop
44

55
__all__ = [
66
"Evaluator",
77
"EvaluatorDetails",
88
"EvaluatorMadeByTraceloop",
9-
"create_evaluator",
109
]

packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ def _validate_evaluator_input(slug: str, input: Dict[str, str]) -> None:
2828
request_model = get_request_model(slug)
2929
if request_model:
3030
try:
31-
request_model(**input)
31+
# Request models expect data nested under 'input' field
32+
request_model(input=input)
3233
except ValidationError as e:
3334
raise ValueError(f"Invalid input for '{slug}': {e}") from e
3435

packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py

Lines changed: 0 additions & 120 deletions
This file was deleted.

0 commit comments

Comments
 (0)