"""
Agent Tool Trajectory Experiment

This example demonstrates Traceloop's agent tool trajectory evaluator:
- Agent Tool Trajectory: compares the tool calls an agent actually executed
  against the expected tool-call sequence

This evaluator helps verify that your AI agents follow the expected tool trajectory.
"""
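
# Note: running this example assumes the Traceloop SDK is installed
# (pip install traceloop-sdk) and that an API key is configured in the
# environment (typically TRACELOOP_API_KEY) so Traceloop.init() can
# connect to the Traceloop platform.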

import asyncio

from traceloop.sdk import Traceloop
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop

# Initialize Traceloop; the returned client exposes the experiment API
client = Traceloop.init()


def agent_evaluators_task(row):
    # Map a dataset row to the inputs the evaluator expects: the tool calls
    # the agent actually executed ("actual") and the expected trajectory
    # ("expected"), falling back to a sample trajectory when absent.
    executed_tool_calls = row.get("actual", "")
    default_expected = (
        "[{'name': 'search', 'input': {'query': 'weather'}}, "
        "{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, "
        "{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]"
    )
    expected_tool_calls = row.get("expected", default_expected)

    return {
        "executed_tool_calls": executed_tool_calls,
        "expected_tool_calls": expected_tool_calls,
    }
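
# For illustration, a dataset row consumed by the task above might look like
# this (a hypothetical shape, assuming "actual" and "expected" columns in
# the dataset):
#
#   {
#       "actual": "[{'name': 'search', 'input': {'query': 'weather'}}]",
#       "expected": "[{'name': 'search', 'input': {'query': 'weather'}}]",
#   }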


async def run_agent_tool_trajectory_experiment():
    print("\n" + "="*80)
    print("AGENT TOOL TRAJECTORY EXPERIMENT")
    print("="*80 + "\n")
    print("This experiment evaluates an agent's tool calls with one evaluator:\n")
    print("1. Agent Tool Trajectory - checks executed tool calls against the expected trajectory")
    print("\n" + "-"*80 + "\n")

    # Configure agent evaluators
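    # A rough reading of the settings below (assumptions, not verified API
    # docs; consult the Traceloop documentation for exact semantics):
    #   input_params_sensitive=True  -> compare tool-call input parameters,
    #                                   not just tool names
    #   mismatch_sensitive=False     -> mismatches lower the score rather
    #                                   than failing the row outright
    #   order_sensitive=False        -> tool-call order is not enforced
    #   threshold=0.7                -> scores below 0.7 count as failures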
    evaluators = [
        EvaluatorMadeByTraceloop.agent_tool_trajectory(
            input_params_sensitive=True,
            mismatch_sensitive=False,
            order_sensitive=False,
            threshold=0.7,
        ),
    ]

    print("Running experiment with evaluators:")
    for evaluator in evaluators:
        print(f"  - {evaluator.slug}")

    print("\n" + "-"*80 + "\n")

    # Run the experiment
    # Note: you'll need to create a dataset with appropriate test cases for agents
    results, errors = await client.experiment.run(
        dataset_slug="agent-tool-trajectory",  # use a dataset slug that exists in the Traceloop platform
        dataset_version="v1",
        task=agent_evaluators_task,
        evaluators=evaluators,
        experiment_slug="agent-tool-trajectory-exp",
        stop_on_error=False,
        wait_for_results=True,
    )
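
    # Assumed semantics of the run flags above (my reading; check the SDK docs):
    #   stop_on_error=False   -> keep processing remaining rows if one fails
    #   wait_for_results=True -> block until evaluator scores are available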

    print("\n" + "="*80)
    print("Agent tool trajectory experiment completed!")
    print("="*80 + "\n")

    print("Results summary:")
    print(f"  - Total rows processed: {len(results) if results else 0}")
    print(f"  - Errors encountered: {len(errors) if errors else 0}")

    if errors:
        print("\nErrors:")
        for error in errors:
            print(f"  - {error}")


if __name__ == "__main__":
    print("\nAgent Tool Trajectory Experiment\n")

    asyncio.run(run_agent_tool_trajectory_experiment())