"""
Demonstration of per-step queries in chained evaluation.
This shows how initial_query can be used for steps after the first one.
"""

from toolvaluator import ChainedEvaluator, get_tool_schemas_sync
from toolvaluator.test_server import mcp


def test_step_queries():
    """Test that per-step queries are properly sent to the model."""
    tool_schemas = get_tool_schemas_sync(mcp)

    # Create a chain with specific queries for each step
    chain = ChainedEvaluator(
        tool_schemas=tool_schemas,
        mcp_server=mcp,
        model_name="openai/gpt-4o-mini",
        api_key="test-key",
        mocks={
            "get_location": "San Francisco",
            "get_weather": "Sunny, 72F",
        },
        verbose=True,
    )
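    # Note (assumed behavior): the values in `mocks` stand in for real tool
    # output, so get_location and get_weather are not executed live; the
    # evaluator only checks which tool the model calls and with what arguments.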

    # Step 1: Get location
    chain.add_step(
        initial_query="What's my location?",
        expected_tool="get_location",
        expected_arguments={},
    )

    # Step 2: Get weather, with a query specific to this step.
    # This query is now included along with the context from step 1.
    chain.add_step(
        initial_query="Now get the weather forecast for the next 5 days",
        expected_tool="get_weather",
        expected_arguments={"location": None},  # Wildcard: any location works
    )

    print("=" * 70)
    print("Testing per-step queries in chained evaluation")
    print("=" * 70)
    print()
    print("Expected behavior:")
    print("- Step 1: Uses 'What's my location?' as the query")
    print("- Step 2: Should receive:")
    print("  1. Previous context (location result)")
    print("  2. The step-specific query: 'Now get the weather forecast...'")
    print()
    print("This ensures the model gets both context AND step-specific instructions.")
    print()

    # Run the evaluation (verbose=True prints the queries sent to the model)
    result = chain.evaluate()
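    # Optional follow-up check, a minimal sketch assuming the evaluator
    # normalizes the overall score so that a fully correct chain scores 1.0:
    # assert result["score"] == 1.0, "both steps should match their expected tools"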

    print()
    print("=" * 70)
    print("Result:")
    print(f"Overall score: {result['score']:.2f}")
    print("=" * 70)


if __name__ == "__main__":
    test_step_queries()