Commit 6945ce6

Fix issue in chained steps after step 1
Previously, step 1 was the only step that could take an initial_query. Subsequent steps need to be able to pass one as well, so it can be sent along with the previous context and give the model a hint about what to do next, such as calling another tool.
1 parent 27e261b commit 6945ce6
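
In short, steps after the first can now pass their own initial_query. A minimal sketch, drawn from the test added in this commit (the argument names match the add_step signature shown in the diffs below):

# Step 2 can now carry its own query; it is appended after the previous context
chain.add_step(
    initial_query="Now get the weather forecast for the next 5 days",
    expected_tool="get_weather",
    expected_arguments={"location": None},  # wildcard: any location is accepted
)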

File tree

3 files changed: +89 -6 lines changed


src/toolvaluator/builders.py

Lines changed: 4 additions & 1 deletion

@@ -232,7 +232,10 @@ def add_step(
         Args:
             expected_tool: Name of tool that should be called in this step
             expected_arguments: Expected arguments (None for wildcards)
-            initial_query: For first step only - the user's initial query
+            initial_query: Optional query for this step. Required for first step.
+                For subsequent steps, if provided, will be used as the
+                query along with previous context. If not provided for
+                subsequent steps, uses generic continuation message.
             mock_result: Optional mock result to use instead of executing tool
             system_prompt: Optional system prompt for this step (overrides eval-level)

src/toolvaluator/chained.py

Lines changed: 19 additions & 5 deletions

@@ -136,7 +136,10 @@ def add_step(
         Args:
             expected_tool: Name of tool that should be called in this step
             expected_arguments: Expected arguments (None for wildcards)
-            initial_query: For first step only - the user's initial query
+            initial_query: Optional query for this step. Required for first step.
+                For subsequent steps, if provided, will be used as the
+                query along with previous context. If not provided for
+                subsequent steps, uses generic continuation message.
             mock_result: Optional mock result to use instead of executing tool
                 Overrides any global mock for this specific step

@@ -347,16 +350,27 @@ def evaluate(self) -> dict[str, Any]:

             # Build query for next step using conversation history
             if i + 1 < len(self.steps):
+                next_step = self.steps[i + 1]
                 context_str = "\n".join(
                     [
                         f"Called {item['tool']}({item.get('args', '')}) → {item['result']}"
                         for item in conversation_context
                     ]
                 )
-                current_query = (
-                    f"Previous context:\n{context_str}\n\n"
-                    f"Continue the task to accomplish the original goal."
-                )
+
+                # Check if next step has its own initial_query
+                if next_step.get("initial_query"):
+                    # Use the step's query with context
+                    current_query = (
+                        f"Previous context:\n{context_str}\n\n"
+                        f"{next_step['initial_query']}"
+                    )
+                else:
+                    # Use generic continuation message
+                    current_query = (
+                        f"Previous context:\n{context_str}\n\n"
+                        f"Continue the task to accomplish the original goal."
+                    )

         # Calculate overall score
         tool_scores = [1.0 if r["tool_correct"] else 0.0 for r in step_results]
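
To make the prompt construction concrete, here is a standalone sketch that mirrors the f-strings in the diff above; the conversation_context contents are assumed from the mocks used in the test below:

# Assumed context from a mocked get_location call in step 1
conversation_context = [{"tool": "get_location", "args": {}, "result": "San Francisco"}]
context_str = "\n".join(
    f"Called {item['tool']}({item.get('args', '')}) → {item['result']}"
    for item in conversation_context
)
step_query = "Now get the weather forecast for the next 5 days"
current_query = f"Previous context:\n{context_str}\n\n{step_query}"

# current_query now reads:
# Previous context:
# Called get_location({}) → San Francisco
#
# Now get the weather forecast for the next 5 days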

test_step_query.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@

"""
Demonstration of per-step queries in chained evaluation.
This shows how initial_query can be used for steps after the first one.
"""

from toolvaluator import ChainedEvaluator, get_tool_schemas_sync
from toolvaluator.test_server import mcp


def test_step_queries():
    """Test that per-step queries are properly sent to the model."""
    tool_schemas = get_tool_schemas_sync(mcp)

    # Create a chain with specific queries for each step
    chain = ChainedEvaluator(
        tool_schemas=tool_schemas,
        mcp_server=mcp,
        model_name="openai/gpt-4o-mini",
        api_key="test-key",
        mocks={
            "get_location": "San Francisco",
            "get_weather": "Sunny, 72F",
        },
        verbose=True,
    )

    # Step 1: Get location
    chain.add_step(
        initial_query="What's my location?",
        expected_tool="get_location",
        expected_arguments={},
    )

    # Step 2: Get weather with a SPECIFIC query for this step
    # This query will now be included along with the context
    chain.add_step(
        initial_query="Now get the weather forecast for the next 5 days",
        expected_tool="get_weather",
        expected_arguments={"location": None},  # Wildcard - any location works
    )

    print("=" * 70)
    print("Testing per-step queries in chained evaluation")
    print("=" * 70)
    print()
    print("Expected behavior:")
    print("- Step 1: Uses 'What's my location?' as query")
    print("- Step 2: Should receive:")
    print("  1. Previous context (location result)")
    print("  2. The step-specific query: 'Now get the weather forecast...'")
    print()
    print("This ensures the model gets both context AND step-specific instructions.")
    print()

    # Run evaluation (this will print queries sent to model due to verbose=True)
    result = chain.evaluate()

    print()
    print("=" * 70)
    print("Result:")
    print(f"Overall score: {result['score']:.2f}")
    print("=" * 70)


if __name__ == "__main__":
    test_step_queries()
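
Because the file has a __main__ guard, it can also be run directly as a script (assuming the toolvaluator package and its bundled test_server are importable); with verbose=True, the queries sent to the model are printed, so the combined context-plus-step query for step 2 can be inspected.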
