Commit 6945ce6

Fix issue in chained steps after step 1
Previously, step 1 was the only step that could take an initial_query. Subsequent steps need to be able to pass one as well, so it can be sent along with the previous context and give the model a hint about what to do next, such as calling another tool.
1 parent 27e261b commit 6945ce6
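
In short, steps after the first can now pass their own initial_query. A minimal sketch, drawn from the test added in this commit (the argument names match the add_step signature shown in the diffs below):

# Step 2 can now carry its own query; it is appended after the previous context
chain.add_step(
    initial_query="Now get the weather forecast for the next 5 days",
    expected_tool="get_weather",
    expected_arguments={"location": None},  # wildcard: any location is accepted
)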

File tree

3 files changed: +89 -6 lines changed


src/toolvaluator/builders.py

Lines changed: 4 additions & 1 deletion

@@ -232,7 +232,10 @@ def add_step(
         Args:
             expected_tool: Name of tool that should be called in this step
             expected_arguments: Expected arguments (None for wildcards)
-            initial_query: For first step only - the user's initial query
+            initial_query: Optional query for this step. Required for first step.
+                For subsequent steps, if provided, will be used as the
+                query along with previous context. If not provided for
+                subsequent steps, uses generic continuation message.
             mock_result: Optional mock result to use instead of executing tool
             system_prompt: Optional system prompt for this step (overrides eval-level)

src/toolvaluator/chained.py

Lines changed: 19 additions & 5 deletions

@@ -136,7 +136,10 @@ def add_step(
         Args:
             expected_tool: Name of tool that should be called in this step
             expected_arguments: Expected arguments (None for wildcards)
-            initial_query: For first step only - the user's initial query
+            initial_query: Optional query for this step. Required for first step.
+                For subsequent steps, if provided, will be used as the
+                query along with previous context. If not provided for
+                subsequent steps, uses generic continuation message.
             mock_result: Optional mock result to use instead of executing tool
                 Overrides any global mock for this specific step

@@ -347,16 +350,27 @@ def evaluate(self) -> dict[str, Any]:

             # Build query for next step using conversation history
             if i + 1 < len(self.steps):
+                next_step = self.steps[i + 1]
                 context_str = "\n".join(
                     [
                         f"Called {item['tool']}({item.get('args', '')}) → {item['result']}"
                         for item in conversation_context
                     ]
                 )
-                current_query = (
-                    f"Previous context:\n{context_str}\n\n"
-                    f"Continue the task to accomplish the original goal."
-                )
+
+                # Check if next step has its own initial_query
+                if next_step.get("initial_query"):
+                    # Use the step's query with context
+                    current_query = (
+                        f"Previous context:\n{context_str}\n\n"
+                        f"{next_step['initial_query']}"
+                    )
+                else:
+                    # Use generic continuation message
+                    current_query = (
+                        f"Previous context:\n{context_str}\n\n"
+                        f"Continue the task to accomplish the original goal."
+                    )

         # Calculate overall score
         tool_scores = [1.0 if r["tool_correct"] else 0.0 for r in step_results]
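
To make the prompt construction concrete, here is a standalone sketch that mirrors the f-strings in the diff above; the conversation_context contents are assumed from the mocks used in the test below:

# Assumed context from a mocked get_location call in step 1
conversation_context = [{"tool": "get_location", "args": {}, "result": "San Francisco"}]
context_str = "\n".join(
    f"Called {item['tool']}({item.get('args', '')}) → {item['result']}"
    for item in conversation_context
)
step_query = "Now get the weather forecast for the next 5 days"
current_query = f"Previous context:\n{context_str}\n\n{step_query}"

# current_query now reads:
# Previous context:
# Called get_location({}) → San Francisco
#
# Now get the weather forecast for the next 5 days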

test_step_query.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@

"""
Demonstration of per-step queries in chained evaluation.
This shows how initial_query can be used for steps after the first one.
"""

from toolvaluator import ChainedEvaluator, get_tool_schemas_sync
from toolvaluator.test_server import mcp


def test_step_queries():
    """Test that per-step queries are properly sent to the model."""
    tool_schemas = get_tool_schemas_sync(mcp)

    # Create a chain with specific queries for each step
    chain = ChainedEvaluator(
        tool_schemas=tool_schemas,
        mcp_server=mcp,
        model_name="openai/gpt-4o-mini",
        api_key="test-key",
        mocks={
            "get_location": "San Francisco",
            "get_weather": "Sunny, 72F",
        },
        verbose=True,
    )

    # Step 1: Get location
    chain.add_step(
        initial_query="What's my location?",
        expected_tool="get_location",
        expected_arguments={},
    )

    # Step 2: Get weather with a SPECIFIC query for this step
    # This query will now be included along with the context
    chain.add_step(
        initial_query="Now get the weather forecast for the next 5 days",
        expected_tool="get_weather",
        expected_arguments={"location": None},  # Wildcard - any location works
    )

    print("=" * 70)
    print("Testing per-step queries in chained evaluation")
    print("=" * 70)
    print()
    print("Expected behavior:")
    print("- Step 1: Uses 'What's my location?' as query")
    print("- Step 2: Should receive:")
    print("  1. Previous context (location result)")
    print("  2. The step-specific query: 'Now get the weather forecast...'")
    print()
    print("This ensures the model gets both context AND step-specific instructions.")
    print()

    # Run evaluation (this will print queries sent to model due to verbose=True)
    result = chain.evaluate()

    print()
    print("=" * 70)
    print("Result:")
    print(f"Overall score: {result['score']:.2f}")
    print("=" * 70)


if __name__ == "__main__":
    test_step_queries()
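
Because the file has a __main__ guard, it can also be run directly as a script (assuming the toolvaluator package and its bundled test_server are importable); with verbose=True, the queries sent to the model are printed, so the combined context-plus-step query for step 2 can be inspected.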
