chore: keep tool call/response pairs together in summarize_when_long (avoid tool responses with no matching tool call)

GangGreenTemperTatum · GangGreenTemperTatum · commit 13b662ccd674 · 2026-01-21T11:40:35.000-05:00
diff --git a/dreadnode/agent/hooks/summarize.py b/dreadnode/agent/hooks/summarize.py
@@ -44,12 +44,56 @@ def _get_last_input_tokens(event: AgentEvent) -> int:
     return last_generation_event.usage.input_tokens if last_generation_event.usage else 0
 
 
+def _find_tool_aware_boundary(
+    messages: list[rg.Message],
+    min_messages_to_keep: int,
+) -> int:
+    """
+    Find the latest summarization boundary that keeps tool call/response pairs intact.
+
+    Strict APIs (OpenAI, Anthropic) reject a tool response whose tool_call_id has no
+    matching assistant tool call, so a kept response must never outlive its call.
+
+    Args:
+        messages: Messages to analyze (excluding the system message).
+        min_messages_to_keep: Minimum trailing messages to keep (negatives act as 0).
+
+    Returns:
+        Index to split at (messages[:idx] summarized, messages[idx:] kept);
+        0 when no boundary above the start of the list is valid.
+    """
+    # Map each tool_call_id to the index of the assistant message that issued it.
+    call_index: dict[str, int] = {}
+    for i, msg in enumerate(messages):
+        if msg.role == "assistant":
+            for tc in getattr(msg, "tool_calls", None) or []:
+                if hasattr(tc, "id"):
+                    call_index[tc.id] = i
+
+    total = len(messages)
+    # earliest_call[b]: lowest call index required by any tool response in messages[b:].
+    # One backward pass replaces re-scanning the kept suffix for every candidate.
+    earliest_call = [total] * (total + 1)
+    for j in range(total - 1, -1, -1):
+        msg, needed = messages[j], total
+        if msg.role == "tool" and hasattr(msg, "tool_call_id"):
+            needed = call_index.get(msg.tool_call_id, total)
+        earliest_call[j] = min(earliest_call[j + 1], needed)
+
+    # Walk back from the desired split; index 0 keeps everything and is always valid.
+    for boundary in range(total - max(min_messages_to_keep, 0), -1, -1):
+        if earliest_call[boundary] >= boundary:
+            return boundary
+    return 0
+
+
 @component
 def summarize_when_long(
     model: str | rg.Generator | None = None,
     max_tokens: int = 100_000,
     min_messages_to_keep: int = 5,
     guidance: str = "",
+    preserve_tool_pairs: bool = True,
 ) -> "Hook":
     """
     Creates a hook to manage the agent's context window by summarizing the conversation history.
@@ -66,6 +110,9 @@ def summarize_when_long(
             (default is None, meaning no proactive summarization).
         min_messages_to_keep: The minimum number of messages to retain after summarization (default is 5).
         guidance: Additional guidance for the summarization process (default is "").
+        preserve_tool_pairs: If True, ensures tool call/response pairs stay together to avoid breaking
+            strict API requirements (OpenAI, Anthropic). Defaults to True. Set to False to use legacy
+            behavior that may break tool pairs but allows more aggressive summarization.
     """
 
     if min_messages_to_keep < 2:
@@ -91,6 +138,10 @@ async def summarize_when_long(  # noqa: PLR0912
             guidance,
             help="Additional guidance for the summarization process",
         ),
+        preserve_tool_pairs: bool = Config(
+            preserve_tool_pairs,
+            help="Preserve tool call/response pairs to avoid breaking strict API requirements",
+        ),
     ) -> Reaction | None:
         should_summarize = False
 
@@ -123,26 +174,30 @@ async def summarize_when_long(  # noqa: PLR0912
             messages.pop(0) if messages and messages[0].role == "system" else None
         )
 
-        # Find the best point to summarize by walking the message list once.
-        # A boundary is valid after a simple assistant message or a finished tool block.
-        best_summarize_boundary = 0
-        for i, message in enumerate(messages):
-            # If the remaining messages are less than or equal to our minimum, we can't slice any further.
-            if len(messages) - i <= min_messages_to_keep:
-                break
-
-            # Condition 1: The message is an assistant response without tool calls.
-            is_simple_assistant = message.role == "assistant" and not getattr(
-                message, "tool_calls", None
-            )
-
-            # Condition 2: The message is the last in a block of tool responses.
-            is_last_tool_in_block = message.role == "tool" and (
-                i + 1 == len(messages) or messages[i + 1].role != "tool"
-            )
-
-            if is_simple_assistant or is_last_tool_in_block:
-                best_summarize_boundary = i + 1
+        # Find the best point to summarize
+        if preserve_tool_pairs:
+            # Use tool-aware boundary finding to prevent breaking tool call/response pairs
+            best_summarize_boundary = _find_tool_aware_boundary(messages, min_messages_to_keep)
+        else:
+            # Legacy behavior: walk the message list once looking for simple boundaries
+            best_summarize_boundary = 0
+            for i, message in enumerate(messages):
+                # If the remaining messages are less than or equal to our minimum, we can't slice any further.
+                if len(messages) - i <= min_messages_to_keep:
+                    break
+
+                # Condition 1: The message is an assistant response without tool calls.
+                is_simple_assistant = message.role == "assistant" and not getattr(
+                    message, "tool_calls", None
+                )
+
+                # Condition 2: The message is the last in a block of tool responses.
+                is_last_tool_in_block = message.role == "tool" and (
+                    i + 1 == len(messages) or messages[i + 1].role != "tool"
+                )
+
+                if is_simple_assistant or is_last_tool_in_block:
+                    best_summarize_boundary = i + 1
 
         if best_summarize_boundary == 0:
             return None  # No valid slice point was found.
diff --git a/tests/test_preserve_tool_pairs.py b/tests/test_preserve_tool_pairs.py
@@ -0,0 +1,85 @@
+"""Tests for preserve_tool_pairs functionality in summarize_when_long hook."""
+
+import rigging as rg
+from dreadnode.agent.hooks.summarize import _find_tool_aware_boundary
+
+
+class ToolCall:
+    """Minimal tool call stand-in for testing; exposes only the `id` attribute."""
+    def __init__(self, call_id: str):
+        self.id = call_id
+
+
+class ToolMessage(rg.Message):
+    """Tool-role response message for testing, tagged with the `tool_call_id` it answers."""
+    def __init__(self, call_id: str, content: str):
+        super().__init__("tool", content)
+        self.tool_call_id = call_id
+
+
+def test_preserves_tool_pairs():
+    """Boundary moves back so a kept tool response keeps its tool call."""
+    messages = [
+        rg.Message("user", "Hello"),
+        rg.Message("assistant", "Let me check", tool_calls=[ToolCall("call_1")]),
+        ToolMessage("call_1", "Result"),
+        rg.Message("assistant", "Done"),
+        rg.Message("user", "Thanks"),
+    ]
+
+    # Keeping 3 puts the desired split at index 2, between the call and its response.
+    boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=3)
+
+    assert boundary == 1, "Boundary should preserve tool call/response pair"
+
+
+def test_no_tools():
+    """A conversation without tool messages splits at the desired point."""
+    turns = [
+        ("user", "Hello"),
+        ("assistant", "Hi"),
+        ("user", "How are you"),
+        ("assistant", "Good"),
+    ]
+    messages = [rg.Message(role, text) for role, text in turns]
+    split_at = _find_tool_aware_boundary(messages, min_messages_to_keep=2)
+    assert split_at == 2, "Should split at natural boundary"
+
+
+def test_multiple_tool_pairs():
+    """Handles multiple tool call/response pairs correctly."""
+    messages = [
+        rg.Message("user", "Do A and B"),
+        rg.Message("assistant", "Running A", tool_calls=[ToolCall("a")]),
+        ToolMessage("a", "A done"),
+        rg.Message("assistant", "Running B", tool_calls=[ToolCall("b")]),
+        ToolMessage("b", "B done"),
+        rg.Message("user", "Thanks"),
+    ]
+
+    boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2)
+
+    # Index 4 would orphan the "b" response, so the split backs up to its call.
+    assert boundary == 3, "Should not split between any tool pairs"
+    assert len(messages[boundary:]) >= 2, "Should keep minimum messages"
+
+
+def test_no_valid_boundary():
+    """Returns 0 when every candidate split would orphan a tool response."""
+    messages = [
+        rg.Message("assistant", "Start", tool_calls=[ToolCall("1")]),
+        ToolMessage("1", "Result 1"),
+        rg.Message("assistant", "Continue", tool_calls=[ToolCall("2")]),
+        ToolMessage("2", "Result 2"),
+    ]
+
+    boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=3)
+    assert boundary == 0, "Should keep everything when no valid split exists"
+
+
+if __name__ == "__main__":
+    # Allow running this file directly, without pytest.
+    for check in (test_preserves_tool_pairs, test_no_tools,
+                  test_multiple_tool_pairs, test_no_valid_boundary):
+        check()
+    print("All tests passed")