Skip to content

Commit 33b92b3

Browse files
feat: only include latest summarization and acknowledgement (#31)
* Only include the latest summarization and acknowledgement.
* Add a test.
* Add an extra test and update takewhile to include the assistant msg.
* Remove test_summarize_messages_strips_old_summarie as it overlaps with the new test.
1 parent 09aadfe commit 33b92b3

File tree

4 files changed

+105
-4
lines changed

4 files changed

+105
-4
lines changed

src/stirrup/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
ImageContentBlock,
4040
LLMClient,
4141
SubAgentMetadata,
42+
SummaryMessage,
4243
SystemMessage,
4344
TokenUsage,
4445
Tool,
@@ -63,6 +64,7 @@
6364
"ImageContentBlock",
6465
"LLMClient",
6566
"SubAgentMetadata",
67+
"SummaryMessage",
6668
"SystemMessage",
6769
"TokenUsage",
6870
"Tool",

src/stirrup/core/agent.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ImageContentBlock,
3030
LLMClient,
3131
SubAgentMetadata,
32+
SummaryMessage,
3233
SystemMessage,
3334
TokenUsage,
3435
Tool,
@@ -1036,15 +1037,18 @@ async def step(
10361037

10371038
async def summarize_messages(self, messages: list[ChatMessage]) -> list[ChatMessage]:
10381039
"""Condense message history using LLM to stay within context window."""
1039-
task_context: list[ChatMessage] = list(takewhile(lambda m: not isinstance(m, AssistantMessage), messages))
1040+
task_context: list[ChatMessage] = list(
1041+
takewhile(lambda m: not isinstance(m, (AssistantMessage, SummaryMessage)), messages)
1042+
)
10401043

10411044
summary_prompt = [*messages, UserMessage(content=MESSAGE_SUMMARIZER)]
10421045

10431046
# We need to pass the tools to the client so that it has context of tools used in the conversation
10441047
summary = await self._client.generate(summary_prompt, self._active_tools)
10451048

10461049
summary_bridge_prompt = MESSAGE_SUMMARIZER_BRIDGE_TEMPLATE.format(summary=summary.content)
1047-
summary_bridge = UserMessage(content=summary_bridge_prompt)
1050+
summary_bridge = SummaryMessage(content=summary_bridge_prompt)
1051+
# UserMessage (not AssistantMessage) to avoid consecutive assistant messages which some providers reject
10481052
acknowledgement_msg = UserMessage(content="Got it, thanks!")
10491053

10501054
# Log the completed summary

src/stirrup/core/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,12 @@ class UserMessage(BaseModel):
600600
content: Content
601601

602602

603+
class SummaryMessage(UserMessage):
604+
"""Summary message to the LLM."""
605+
606+
pass
607+
608+
603609
class Reasoning(BaseModel):
604610
"""Extended thinking/reasoning content from models that support chain-of-thought reasoning."""
605611

tests/test_agent.py

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
AssistantMessage,
99
ChatMessage,
1010
LLMClient,
11+
SummaryMessage,
1112
SystemMessage,
1213
TokenUsage,
1314
Tool,
@@ -22,17 +23,18 @@
2223
class MockLLMClient(LLMClient):
2324
"""Mock LLM client for testing."""
2425

25-
def __init__(self, responses: list[AssistantMessage]) -> None:
26+
def __init__(self, responses: list[AssistantMessage], max_tokens: int = 100_000) -> None:
2627
self.responses = responses
2728
self.call_count = 0
29+
self._max_tokens = max_tokens
2830

2931
@property
3032
def model_slug(self) -> str:
3133
return "mock-model"
3234

3335
@property
3436
def max_tokens(self) -> int:
35-
return 100_000
37+
return self._max_tokens
3638

3739
async def generate(self, messages: list[ChatMessage], tools: dict[str, Tool]) -> AssistantMessage: # noqa: ARG002
3840
response = self.responses[self.call_count]
@@ -493,3 +495,90 @@ async def test_allow_successive_assistant_messages() -> None:
493495
messages = message_history[0]
494496
continue_messages = [m for m in messages if isinstance(m, UserMessage) and m.content == "Please continue the task"]
495497
assert len(continue_messages) == 0
498+
499+
500+
async def test_summarize_history_has_one_summary_per_trajectory() -> None:
501+
"""Test that each sub-trajectory in history contains at most one SummaryMessage.
502+
503+
Simulates an agent run where summarization triggers twice. Verifies:
504+
- history[0] (pre-first-summary) has 0 SummaryMessages
505+
- history[1] (post-first-summary) has exactly 1 SummaryMessage
506+
- history[2] (post-second-summary, final) has exactly 1 SummaryMessage
507+
"""
508+
# max_tokens=1000 and cutoff=0.3 means summarization triggers when
509+
# token_usage.total >= 300. Turns without tool calls also trigger
510+
# "Please continue" messages from block_successive_assistant_messages.
511+
512+
responses = [
513+
# Turn 1: high token usage triggers first summarization
514+
AssistantMessage(
515+
content="Working on it",
516+
tool_calls=[],
517+
token_usage=TokenUsage(input=250, answer=100), # total=350 >= 300
518+
),
519+
# First summarization generate call
520+
AssistantMessage(
521+
content="First summary of progress.",
522+
tool_calls=[],
523+
token_usage=TokenUsage(input=200, answer=50),
524+
),
525+
# Turn 2: high token usage triggers second summarization
526+
AssistantMessage(
527+
content="Continuing work",
528+
tool_calls=[],
529+
token_usage=TokenUsage(input=250, answer=100), # total=350 >= 300
530+
),
531+
# Second summarization generate call
532+
AssistantMessage(
533+
content="Second summary of progress.",
534+
tool_calls=[],
535+
token_usage=TokenUsage(input=200, answer=50),
536+
),
537+
# Turn 3: finish
538+
AssistantMessage(
539+
content="Done",
540+
tool_calls=[
541+
ToolCall(
542+
name=FINISH_TOOL_NAME,
543+
arguments='{"reason": "Completed", "paths": []}',
544+
tool_call_id="call_finish",
545+
)
546+
],
547+
token_usage=TokenUsage(input=100, answer=50),
548+
),
549+
]
550+
551+
client = MockLLMClient(responses, max_tokens=1000)
552+
553+
agent = Agent(
554+
client=client,
555+
name="test-agent",
556+
max_turns=10,
557+
turns_remaining_warning_threshold=2,
558+
tools=[],
559+
finish_tool=SIMPLE_FINISH_TOOL,
560+
context_summarization_cutoff=0.3,
561+
)
562+
563+
async with agent.session() as session:
564+
_finish_params, history, _ = await session.run(
565+
[SystemMessage(content="System prompt"), UserMessage(content="Do the task")]
566+
)
567+
568+
# Should have 3 sub-trajectories: pre-summary, post-1st-summary, post-2nd-summary (final)
569+
assert len(history) == 3
570+
571+
# history[0]: original conversation before first summarization — no summaries
572+
summaries_0 = [m for m in history[0] if isinstance(m, SummaryMessage)]
573+
assert len(summaries_0) == 0
574+
575+
# history[1]: after first summarization — exactly 1 SummaryMessage
576+
summaries_1 = [m for m in history[1] if isinstance(m, SummaryMessage)]
577+
assert len(summaries_1) == 1
578+
579+
# history[2]: after second summarization — exactly 1 SummaryMessage (not 2)
580+
summaries_2 = [m for m in history[2] if isinstance(m, SummaryMessage)]
581+
assert len(summaries_2) == 1
582+
583+
# The summary content should be different between history[1] and history[2]
584+
assert summaries_1[0].content != summaries_2[0].content

0 commit comments

Comments (0)