Skip to content

Commit 3a88685

Browse files
Fix duplicate tool_result error from Anthropic API
Add ToolResultUniquenessProperty to ensure each tool_call_id has exactly one tool result in the LLM message context. The bug occurs when: 1. An agent invokes a tool (creating ActionEvent with tool_call_id) 2. Runtime restarts while tool is running 3. On restart, AgentErrorEvent is created for 'unmatched' action 4. Tool completes and creates ObservationEvent with same tool_call_id 5. Two tool_results for same tool_call_id violates Anthropic API The fix adds a new view property that deduplicates tool results: - Groups observations by tool_call_id - Keeps ObservationEvent over AgentErrorEvent when both exist - If multiple of same type, keeps the later one Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 2b54375 commit 3a88685

File tree

3 files changed

+321
-0
lines changed

3 files changed

+321
-0
lines changed

openhands-sdk/openhands/sdk/context/view/properties/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,16 @@
66
from openhands.sdk.context.view.properties.tool_loop_atomicity import (
77
ToolLoopAtomicityProperty,
88
)
9+
from openhands.sdk.context.view.properties.tool_result_uniqueness import (
10+
ToolResultUniquenessProperty,
11+
)
912

1013

1114
ALL_PROPERTIES: list[ViewPropertyBase] = [
1215
BatchAtomicityProperty(),
1316
ToolCallMatchingProperty(),
1417
ToolLoopAtomicityProperty(),
18+
ToolResultUniquenessProperty(),
1519
]
1620
"""A list of all existing properties."""
1721

@@ -20,5 +24,6 @@
2024
"BatchAtomicityProperty",
2125
"ToolCallMatchingProperty",
2226
"ToolLoopAtomicityProperty",
27+
"ToolResultUniquenessProperty",
2328
"ALL_PROPERTIES",
2429
]
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""Property to ensure each tool_call_id has exactly one tool result.
2+
3+
LLM APIs (especially Anthropic) require that each tool_use has exactly one
4+
corresponding tool_result. This property handles the edge case where multiple
5+
observation events (e.g., AgentErrorEvent and ObservationEvent) are created
6+
for the same tool_call_id, typically due to restarts or race conditions.
7+
"""
8+
9+
from collections import defaultdict
10+
from collections.abc import Sequence
11+
12+
from openhands.sdk.context.view.manipulation_indices import ManipulationIndices
13+
from openhands.sdk.context.view.properties.base import ViewPropertyBase
14+
from openhands.sdk.event import (
15+
AgentErrorEvent,
16+
Event,
17+
EventID,
18+
LLMConvertibleEvent,
19+
ObservationBaseEvent,
20+
ObservationEvent,
21+
ToolCallID,
22+
)
23+
24+
25+
class ToolResultUniquenessProperty(ViewPropertyBase):
26+
"""Each tool_call_id must have exactly one tool result.
27+
28+
When multiple observations exist for the same tool_call_id, this property
29+
keeps the most informative one:
30+
1. ObservationEvent (actual tool result) is preferred over AgentErrorEvent
31+
2. If multiple of the same type exist, the later one is kept
32+
"""
33+
34+
def enforce(
35+
self,
36+
current_view_events: list[LLMConvertibleEvent],
37+
all_events: Sequence[Event], # noqa: ARG002
38+
) -> set[EventID]:
39+
"""Remove duplicate tool results for the same tool_call_id.
40+
41+
When multiple ObservationBaseEvents share the same tool_call_id,
42+
keep only one - preferring ObservationEvent over AgentErrorEvent.
43+
"""
44+
# Group observations by tool_call_id
45+
observations_by_tool_call: dict[ToolCallID, list[ObservationBaseEvent]] = (
46+
defaultdict(list)
47+
)
48+
49+
for event in current_view_events:
50+
if isinstance(event, ObservationBaseEvent):
51+
observations_by_tool_call[event.tool_call_id].append(event)
52+
53+
events_to_remove: set[EventID] = set()
54+
55+
for tool_call_id, observations in observations_by_tool_call.items():
56+
if len(observations) <= 1:
57+
continue
58+
59+
# Multiple observations for same tool_call_id - need to pick one
60+
# Priority: ObservationEvent > AgentErrorEvent > others
61+
# If same priority, keep the later one (more recent)
62+
obs_events = [o for o in observations if isinstance(o, ObservationEvent)]
63+
error_events = [o for o in observations if isinstance(o, AgentErrorEvent)]
64+
other_events = [
65+
o
66+
for o in observations
67+
if not isinstance(o, (ObservationEvent, AgentErrorEvent))
68+
]
69+
70+
# Determine which one to keep
71+
if obs_events:
72+
# Keep the last ObservationEvent, remove all others
73+
to_keep = obs_events[-1]
74+
elif other_events:
75+
# Keep the last "other" event (UserRejectObservation, etc.)
76+
to_keep = other_events[-1]
77+
else:
78+
# Only AgentErrorEvents, keep the last one
79+
to_keep = error_events[-1]
80+
81+
# Mark all others for removal
82+
for obs in observations:
83+
if obs.id != to_keep.id:
84+
events_to_remove.add(obs.id)
85+
86+
return events_to_remove
87+
88+
def manipulation_indices(
89+
self,
90+
current_view_events: list[LLMConvertibleEvent],
91+
) -> ManipulationIndices:
92+
"""Calculate manipulation indices for tool result uniqueness.
93+
94+
This property doesn't restrict manipulation - it only enforces
95+
uniqueness when violations are detected.
96+
"""
97+
return ManipulationIndices.complete(current_view_events)
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
"""Tests for ToolResultUniquenessProperty.
2+
3+
This module tests that duplicate tool results for the same tool_call_id
4+
are properly deduplicated, preferring ObservationEvent over AgentErrorEvent.
5+
"""
6+
7+
from unittest.mock import create_autospec
8+
9+
from openhands.sdk.context.view.properties.tool_result_uniqueness import (
10+
ToolResultUniquenessProperty,
11+
)
12+
from openhands.sdk.event.base import LLMConvertibleEvent
13+
from openhands.sdk.event.llm_convertible import (
14+
ActionEvent,
15+
AgentErrorEvent,
16+
ObservationEvent,
17+
UserRejectObservation,
18+
)
19+
from tests.sdk.context.view.properties.conftest import message_event
20+
21+
22+
class TestToolResultUniquenessPropertyEnforcement:
23+
"""Tests for the enforce method of ToolResultUniquenessProperty."""
24+
25+
def setup_method(self) -> None:
26+
"""Set up test fixtures."""
27+
self.property = ToolResultUniquenessProperty()
28+
29+
def test_empty_list(self) -> None:
30+
"""Test enforce with empty event list."""
31+
result = self.property.enforce([], [])
32+
assert result == set()
33+
34+
def test_no_duplicates(self) -> None:
35+
"""Test enforce when there are no duplicate tool_call_ids."""
36+
obs1 = create_autospec(ObservationEvent, instance=True)
37+
obs1.tool_call_id = "call_1"
38+
obs1.id = "obs_1"
39+
40+
obs2 = create_autospec(ObservationEvent, instance=True)
41+
obs2.tool_call_id = "call_2"
42+
obs2.id = "obs_2"
43+
44+
events: list[LLMConvertibleEvent] = [
45+
message_event("Start"),
46+
obs1,
47+
obs2,
48+
message_event("End"),
49+
]
50+
51+
result = self.property.enforce(events, events)
52+
assert result == set()
53+
54+
def test_duplicate_observation_events(self) -> None:
55+
"""Test that duplicate ObservationEvents keep the later one."""
56+
obs1 = create_autospec(ObservationEvent, instance=True)
57+
obs1.tool_call_id = "call_1"
58+
obs1.id = "obs_1"
59+
60+
obs2 = create_autospec(ObservationEvent, instance=True)
61+
obs2.tool_call_id = "call_1" # Same tool_call_id!
62+
obs2.id = "obs_2"
63+
64+
events: list[LLMConvertibleEvent] = [obs1, obs2]
65+
66+
result = self.property.enforce(events, events)
67+
# obs1 should be removed, obs2 (later) should be kept
68+
assert result == {"obs_1"}
69+
70+
def test_observation_event_preferred_over_agent_error(self) -> None:
71+
"""Test that ObservationEvent is preferred over AgentErrorEvent."""
72+
# This is the main bug scenario: AgentErrorEvent created on restart,
73+
# then ObservationEvent arrives later with actual result
74+
agent_error = AgentErrorEvent(
75+
tool_name="terminal",
76+
tool_call_id="call_1",
77+
error="Restart occurred while tool was running",
78+
)
79+
80+
obs = create_autospec(ObservationEvent, instance=True)
81+
obs.tool_call_id = "call_1" # Same tool_call_id!
82+
obs.id = "obs_1"
83+
84+
events: list[LLMConvertibleEvent] = [agent_error, obs]
85+
86+
result = self.property.enforce(events, events)
87+
# AgentErrorEvent should be removed, ObservationEvent kept
88+
assert result == {agent_error.id}
89+
90+
def test_agent_error_before_observation_event(self) -> None:
91+
"""Test AgentErrorEvent followed by ObservationEvent (restart scenario)."""
92+
# Simulates: restart creates AgentErrorEvent, then actual result arrives
93+
action = create_autospec(ActionEvent, instance=True)
94+
action.tool_call_id = "call_1"
95+
action.id = "action_1"
96+
action.llm_response_id = "response_1"
97+
98+
agent_error = AgentErrorEvent(
99+
tool_name="terminal",
100+
tool_call_id="call_1",
101+
error="A restart occurred while this tool was in progress.",
102+
)
103+
104+
obs = create_autospec(ObservationEvent, instance=True)
105+
obs.tool_call_id = "call_1"
106+
obs.id = "obs_1"
107+
108+
events: list[LLMConvertibleEvent] = [
109+
message_event("User message"),
110+
action,
111+
agent_error,
112+
obs, # Actual result arrives later
113+
]
114+
115+
result = self.property.enforce(events, events)
116+
# AgentErrorEvent should be removed since we have actual ObservationEvent
117+
assert result == {agent_error.id}
118+
119+
def test_multiple_agent_errors_keep_last(self) -> None:
120+
"""Test that when only AgentErrorEvents exist, the last one is kept."""
121+
error1 = AgentErrorEvent(
122+
tool_name="terminal",
123+
tool_call_id="call_1",
124+
error="First error",
125+
)
126+
127+
error2 = AgentErrorEvent(
128+
tool_name="terminal",
129+
tool_call_id="call_1",
130+
error="Second error",
131+
)
132+
133+
events: list[LLMConvertibleEvent] = [error1, error2]
134+
135+
result = self.property.enforce(events, events)
136+
# First error should be removed, second (later) should be kept
137+
assert result == {error1.id}
138+
139+
def test_user_reject_observation_handling(self) -> None:
140+
"""Test that UserRejectObservation is handled correctly."""
141+
reject = UserRejectObservation(
142+
tool_name="terminal",
143+
tool_call_id="call_1",
144+
action_id="action_1",
145+
rejection_reason="User rejected",
146+
)
147+
148+
obs = create_autospec(ObservationEvent, instance=True)
149+
obs.tool_call_id = "call_1"
150+
obs.id = "obs_1"
151+
152+
events: list[LLMConvertibleEvent] = [reject, obs]
153+
154+
result = self.property.enforce(events, events)
155+
# ObservationEvent is preferred over UserRejectObservation
156+
assert result == {reject.id}
157+
158+
def test_mixed_scenario_multiple_tool_calls(self) -> None:
159+
"""Test with multiple tool calls, some with duplicates."""
160+
# Tool call 1: has duplicate (error + observation)
161+
error1 = AgentErrorEvent(
162+
tool_name="terminal",
163+
tool_call_id="call_1",
164+
error="Restart error",
165+
)
166+
obs1 = create_autospec(ObservationEvent, instance=True)
167+
obs1.tool_call_id = "call_1"
168+
obs1.id = "obs_1"
169+
170+
# Tool call 2: single observation (no duplicate)
171+
obs2 = create_autospec(ObservationEvent, instance=True)
172+
obs2.tool_call_id = "call_2"
173+
obs2.id = "obs_2"
174+
175+
# Tool call 3: single error (no duplicate)
176+
error3 = AgentErrorEvent(
177+
tool_name="file_editor",
178+
tool_call_id="call_3",
179+
error="Tool not found",
180+
)
181+
182+
events: list[LLMConvertibleEvent] = [
183+
message_event("Start"),
184+
error1,
185+
obs1,
186+
obs2,
187+
error3,
188+
]
189+
190+
result = self.property.enforce(events, events)
191+
# Only error1 should be removed (duplicate with obs1)
192+
assert result == {error1.id}
193+
194+
195+
class TestToolResultUniquenessPropertyManipulationIndices:
196+
"""Tests for the manipulation_indices method."""
197+
198+
def setup_method(self) -> None:
199+
"""Set up test fixtures."""
200+
self.property = ToolResultUniquenessProperty()
201+
202+
def test_complete_indices_returned(self) -> None:
203+
"""Test that manipulation indices are complete (no restrictions)."""
204+
obs = create_autospec(ObservationEvent, instance=True)
205+
obs.tool_call_id = "call_1"
206+
obs.id = "obs_1"
207+
208+
events: list[LLMConvertibleEvent] = [
209+
message_event("Start"),
210+
obs,
211+
message_event("End"),
212+
]
213+
214+
result = self.property.manipulation_indices(events)
215+
# Should have indices 0, 1, 2, 3 (all positions)
216+
assert 0 in result
217+
assert 1 in result
218+
assert 2 in result
219+
assert 3 in result

0 commit comments

Comments
 (0)