Fix duplicate tool_result error from Anthropic API

openhands-agent · openhands-agent · commit 3a886857a9a0 · 2026-03-02T07:30:45.000Z
Add ToolResultUniquenessProperty to ensure each tool_call_id has exactly
one tool result in the LLM message context.

The bug occurs when:
1. An agent invokes a tool (creating ActionEvent with tool_call_id)
2. Runtime restarts while tool is running
3. On restart, AgentErrorEvent is created for 'unmatched' action
4. Tool completes and creates ObservationEvent with same tool_call_id
5. Two tool_results for same tool_call_id violates Anthropic API

The fix adds a new view property that deduplicates tool results:
- Groups observations by tool_call_id
- Keeps ObservationEvent over AgentErrorEvent when both exist
- If multiple of same type, keeps the later one

Co-authored-by: openhands &lt;openhands@all-hands.dev&gt;
diff --git a/openhands-sdk/openhands/sdk/context/view/properties/__init__.py b/openhands-sdk/openhands/sdk/context/view/properties/__init__.py
@@ -6,12 +6,16 @@
 from openhands.sdk.context.view.properties.tool_loop_atomicity import (
     ToolLoopAtomicityProperty,
 )
+from openhands.sdk.context.view.properties.tool_result_uniqueness import (
+    ToolResultUniquenessProperty,
+)
 
 
 ALL_PROPERTIES: list[ViewPropertyBase] = [
     BatchAtomicityProperty(),
     ToolCallMatchingProperty(),
     ToolLoopAtomicityProperty(),
+    ToolResultUniquenessProperty(),
 ]
 """A list of all existing properties."""
 
@@ -20,5 +24,6 @@
     "BatchAtomicityProperty",
     "ToolCallMatchingProperty",
     "ToolLoopAtomicityProperty",
+    "ToolResultUniquenessProperty",
     "ALL_PROPERTIES",
 ]
diff --git a/openhands-sdk/openhands/sdk/context/view/properties/tool_result_uniqueness.py b/openhands-sdk/openhands/sdk/context/view/properties/tool_result_uniqueness.py
@@ -0,0 +1,97 @@
+"""Property to ensure each tool_call_id has exactly one tool result.
+
+LLM APIs (especially Anthropic) require that each tool_use has exactly one
+corresponding tool_result. This property handles the edge case where multiple
+observation events (e.g., AgentErrorEvent and ObservationEvent) are created
+for the same tool_call_id, typically due to restarts or race conditions.
+"""
+
+from collections import defaultdict
+from collections.abc import Sequence
+
+from openhands.sdk.context.view.manipulation_indices import ManipulationIndices
+from openhands.sdk.context.view.properties.base import ViewPropertyBase
+from openhands.sdk.event import (
+    AgentErrorEvent,
+    Event,
+    EventID,
+    LLMConvertibleEvent,
+    ObservationBaseEvent,
+    ObservationEvent,
+    ToolCallID,
+)
+
+
+class ToolResultUniquenessProperty(ViewPropertyBase):
+    """Each tool_call_id must have exactly one tool result.
+
+    When multiple observations exist for the same tool_call_id, this property
+    keeps the most informative one:
+    1. ObservationEvent (actual tool result) is preferred over AgentErrorEvent
+    2. If multiple of the same type exist, the later one is kept
+    """
+
+    def enforce(
+        self,
+        current_view_events: list[LLMConvertibleEvent],
+        all_events: Sequence[Event],  # noqa: ARG002
+    ) -> set[EventID]:
+        """Remove duplicate tool results for the same tool_call_id.
+
+        When multiple ObservationBaseEvents share the same tool_call_id,
+        keep only one - preferring ObservationEvent over AgentErrorEvent.
+        """
+        # Group observations by tool_call_id
+        observations_by_tool_call: dict[ToolCallID, list[ObservationBaseEvent]] = (
+            defaultdict(list)
+        )
+
+        for event in current_view_events:
+            if isinstance(event, ObservationBaseEvent):
+                observations_by_tool_call[event.tool_call_id].append(event)
+
+        events_to_remove: set[EventID] = set()
+
+        for tool_call_id, observations in observations_by_tool_call.items():
+            if len(observations) <= 1:
+                continue
+
+            # Multiple observations for same tool_call_id - need to pick one
+            # Priority: ObservationEvent > AgentErrorEvent > others
+            # If same priority, keep the later one (more recent)
+            obs_events = [o for o in observations if isinstance(o, ObservationEvent)]
+            error_events = [o for o in observations if isinstance(o, AgentErrorEvent)]
+            other_events = [
+                o
+                for o in observations
+                if not isinstance(o, (ObservationEvent, AgentErrorEvent))
+            ]
+
+            # Determine which one to keep
+            if obs_events:
+                # Keep the last ObservationEvent, remove all others
+                to_keep = obs_events[-1]
+            elif other_events:
+                # Keep the last "other" event (UserRejectObservation, etc.)
+                to_keep = other_events[-1]
+            else:
+                # Only AgentErrorEvents, keep the last one
+                to_keep = error_events[-1]
+
+            # Mark all others for removal
+            for obs in observations:
+                if obs.id != to_keep.id:
+                    events_to_remove.add(obs.id)
+
+        return events_to_remove
+
+    def manipulation_indices(
+        self,
+        current_view_events: list[LLMConvertibleEvent],
+    ) -> ManipulationIndices:
+        """Calculate manipulation indices for tool result uniqueness.
+
+        This property doesn't restrict manipulation - it only enforces
+        uniqueness when violations are detected.
+        """
+        return ManipulationIndices.complete(current_view_events)
diff --git a/tests/sdk/context/view/properties/test_tool_result_uniqueness.py b/tests/sdk/context/view/properties/test_tool_result_uniqueness.py
@@ -0,0 +1,219 @@
+"""Tests for ToolResultUniquenessProperty.
+
+This module tests that duplicate tool results for the same tool_call_id
+are properly deduplicated, preferring ObservationEvent over AgentErrorEvent.
+"""
+
+from unittest.mock import create_autospec
+
+from openhands.sdk.context.view.properties.tool_result_uniqueness import (
+    ToolResultUniquenessProperty,
+)
+from openhands.sdk.event.base import LLMConvertibleEvent
+from openhands.sdk.event.llm_convertible import (
+    ActionEvent,
+    AgentErrorEvent,
+    ObservationEvent,
+    UserRejectObservation,
+)
+from tests.sdk.context.view.properties.conftest import message_event
+
+
+class TestToolResultUniquenessPropertyEnforcement:
+    """Tests for the enforce method of ToolResultUniquenessProperty."""
+
+    def setup_method(self) -> None:
+        """Set up test fixtures."""
+        self.property = ToolResultUniquenessProperty()
+
+    def test_empty_list(self) -> None:
+        """Test enforce with empty event list."""
+        result = self.property.enforce([], [])
+        assert result == set()
+
+    def test_no_duplicates(self) -> None:
+        """Test enforce when there are no duplicate tool_call_ids."""
+        obs1 = create_autospec(ObservationEvent, instance=True)
+        obs1.tool_call_id = "call_1"
+        obs1.id = "obs_1"
+
+        obs2 = create_autospec(ObservationEvent, instance=True)
+        obs2.tool_call_id = "call_2"
+        obs2.id = "obs_2"
+
+        events: list[LLMConvertibleEvent] = [
+            message_event("Start"),
+            obs1,
+            obs2,
+            message_event("End"),
+        ]
+
+        result = self.property.enforce(events, events)
+        assert result == set()
+
+    def test_duplicate_observation_events(self) -> None:
+        """Test that duplicate ObservationEvents keep the later one."""
+        obs1 = create_autospec(ObservationEvent, instance=True)
+        obs1.tool_call_id = "call_1"
+        obs1.id = "obs_1"
+
+        obs2 = create_autospec(ObservationEvent, instance=True)
+        obs2.tool_call_id = "call_1"  # Same tool_call_id!
+        obs2.id = "obs_2"
+
+        events: list[LLMConvertibleEvent] = [obs1, obs2]
+
+        result = self.property.enforce(events, events)
+        # obs1 should be removed, obs2 (later) should be kept
+        assert result == {"obs_1"}
+
+    def test_observation_event_preferred_over_agent_error(self) -> None:
+        """Test that ObservationEvent is preferred over AgentErrorEvent."""
+        # This is the main bug scenario: AgentErrorEvent created on restart,
+        # then ObservationEvent arrives later with actual result
+        agent_error = AgentErrorEvent(
+            tool_name="terminal",
+            tool_call_id="call_1",
+            error="Restart occurred while tool was running",
+        )
+
+        obs = create_autospec(ObservationEvent, instance=True)
+        obs.tool_call_id = "call_1"  # Same tool_call_id!
+        obs.id = "obs_1"
+
+        events: list[LLMConvertibleEvent] = [agent_error, obs]
+
+        result = self.property.enforce(events, events)
+        # AgentErrorEvent should be removed, ObservationEvent kept
+        assert result == {agent_error.id}
+
+    def test_agent_error_before_observation_event(self) -> None:
+        """Test AgentErrorEvent followed by ObservationEvent (restart scenario)."""
+        # Simulates: restart creates AgentErrorEvent, then actual result arrives
+        action = create_autospec(ActionEvent, instance=True)
+        action.tool_call_id = "call_1"
+        action.id = "action_1"
+        action.llm_response_id = "response_1"
+
+        agent_error = AgentErrorEvent(
+            tool_name="terminal",
+            tool_call_id="call_1",
+            error="A restart occurred while this tool was in progress.",
+        )
+
+        obs = create_autospec(ObservationEvent, instance=True)
+        obs.tool_call_id = "call_1"
+        obs.id = "obs_1"
+
+        events: list[LLMConvertibleEvent] = [
+            message_event("User message"),
+            action,
+            agent_error,
+            obs,  # Actual result arrives later
+        ]
+
+        result = self.property.enforce(events, events)
+        # AgentErrorEvent should be removed since we have actual ObservationEvent
+        assert result == {agent_error.id}
+
+    def test_multiple_agent_errors_keep_last(self) -> None:
+        """Test that when only AgentErrorEvents exist, the last one is kept."""
+        error1 = AgentErrorEvent(
+            tool_name="terminal",
+            tool_call_id="call_1",
+            error="First error",
+        )
+
+        error2 = AgentErrorEvent(
+            tool_name="terminal",
+            tool_call_id="call_1",
+            error="Second error",
+        )
+
+        events: list[LLMConvertibleEvent] = [error1, error2]
+
+        result = self.property.enforce(events, events)
+        # First error should be removed, second (later) should be kept
+        assert result == {error1.id}
+
+    def test_user_reject_observation_handling(self) -> None:
+        """Test that UserRejectObservation is handled correctly."""
+        reject = UserRejectObservation(
+            tool_name="terminal",
+            tool_call_id="call_1",
+            action_id="action_1",
+            rejection_reason="User rejected",
+        )
+
+        obs = create_autospec(ObservationEvent, instance=True)
+        obs.tool_call_id = "call_1"
+        obs.id = "obs_1"
+
+        events: list[LLMConvertibleEvent] = [reject, obs]
+
+        result = self.property.enforce(events, events)
+        # ObservationEvent is preferred over UserRejectObservation
+        assert result == {reject.id}
+
+    def test_mixed_scenario_multiple_tool_calls(self) -> None:
+        """Test with multiple tool calls, some with duplicates."""
+        # Tool call 1: has duplicate (error + observation)
+        error1 = AgentErrorEvent(
+            tool_name="terminal",
+            tool_call_id="call_1",
+            error="Restart error",
+        )
+        obs1 = create_autospec(ObservationEvent, instance=True)
+        obs1.tool_call_id = "call_1"
+        obs1.id = "obs_1"
+
+        # Tool call 2: single observation (no duplicate)
+        obs2 = create_autospec(ObservationEvent, instance=True)
+        obs2.tool_call_id = "call_2"
+        obs2.id = "obs_2"
+
+        # Tool call 3: single error (no duplicate)
+        error3 = AgentErrorEvent(
+            tool_name="file_editor",
+            tool_call_id="call_3",
+            error="Tool not found",
+        )
+
+        events: list[LLMConvertibleEvent] = [
+            message_event("Start"),
+            error1,
+            obs1,
+            obs2,
+            error3,
+        ]
+
+        result = self.property.enforce(events, events)
+        # Only error1 should be removed (duplicate with obs1)
+        assert result == {error1.id}
+
+
+class TestToolResultUniquenessPropertyManipulationIndices:
+    """Tests for the manipulation_indices method."""
+
+    def setup_method(self) -> None:
+        """Set up test fixtures."""
+        self.property = ToolResultUniquenessProperty()
+
+    def test_complete_indices_returned(self) -> None:
+        """Test that manipulation indices are complete (no restrictions)."""
+        obs = create_autospec(ObservationEvent, instance=True)
+        obs.tool_call_id = "call_1"
+        obs.id = "obs_1"
+
+        events: list[LLMConvertibleEvent] = [
+            message_event("Start"),
+            obs,
+            message_event("End"),
+        ]
+
+        result = self.property.manipulation_indices(events)
+        # Should have indices 0, 1, 2, 3 (all positions)
+        assert 0 in result
+        assert 1 in result
+        assert 2 in result
+        assert 3 in result