-
Notifications
You must be signed in to change notification settings - Fork 218
Fix duplicate tool_result error from Anthropic API #2256
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from 1 commit
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
3a88685
Fix duplicate tool_result error from Anthropic API
openhands-agent a48436a
Merge AgentErrorEvent content into ObservationEvent instead of select…
openhands-agent a50cbf6
Address PR review comments for tool result uniqueness
openhands-agent 029ebc4
Limit tool result uniqueness handling to consecutive duplicates only
openhands-agent d66d74d
Merge branch 'main' into fix-duplicate-tool-result-error
xingyaoww b7b1e6b
Simplify: move duplicate observation deduplication to events_to_messages
openhands-agent b75c6e3
Remove unnecessary logger from base.py and document SDK logger prefer…
openhands-agent b99b0bd
Merge duplicate observation content instead of selecting best
openhands-agent File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 97 additions & 0 deletions
97
openhands-sdk/openhands/sdk/context/view/properties/tool_result_uniqueness.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| """Property to ensure each tool_call_id has exactly one tool result. | ||
|
|
||
| LLM APIs (especially Anthropic) require that each tool_use has exactly one | ||
| corresponding tool_result. This property handles the edge case where multiple | ||
| observation events (e.g., AgentErrorEvent and ObservationEvent) are created | ||
| for the same tool_call_id, typically due to restarts or race conditions. | ||
| """ | ||
|
|
||
| from collections import defaultdict | ||
| from collections.abc import Sequence | ||
|
|
||
| from openhands.sdk.context.view.manipulation_indices import ManipulationIndices | ||
| from openhands.sdk.context.view.properties.base import ViewPropertyBase | ||
| from openhands.sdk.event import ( | ||
| AgentErrorEvent, | ||
| Event, | ||
| EventID, | ||
| LLMConvertibleEvent, | ||
| ObservationBaseEvent, | ||
| ObservationEvent, | ||
| ToolCallID, | ||
| ) | ||
|
|
||
|
|
||
| class ToolResultUniquenessProperty(ViewPropertyBase): | ||
| """Each tool_call_id must have exactly one tool result. | ||
|
|
||
| When multiple observations exist for the same tool_call_id, this property | ||
| keeps the most informative one: | ||
| 1. ObservationEvent (actual tool result) is preferred over AgentErrorEvent | ||
| 2. If multiple of the same type exist, the later one is kept | ||
| """ | ||
|
|
||
| def enforce( | ||
| self, | ||
| current_view_events: list[LLMConvertibleEvent], | ||
| all_events: Sequence[Event], # noqa: ARG002 | ||
| ) -> set[EventID]: | ||
| """Remove duplicate tool results for the same tool_call_id. | ||
|
|
||
| When multiple ObservationBaseEvents share the same tool_call_id, | ||
| keep only one - preferring ObservationEvent over AgentErrorEvent. | ||
| """ | ||
| # Group observations by tool_call_id | ||
| observations_by_tool_call: dict[ToolCallID, list[ObservationBaseEvent]] = ( | ||
| defaultdict(list) | ||
| ) | ||
|
|
||
| for event in current_view_events: | ||
| if isinstance(event, ObservationBaseEvent): | ||
| observations_by_tool_call[event.tool_call_id].append(event) | ||
|
|
||
| events_to_remove: set[EventID] = set() | ||
|
|
||
| for tool_call_id, observations in observations_by_tool_call.items(): | ||
| if len(observations) <= 1: | ||
| continue | ||
|
|
||
| # Multiple observations for same tool_call_id - need to pick one | ||
| # Priority: ObservationEvent > AgentErrorEvent > others | ||
| # If same priority, keep the later one (more recent) | ||
| obs_events = [o for o in observations if isinstance(o, ObservationEvent)] | ||
| error_events = [o for o in observations if isinstance(o, AgentErrorEvent)] | ||
| other_events = [ | ||
| o | ||
| for o in observations | ||
| if not isinstance(o, (ObservationEvent, AgentErrorEvent)) | ||
| ] | ||
|
|
||
| # Determine which one to keep | ||
| if obs_events: | ||
| # Keep the last ObservationEvent, remove all others | ||
| to_keep = obs_events[-1] | ||
| elif other_events: | ||
| # Keep the last "other" event (UserRejectObservation, etc.) | ||
| to_keep = other_events[-1] | ||
| else: | ||
| # Only AgentErrorEvents, keep the last one | ||
| to_keep = error_events[-1] | ||
|
|
||
| # Mark all others for removal | ||
| for obs in observations: | ||
| if obs.id != to_keep.id: | ||
| events_to_remove.add(obs.id) | ||
|
|
||
| return events_to_remove | ||
|
|
||
| def manipulation_indices( | ||
| self, | ||
| current_view_events: list[LLMConvertibleEvent], | ||
| ) -> ManipulationIndices: | ||
| """Calculate manipulation indices for tool result uniqueness. | ||
|
|
||
| This property doesn't restrict manipulation - it only enforces | ||
| uniqueness when violations are detected. | ||
| """ | ||
| return ManipulationIndices.complete(current_view_events) | ||
219 changes: 219 additions & 0 deletions
219
tests/sdk/context/view/properties/test_tool_result_uniqueness.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,219 @@ | ||
| """Tests for ToolResultUniquenessProperty. | ||
|
|
||
| This module tests that duplicate tool results for the same tool_call_id | ||
| are properly deduplicated, preferring ObservationEvent over AgentErrorEvent. | ||
| """ | ||
|
|
||
| from unittest.mock import create_autospec | ||
|
|
||
| from openhands.sdk.context.view.properties.tool_result_uniqueness import ( | ||
| ToolResultUniquenessProperty, | ||
| ) | ||
| from openhands.sdk.event.base import LLMConvertibleEvent | ||
| from openhands.sdk.event.llm_convertible import ( | ||
| ActionEvent, | ||
| AgentErrorEvent, | ||
| ObservationEvent, | ||
| UserRejectObservation, | ||
| ) | ||
| from tests.sdk.context.view.properties.conftest import message_event | ||
|
|
||
|
|
||
| class TestToolResultUniquenessPropertyEnforcement: | ||
| """Tests for the enforce method of ToolResultUniquenessProperty.""" | ||
|
|
||
| def setup_method(self) -> None: | ||
| """Set up test fixtures.""" | ||
| self.property = ToolResultUniquenessProperty() | ||
|
|
||
| def test_empty_list(self) -> None: | ||
| """Test enforce with empty event list.""" | ||
| result = self.property.enforce([], []) | ||
| assert result == set() | ||
|
|
||
| def test_no_duplicates(self) -> None: | ||
| """Test enforce when there are no duplicate tool_call_ids.""" | ||
| obs1 = create_autospec(ObservationEvent, instance=True) | ||
| obs1.tool_call_id = "call_1" | ||
| obs1.id = "obs_1" | ||
|
|
||
| obs2 = create_autospec(ObservationEvent, instance=True) | ||
| obs2.tool_call_id = "call_2" | ||
| obs2.id = "obs_2" | ||
|
|
||
| events: list[LLMConvertibleEvent] = [ | ||
| message_event("Start"), | ||
| obs1, | ||
| obs2, | ||
| message_event("End"), | ||
| ] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| assert result == set() | ||
xingyaoww marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| def test_duplicate_observation_events(self) -> None: | ||
| """Test that duplicate ObservationEvents keep the later one.""" | ||
| obs1 = create_autospec(ObservationEvent, instance=True) | ||
| obs1.tool_call_id = "call_1" | ||
| obs1.id = "obs_1" | ||
|
|
||
| obs2 = create_autospec(ObservationEvent, instance=True) | ||
| obs2.tool_call_id = "call_1" # Same tool_call_id! | ||
| obs2.id = "obs_2" | ||
|
|
||
| events: list[LLMConvertibleEvent] = [obs1, obs2] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| # obs1 should be removed, obs2 (later) should be kept | ||
| assert result == {"obs_1"} | ||
|
|
||
| def test_observation_event_preferred_over_agent_error(self) -> None: | ||
| """Test that ObservationEvent is preferred over AgentErrorEvent.""" | ||
| # This is the main bug scenario: AgentErrorEvent created on restart, | ||
| # then ObservationEvent arrives later with actual result | ||
| agent_error = AgentErrorEvent( | ||
| tool_name="terminal", | ||
| tool_call_id="call_1", | ||
| error="Restart occurred while tool was running", | ||
| ) | ||
|
|
||
| obs = create_autospec(ObservationEvent, instance=True) | ||
| obs.tool_call_id = "call_1" # Same tool_call_id! | ||
| obs.id = "obs_1" | ||
|
|
||
| events: list[LLMConvertibleEvent] = [agent_error, obs] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| # AgentErrorEvent should be removed, ObservationEvent kept | ||
| assert result == {agent_error.id} | ||
|
|
||
| def test_agent_error_before_observation_event(self) -> None: | ||
| """Test AgentErrorEvent followed by ObservationEvent (restart scenario).""" | ||
| # Simulates: restart creates AgentErrorEvent, then actual result arrives | ||
| action = create_autospec(ActionEvent, instance=True) | ||
| action.tool_call_id = "call_1" | ||
| action.id = "action_1" | ||
| action.llm_response_id = "response_1" | ||
|
|
||
| agent_error = AgentErrorEvent( | ||
| tool_name="terminal", | ||
| tool_call_id="call_1", | ||
| error="A restart occurred while this tool was in progress.", | ||
| ) | ||
|
|
||
| obs = create_autospec(ObservationEvent, instance=True) | ||
| obs.tool_call_id = "call_1" | ||
| obs.id = "obs_1" | ||
|
|
||
| events: list[LLMConvertibleEvent] = [ | ||
| message_event("User message"), | ||
| action, | ||
| agent_error, | ||
| obs, # Actual result arrives later | ||
| ] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| # AgentErrorEvent should be removed since we have actual ObservationEvent | ||
| assert result == {agent_error.id} | ||
|
|
||
| def test_multiple_agent_errors_keep_last(self) -> None: | ||
| """Test that when only AgentErrorEvents exist, the last one is kept.""" | ||
| error1 = AgentErrorEvent( | ||
| tool_name="terminal", | ||
| tool_call_id="call_1", | ||
| error="First error", | ||
| ) | ||
|
|
||
| error2 = AgentErrorEvent( | ||
| tool_name="terminal", | ||
| tool_call_id="call_1", | ||
| error="Second error", | ||
| ) | ||
|
|
||
| events: list[LLMConvertibleEvent] = [error1, error2] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| # First error should be removed, second (later) should be kept | ||
| assert result == {error1.id} | ||
|
|
||
| def test_user_reject_observation_handling(self) -> None: | ||
| """Test that UserRejectObservation is handled correctly.""" | ||
| reject = UserRejectObservation( | ||
| tool_name="terminal", | ||
| tool_call_id="call_1", | ||
| action_id="action_1", | ||
| rejection_reason="User rejected", | ||
| ) | ||
|
|
||
| obs = create_autospec(ObservationEvent, instance=True) | ||
| obs.tool_call_id = "call_1" | ||
| obs.id = "obs_1" | ||
|
|
||
| events: list[LLMConvertibleEvent] = [reject, obs] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| # ObservationEvent is preferred over UserRejectObservation | ||
| assert result == {reject.id} | ||
|
|
||
| def test_mixed_scenario_multiple_tool_calls(self) -> None: | ||
| """Test with multiple tool calls, some with duplicates.""" | ||
| # Tool call 1: has duplicate (error + observation) | ||
| error1 = AgentErrorEvent( | ||
| tool_name="terminal", | ||
| tool_call_id="call_1", | ||
| error="Restart error", | ||
| ) | ||
| obs1 = create_autospec(ObservationEvent, instance=True) | ||
| obs1.tool_call_id = "call_1" | ||
| obs1.id = "obs_1" | ||
|
|
||
| # Tool call 2: single observation (no duplicate) | ||
| obs2 = create_autospec(ObservationEvent, instance=True) | ||
| obs2.tool_call_id = "call_2" | ||
| obs2.id = "obs_2" | ||
|
|
||
| # Tool call 3: single error (no duplicate) | ||
| error3 = AgentErrorEvent( | ||
| tool_name="file_editor", | ||
| tool_call_id="call_3", | ||
| error="Tool not found", | ||
| ) | ||
|
|
||
| events: list[LLMConvertibleEvent] = [ | ||
| message_event("Start"), | ||
| error1, | ||
| obs1, | ||
| obs2, | ||
| error3, | ||
| ] | ||
|
|
||
| result = self.property.enforce(events, events) | ||
| # Only error1 should be removed (duplicate with obs1) | ||
| assert result == {error1.id} | ||
|
|
||
|
|
||
| class TestToolResultUniquenessPropertyManipulationIndices: | ||
| """Tests for the manipulation_indices method.""" | ||
|
|
||
| def setup_method(self) -> None: | ||
| """Set up test fixtures.""" | ||
| self.property = ToolResultUniquenessProperty() | ||
|
|
||
| def test_complete_indices_returned(self) -> None: | ||
| """Test that manipulation indices are complete (no restrictions).""" | ||
| obs = create_autospec(ObservationEvent, instance=True) | ||
| obs.tool_call_id = "call_1" | ||
| obs.id = "obs_1" | ||
|
|
||
| events: list[LLMConvertibleEvent] = [ | ||
| message_event("Start"), | ||
| obs, | ||
| message_event("End"), | ||
| ] | ||
|
|
||
| result = self.property.manipulation_indices(events) | ||
| # Should have indices 0, 1, 2, 3 (all positions) | ||
| assert 0 in result | ||
| assert 1 in result | ||
| assert 2 in result | ||
| assert 3 in result | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.