feat(acp): add retry logic for transient connection errors

simonrosenberg · claude · simonrosenberg · commit d0127c4c5e35 · 2026-03-11T16:05:51.000-03:00
Add automatic retry for ACP prompt failures caused by transient
connection errors (network blips, server restarts, connection resets).

Changes:
- Add _is_retriable_connection_error() to classify errors as retriable
  vs non-retriable (e.g., policy violations won't be retried)
- Wrap prompt() call in retry loop with increasing backoff delays (5s, 15s, 30s)
- Default to 3 retries (configurable via ACP_PROMPT_MAX_RETRIES env var)
- Reset client accumulators between retries to avoid stale state
- Timeout errors are NOT retried (handled separately)
- Non-retriable errors (usage policy, content policy) fail immediately

This preserves session state when connection errors occur, avoiding
the need to restart instances from scratch in the evaluation framework.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/openhands-sdk/openhands/sdk/agent/acp_agent.py b/openhands-sdk/openhands/sdk/agent/acp_agent.py
@@ -78,6 +78,51 @@
     os.environ.get("ACP_NOTIFICATION_DRAIN_DELAY", "0.1")
 )
 
+# Retry configuration for transient ACP connection errors.
+# These errors can occur when the connection drops mid-conversation but the
+# session state is still valid on the server side.
+_ACP_PROMPT_MAX_RETRIES: int = int(os.environ.get("ACP_PROMPT_MAX_RETRIES", "3"))
+_ACP_PROMPT_RETRY_DELAYS: tuple[float, ...] = (5.0, 15.0, 30.0)  # seconds
+
+
+def _is_retriable_connection_error(error: Exception) -> bool:
+    """Return True if ``error`` looks like a transient connection failure.
+
+    Non-retriable conditions take precedence over the broad ``OSError`` type
+    check: timeouts (handled by the caller), permission errors, and messages
+    signalling policy/auth/validation problems are never retried.
+    """
+    # TimeoutError and PermissionError subclass OSError; rule them out first.
+    if isinstance(error, (TimeoutError, PermissionError)):
+        return False
+
+    error_str = str(error).lower()
+    non_retriable_patterns = (
+        "usage policy",
+        "content policy",
+        "permission denied",
+        "unauthorized",
+        "invalid",
+    )
+    if any(pattern in error_str for pattern in non_retriable_patterns):
+        return False
+
+    # ConnectionError and BrokenPipeError are already OSError subclasses.
+    if isinstance(error, (OSError, EOFError)):
+        return True
+
+    retriable_patterns = (
+        "connection closed",
+        "connection reset",
+        "connection refused",
+        "broken pipe",
+        "server disconnected",
+        "eof",
+        "network",
+        "transport",
+    )
+    return any(pattern in error_str for pattern in retriable_patterns)
+
 # Limit for asyncio.StreamReader buffers used by the ACP subprocess pipes.
 # The default (64 KiB) is too small for session_update notifications that
 # carry large tool-call outputs (e.g. file contents, test results).  When
@@ -704,15 +749,48 @@ async def _prompt() -> PromptResponse:
                 await _drain_notifications()
                 return response
 
-            # Send prompt to ACP server (with timeout to prevent indefinite hangs)
+            # Send prompt to ACP server with retry logic for connection errors.
+            # Transient connection failures (network blips, server restarts) are
+            # retried to preserve session state and avoid losing progress.
             logger.info(
                 "Sending ACP prompt (timeout=%.0fs, msg=%d chars)",
                 self.acp_prompt_timeout,
                 len(user_message),
             )
-            response = self._executor.run_async(
-                _prompt, timeout=self.acp_prompt_timeout
-            )
+
+            response: PromptResponse | None = None
+            max_retries = _ACP_PROMPT_MAX_RETRIES
+
+            for attempt in range(max_retries + 1):
+                try:
+                    response = self._executor.run_async(
+                        _prompt, timeout=self.acp_prompt_timeout
+                    )
+                    break  # Success, exit retry loop
+                except TimeoutError:
+                    # Timeout is handled separately below, don't retry
+                    raise
+                except Exception as e:
+                    if attempt < max_retries and _is_retriable_connection_error(e):
+                        delay = _ACP_PROMPT_RETRY_DELAYS[
+                            min(attempt, len(_ACP_PROMPT_RETRY_DELAYS) - 1)
+                        ]
+                        logger.warning(
+                            "ACP prompt failed with retriable error (attempt %d/%d), "
+                            "retrying in %.0fs: %s",
+                            attempt + 1,
+                            max_retries + 1,
+                            delay,
+                            e,
+                        )
+                        time.sleep(delay)
+                        # Reset accumulators for retry (partial state may be stale)
+                        self._client.reset()
+                        self._client.on_token = on_token
+                    else:
+                        # Non-retriable error or max retries exceeded
+                        raise
+
             elapsed = time.monotonic() - t0
             logger.info("ACP prompt returned in %.1fs", elapsed)
 
diff --git a/tests/sdk/agent/test_acp_agent.py b/tests/sdk/agent/test_acp_agent.py
@@ -12,6 +12,7 @@
 from openhands.sdk.agent.acp_agent import (
     ACPAgent,
     _OpenHandsACPBridge,
+    _is_retriable_connection_error,
     _resolve_bypass_mode,
     _select_auth_method,
 )
@@ -1387,3 +1388,194 @@ def test_serialization_roundtrip(self):
         restored = AgentBase.model_validate_json(dumped)
         assert isinstance(restored, ACPAgent)
         assert restored.acp_session_mode == "full-access"
+
+
+# ---------------------------------------------------------------------------
+# Connection retry logic
+# ---------------------------------------------------------------------------
+
+
+class TestIsRetriableConnectionError:
+    """Check type-based, message-based, and non-retriable error classification."""
+
+    def test_oserror_is_retriable(self):
+        assert _is_retriable_connection_error(OSError("Connection reset"))
+
+    def test_connection_error_is_retriable(self):
+        assert _is_retriable_connection_error(ConnectionError("Connection refused"))
+
+    def test_broken_pipe_is_retriable(self):
+        assert _is_retriable_connection_error(BrokenPipeError("Broken pipe"))
+
+    def test_eof_error_is_retriable(self):
+        assert _is_retriable_connection_error(EOFError("Unexpected EOF"))
+
+    def test_connection_closed_message_is_retriable(self):
+        assert _is_retriable_connection_error(RuntimeError("connection closed by peer"))
+
+    def test_server_disconnected_is_retriable(self):
+        assert _is_retriable_connection_error(Exception("server disconnected"))
+
+    def test_usage_policy_not_retriable(self):
+        assert not _is_retriable_connection_error(
+            RuntimeError("Usage policy violation")
+        )
+
+    def test_content_policy_not_retriable(self):
+        assert not _is_retriable_connection_error(
+            RuntimeError("Content policy blocked")
+        )
+
+    def test_permission_denied_not_retriable(self):
+        assert not _is_retriable_connection_error(
+            RuntimeError("Permission denied for operation")
+        )
+
+    def test_generic_error_not_retriable(self):
+        # No retriable type and no connection-related message pattern: must not retry
+        assert not _is_retriable_connection_error(RuntimeError("Something went wrong"))
+
+
+class TestACPPromptRetry:
+    """Test agent.step() retry handling by stubbing the executor's run_async."""
+
+    def _make_conversation_with_message(self, tmp_path, text="Hello"):
+        """Return a MagicMock conversation with a system prompt and a user message."""
+        state = _make_state(tmp_path)
+        state.events.append(
+            SystemPromptEvent(
+                source="agent",
+                system_prompt=TextContent(text="ACP-managed agent"),
+                tools=[],
+            )
+        )
+        state.events.append(
+            MessageEvent(
+                source="user",
+                llm_message=Message(role="user", content=[TextContent(text=text)]),
+            )
+        )
+
+        conversation = MagicMock()
+        conversation.state = state
+        return conversation
+
+    def test_retry_on_connection_error_then_success(self, tmp_path):
+        """Retry succeeds after transient connection error."""
+        agent = _make_agent()
+        conversation = self._make_conversation_with_message(tmp_path)
+        events: list = []
+
+        mock_client = _OpenHandsACPBridge()
+        agent._client = mock_client
+        agent._conn = MagicMock()
+        agent._session_id = "test-session"
+
+        call_count = 0
+
+        def _fake_run_async(_coro, **_kwargs):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise ConnectionError("Connection reset by peer")
+            # Second call succeeds - must populate text and return a response
+            mock_client.accumulated_text.append("Success after retry")
+            # Return a mock PromptResponse (can be MagicMock since we only check usage)
+            return MagicMock(usage=None)
+
+        mock_executor = MagicMock()
+        mock_executor.run_async = _fake_run_async
+        agent._executor = mock_executor
+
+        # Patch sleep to avoid actual delays in tests
+        with patch("openhands.sdk.agent.acp_agent.time.sleep"):
+            agent.step(conversation, on_event=events.append)
+
+        assert call_count == 2  # First failed, second succeeded
+        assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED
+        assert len(events) == 3  # MessageEvent, ActionEvent, ObservationEvent
+        assert "Success after retry" in events[0].llm_message.content[0].text
+
+    def test_no_retry_on_non_retriable_error(self, tmp_path):
+        """Non-retriable errors fail immediately without retry."""
+        agent = _make_agent()
+        conversation = self._make_conversation_with_message(tmp_path)
+        events: list = []
+
+        mock_client = _OpenHandsACPBridge()
+        agent._client = mock_client
+        agent._conn = MagicMock()
+        agent._session_id = "test-session"
+
+        call_count = 0
+
+        def _fake_run_async(_coro, **_kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise RuntimeError("Usage policy violation")
+
+        mock_executor = MagicMock()
+        mock_executor.run_async = _fake_run_async
+        agent._executor = mock_executor
+
+        with pytest.raises(RuntimeError, match="Usage policy violation"):
+            agent.step(conversation, on_event=events.append)
+
+        assert call_count == 1  # No retry attempted
+        assert conversation.state.execution_status == ConversationExecutionStatus.ERROR
+
+    def test_no_retry_on_timeout(self, tmp_path):
+        """Timeout errors are not retried (handled separately)."""
+        agent = _make_agent()
+        conversation = self._make_conversation_with_message(tmp_path)
+
+        mock_client = _OpenHandsACPBridge()
+        agent._client = mock_client
+        agent._conn = MagicMock()
+        agent._session_id = "test-session"
+
+        call_count = 0
+
+        def _fake_run_async(_coro, **_kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise TimeoutError("ACP prompt timed out")
+
+        mock_executor = MagicMock()
+        mock_executor.run_async = _fake_run_async
+        agent._executor = mock_executor
+
+        agent.step(conversation, on_event=lambda _: None)
+
+        assert call_count == 1  # No retry for timeout
+        assert conversation.state.execution_status == ConversationExecutionStatus.ERROR
+
+    def test_max_retries_exceeded(self, tmp_path):
+        """Error raised after max retries exhausted."""
+        agent = _make_agent()
+        conversation = self._make_conversation_with_message(tmp_path)
+        events: list = []
+
+        mock_client = _OpenHandsACPBridge()
+        agent._client = mock_client
+        agent._conn = MagicMock()
+        agent._session_id = "test-session"
+
+        call_count = 0
+
+        def _fake_run_async(_coro, **_kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise ConnectionError("Persistent connection failure")
+
+        mock_executor = MagicMock()
+        mock_executor.run_async = _fake_run_async
+        agent._executor = mock_executor
+
+        with patch("openhands.sdk.agent.acp_agent.time.sleep"):
+            with pytest.raises(ConnectionError, match="Persistent connection failure"):
+                agent.step(conversation, on_event=events.append)
+
+        # 1 initial + 3 retries; assumes ACP_PROMPT_MAX_RETRIES is unset (default 3)
+        assert call_count == 4
+        assert conversation.state.execution_status == ConversationExecutionStatus.ERROR