feat(acp): add retry logic for transient connection errors

simonrosenberg · claude · simonrosenberg · commit bda77a61551c · 2026-03-11T16:13:17.000-03:00
Add automatic retry for ACP prompt failures caused by transient
connection errors (OSError, ConnectionError, BrokenPipeError, EOFError).

Changes:
- Wrap prompt() call in retry loop for connection exception types
- Retry up to 3 times with a fixed, increasing backoff schedule (5s, 15s, 30s)
- Configurable via ACP_PROMPT_MAX_RETRIES env var
- Reset client accumulators between retries
- Timeout errors are NOT retried (handled separately)

This preserves session state when connection errors occur, avoiding
the need to restart instances from scratch in the evaluation framework.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/openhands-sdk/openhands/sdk/agent/acp_agent.py b/openhands-sdk/openhands/sdk/agent/acp_agent.py
@@ -78,6 +78,15 @@
     os.environ.get("ACP_NOTIFICATION_DRAIN_DELAY", "0.1")
 )
 
+# Retry budget for transient ACP connection errors (session state survives the
+# drop server-side); negative values clamp to 0 so the prompt is always sent.
+_ACP_PROMPT_MAX_RETRIES: int = max(
+    0, int(os.environ.get("ACP_PROMPT_MAX_RETRIES", "3"))
+)
+_ACP_PROMPT_RETRY_DELAYS: tuple[float, ...] = (5.0, 15.0, 30.0)  # seconds
+# Transient errors worth retrying (TimeoutError is an OSError: catch it first).
+_RETRIABLE_CONNECTION_ERRORS = (OSError, ConnectionError, BrokenPipeError, EOFError)
+
 # Limit for asyncio.StreamReader buffers used by the ACP subprocess pipes.
 # The default (64 KiB) is too small for session_update notifications that
 # carry large tool-call outputs (e.g. file contents, test results).  When
@@ -704,15 +713,48 @@ async def _prompt() -> PromptResponse:
                 await _drain_notifications()
                 return response
 
-            # Send prompt to ACP server (with timeout to prevent indefinite hangs)
+            # Send prompt to ACP server with retry logic for connection errors.
+            # Transient connection failures (network blips, server restarts) are
+            # retried to preserve session state and avoid losing progress.
             logger.info(
                 "Sending ACP prompt (timeout=%.0fs, msg=%d chars)",
                 self.acp_prompt_timeout,
                 len(user_message),
             )
-            response = self._executor.run_async(
-                _prompt, timeout=self.acp_prompt_timeout
-            )
+
+            response: PromptResponse | None = None
+            max_retries = _ACP_PROMPT_MAX_RETRIES
+
+            for attempt in range(max_retries + 1):
+                try:
+                    response = self._executor.run_async(
+                        _prompt, timeout=self.acp_prompt_timeout
+                    )
+                    break  # Success, exit retry loop
+                except TimeoutError:
+                    # Timeout is handled separately below, don't retry
+                    raise
+                except _RETRIABLE_CONNECTION_ERRORS as e:
+                    if attempt < max_retries:
+                        delay = _ACP_PROMPT_RETRY_DELAYS[
+                            min(attempt, len(_ACP_PROMPT_RETRY_DELAYS) - 1)
+                        ]
+                        logger.warning(
+                            "ACP prompt failed with retriable error (attempt %d/%d), "
+                            "retrying in %.0fs: %s",
+                            attempt + 1,
+                            max_retries + 1,
+                            delay,
+                            e,
+                        )
+                        time.sleep(delay)
+                        # Reset accumulators for retry (partial state may be stale)
+                        self._client.reset()
+                        self._client.on_token = on_token
+                    else:
+                        # Max retries exceeded
+                        raise
+
             elapsed = time.monotonic() - t0
             logger.info("ACP prompt returned in %.1fs", elapsed)
 
diff --git a/tests/sdk/agent/test_acp_agent.py b/tests/sdk/agent/test_acp_agent.py
@@ -1387,3 +1387,124 @@ def test_serialization_roundtrip(self):
         restored = AgentBase.model_validate_json(dumped)
         assert isinstance(restored, ACPAgent)
         assert restored.acp_session_mode == "full-access"
+
+
+# ---------------------------------------------------------------------------
+# Connection retry logic
+# ---------------------------------------------------------------------------
+
+
+class TestACPPromptRetry:
+    """Exercise the connection-error retry loop around the ACP prompt call."""
+
+    # Patch targets inside the module under test.
+    _SLEEP = "openhands.sdk.agent.acp_agent.time.sleep"
+    _MAX_RETRIES = "openhands.sdk.agent.acp_agent._ACP_PROMPT_MAX_RETRIES"
+
+    def _make_conversation_with_message(self, tmp_path, text="Hello"):
+        """Build a mock conversation whose state ends with one user message."""
+        state = _make_state(tmp_path)
+        state.events.append(
+            SystemPromptEvent(
+                source="agent",
+                system_prompt=TextContent(text="ACP-managed agent"),
+                tools=[],
+            )
+        )
+        state.events.append(
+            MessageEvent(
+                source="user",
+                llm_message=Message(role="user", content=[TextContent(text=text)]),
+            )
+        )
+
+        conversation = MagicMock()
+        conversation.state = state
+        return conversation
+
+    def _make_wired_agent(self, side_effect):
+        """Return an agent whose executor's ``run_async`` applies *side_effect*.
+
+        ``run_async`` is a ``MagicMock``, so tests read its ``call_count``
+        instead of counting attempts by hand.  *side_effect* is either an
+        exception instance raised on every call or a callable invoked per call.
+        """
+        agent = _make_agent()
+        agent._client = _OpenHandsACPBridge()
+        agent._conn = MagicMock()
+        agent._session_id = "test-session"
+
+        executor = MagicMock()
+        executor.run_async = MagicMock(side_effect=side_effect)
+        agent._executor = executor
+        return agent
+
+    def test_retry_on_connection_error_then_success(self, tmp_path):
+        """A single transient failure is retried and the step then finishes."""
+        conversation = self._make_conversation_with_message(tmp_path)
+        events: list = []
+        outcomes = iter([ConnectionError("Connection reset by peer"), None])
+
+        def _fake_run_async(_coro, **_kwargs):
+            failure = next(outcomes)
+            if failure is not None:
+                raise failure
+            # The successful attempt must populate text and return a response.
+            agent._client.accumulated_text.append("Success after retry")
+            # A MagicMock stands in for PromptResponse; only ``usage`` is checked.
+            return MagicMock(usage=None)
+
+        agent = self._make_wired_agent(_fake_run_async)
+
+        # Pin the retry budget (env-independent) and skip the real backoff delay.
+        with patch(self._MAX_RETRIES, 3), patch(self._SLEEP) as sleep:
+            agent.step(conversation, on_event=events.append)
+
+        assert agent._executor.run_async.call_count == 2  # failed, then succeeded
+        sleep.assert_any_call(5.0)  # first entry of the backoff schedule
+        status = conversation.state.execution_status
+        assert status == ConversationExecutionStatus.FINISHED
+        assert len(events) == 3  # MessageEvent, ActionEvent, ObservationEvent
+        assert "Success after retry" in events[0].llm_message.content[0].text
+
+    def test_no_retry_on_non_connection_error(self, tmp_path):
+        """Non-connection errors (e.g., RuntimeError) propagate without a retry."""
+        conversation = self._make_conversation_with_message(tmp_path)
+        events: list = []
+        agent = self._make_wired_agent(RuntimeError("Some application error"))
+
+        # Sleep is patched so a regression that retries cannot stall the suite.
+        with patch(self._SLEEP):
+            with pytest.raises(RuntimeError, match="Some application error"):
+                agent.step(conversation, on_event=events.append)
+
+        assert agent._executor.run_async.call_count == 1  # no retry attempted
+        assert conversation.state.execution_status == ConversationExecutionStatus.ERROR
+
+    def test_no_retry_on_timeout(self, tmp_path):
+        """Timeout errors are not retried (handled separately)."""
+        conversation = self._make_conversation_with_message(tmp_path)
+        agent = self._make_wired_agent(TimeoutError("ACP prompt timed out"))
+
+        with patch(self._SLEEP):
+            agent.step(conversation, on_event=lambda _: None)
+
+        assert agent._executor.run_async.call_count == 1  # no retry for timeout
+        assert conversation.state.execution_status == ConversationExecutionStatus.ERROR
+
+    def test_max_retries_exceeded(self, tmp_path):
+        """The connection error propagates once the retry budget is exhausted."""
+        conversation = self._make_conversation_with_message(tmp_path)
+        events: list = []
+        error = ConnectionError("Persistent connection failure")
+        agent = self._make_wired_agent(error)
+
+        with patch(self._MAX_RETRIES, 3), patch(self._SLEEP) as sleep:
+            with pytest.raises(ConnectionError, match="Persistent connection failure"):
+                agent.step(conversation, on_event=events.append)
+
+        # Budget pinned to 3 retries above: 4 attempts (1 initial + 3 retries).
+        assert agent._executor.run_async.call_count == 4
+        for delay in (5.0, 15.0, 30.0):
+            sleep.assert_any_call(delay)
+        assert conversation.state.execution_status == ConversationExecutionStatus.ERROR