fix: improve conversation resilience for long-running and resumed sessions (#2384)

xingyaoww · openhands-agent · web-flow · commit 46f3d78e2b24 · 2026-03-12T02:28:00.000+08:00
Co-authored-by: openhands <openhands@all-hands.dev>
diff --git a/openhands-sdk/openhands/sdk/agent/agent.py b/openhands-sdk/openhands/sdk/agent/agent.py
@@ -10,6 +10,7 @@
     fix_malformed_tool_arguments,
     make_llm_completion,
     prepare_llm_messages,
+    sanitize_json_control_chars,
 )
 from openhands.sdk.conversation import (
     ConversationCallbackType,
@@ -574,7 +575,10 @@ def _get_action_event(
         # Validate arguments
         security_risk: risk.SecurityRisk = risk.SecurityRisk.UNKNOWN
         try:
-            arguments = json.loads(tool_call.arguments)
+            # Sanitize raw control characters (U+0000–U+001F) that some
+            # models emit as literal bytes instead of JSON escape sequences.
+            sanitized_args = sanitize_json_control_chars(tool_call.arguments)
+            arguments = json.loads(sanitized_args)
 
             # Fix malformed arguments (e.g., JSON strings for list/dict fields)
             arguments = fix_malformed_tool_arguments(arguments, tool.action_type)
diff --git a/openhands-sdk/openhands/sdk/agent/utils.py b/openhands-sdk/openhands/sdk/agent/utils.py
@@ -1,4 +1,5 @@
 import json
+import re
 import types
 from collections.abc import Sequence
 from typing import (
@@ -19,6 +20,61 @@
 from openhands.sdk.tool import Action, ToolDefinition
 
 
+# Regex matching raw ASCII control characters (U+0000–U+001F) that are
+# illegal inside JSON strings per RFC 8259 §7.
+_CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f]")
+
+# Regex matching one complete JSON string literal, quotes included.  A
+# backslash always consumes the next character (DOTALL lets that be a raw
+# newline), so an escaped quote never ends the literal early.
+_JSON_STRING_RE = re.compile(r'"[^"\\]*(?:\\.[^"\\]*)*"', re.DOTALL)
+
+# Mapping from raw control-char ordinals to their JSON-legal two-character
+# escape sequences.  Characters without a short alias fall back to \uXXXX.
+_CTRL_ESCAPE_TABLE: dict[int, str] = {
+    0x08: "\\b",
+    0x09: "\\t",
+    0x0A: "\\n",
+    0x0C: "\\f",
+    0x0D: "\\r",
+}
+
+
+def _escape_control_char(m: re.Match[str]) -> str:
+    """Replace a single raw control character with its JSON escape."""
+    ch = m.group(0)
+    return _CTRL_ESCAPE_TABLE.get(ord(ch), f"\\u{ord(ch):04x}")
+
+
+def _escape_string_literal(m: re.Match[str]) -> str:
+    """Escape every raw control character inside one JSON string literal."""
+    return _CONTROL_CHAR_RE.sub(_escape_control_char, m.group(0))
+
+
+def sanitize_json_control_chars(raw: str) -> str:
+    """Escape raw control characters in a JSON string produced by an LLM.
+
+    Some models (e.g. kimi-k2.5, minimax-m2.5) emit literal control
+    characters (newline, tab, …) inside ``tool_call.arguments`` instead of
+    their proper two-character JSON escape sequences (``\\n``, ``\\t``, …).
+    ``json.loads`` rejects these per RFC 8259.
+
+    Only characters inside double-quoted string literals are rewritten.
+    Whitespace between tokens (e.g. the newlines and tabs of pretty-printed
+    arguments) is already legal JSON; escaping it would make valid input
+    unparseable, so it is left untouched.
+
+    Args:
+        raw: JSON text as emitted by the model; may be malformed.
+
+    Returns:
+        ``raw`` with each U+0000–U+001F character inside a string literal
+        replaced by its JSON escape sequence.  Everything outside string
+        literals, and any unterminated trailing literal, is unchanged.
+    """
+    return _JSON_STRING_RE.sub(_escape_string_literal, raw)
+
+
 def fix_malformed_tool_arguments(
     arguments: dict[str, Any], action_type: type[Action]
 ) -> dict[str, Any]:
diff --git a/openhands-sdk/openhands/sdk/conversation/event_store.py b/openhands-sdk/openhands/sdk/conversation/event_store.py
@@ -79,9 +79,19 @@ def _get_single_item(self, idx: SupportsIndex) -> Event:
             i += self._length
         if i < 0 or i >= self._length:
             raise IndexError("Event index out of range")
-        txt = self._fs.read(self._path(i))
+        try:
+            path = self._path(i)
+        except KeyError:
+            # In-memory index is stale (e.g., external file modifications
+            # or concurrent writes).  Rebuild from disk and retry once.
+            logger.warning("Stale EventLog index at %d; rebuilding from disk.", i)
+            self._length = self._scan_and_build_index()
+            if i >= self._length:
+                raise IndexError("Event index out of range")
+            path = self._path(i)
+        txt = self._fs.read(path)
         if not txt:
-            raise FileNotFoundError(f"Missing event file: {self._path(i)}")
+            raise FileNotFoundError(f"Missing event file: {path}")
         return Event.model_validate_json(txt)
 
     def __iter__(self) -> Iterator[Event]:
diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
@@ -526,10 +526,13 @@ def send_message(self, message: str | Message, sender: str | None = None) -> Non
             "Only user messages are allowed to be sent to the agent."
         )
         with self._state:
-            if self._state.execution_status == ConversationExecutionStatus.FINISHED:
+            if self._state.execution_status in (
+                ConversationExecutionStatus.FINISHED,
+                ConversationExecutionStatus.STUCK,
+            ):
                 self._state.execution_status = (
                     ConversationExecutionStatus.IDLE
-                )  # now we have a new message
+                )  # new message resets terminal states
 
             # TODO: We should add test cases for all these scenarios
             activated_skill_names: list[str] = []
@@ -584,6 +587,7 @@ def run(self) -> None:
                 ConversationExecutionStatus.IDLE,
                 ConversationExecutionStatus.PAUSED,
                 ConversationExecutionStatus.ERROR,
+                ConversationExecutionStatus.STUCK,
             ]:
                 self._state.execution_status = ConversationExecutionStatus.RUNNING
 
diff --git a/tests/sdk/agent/test_sanitize_json_control_chars.py b/tests/sdk/agent/test_sanitize_json_control_chars.py
@@ -0,0 +1,86 @@
+"""Tests for sanitize_json_control_chars helper function.
+
+This module tests the sanitize_json_control_chars helper that escapes raw
+control characters (U+0000–U+001F) in JSON strings produced by LLMs.  Some
+models (e.g. kimi-k2.5, minimax-m2.5) emit literal control bytes instead of
+legal two-character JSON escape sequences, which causes json.loads() to fail.
+"""
+
+import json
+
+from openhands.sdk.agent.utils import sanitize_json_control_chars
+
+
+def test_valid_json_unchanged():
+    """Already-valid JSON is returned unmodified."""
+    raw = '{"command": "echo hello", "path": "/tmp"}'
+    assert sanitize_json_control_chars(raw) == raw
+
+
+def test_literal_newline_escaped():
+    """A raw 0x0A byte inside a JSON string is replaced with \\n."""
+    raw = '{"command": "line1\nline2"}'
+    sanitized = sanitize_json_control_chars(raw)
+    assert "\n" not in sanitized
+    parsed = json.loads(sanitized)
+    assert parsed["command"] == "line1\nline2"
+
+
+def test_literal_tab_escaped():
+    """A raw 0x09 byte inside a JSON string is replaced with \\t."""
+    raw = '{"indent": "col1\tcol2"}'
+    sanitized = sanitize_json_control_chars(raw)
+    assert "\t" not in sanitized
+    parsed = json.loads(sanitized)
+    assert parsed["indent"] == "col1\tcol2"
+
+
+def test_multiple_control_chars():
+    """Multiple different control characters are all escaped."""
+    raw = '{"text": "a\tb\nc\rd"}'
+    sanitized = sanitize_json_control_chars(raw)
+    parsed = json.loads(sanitized)
+    assert parsed["text"] == "a\tb\nc\rd"
+
+
+def test_null_byte_escaped():
+    """A raw NUL (0x00) byte is escaped to \\u0000."""
+    raw = '{"data": "before\x00after"}'
+    sanitized = sanitize_json_control_chars(raw)
+    assert "\\u0000" in sanitized
+    parsed = json.loads(sanitized)
+    assert parsed["data"] == "before\x00after"
+
+
+def test_form_feed_and_backspace():
+    """Form-feed and backspace get their short escape aliases."""
+    raw = '{"x": "a\x08b\x0cc"}'
+    sanitized = sanitize_json_control_chars(raw)
+    assert "\\b" in sanitized
+    assert "\\f" in sanitized
+    parsed = json.loads(sanitized)
+    assert parsed["x"] == "a\x08b\x0cc"
+
+
+def test_already_escaped_sequences_preserved():
+    """Properly escaped sequences (\\n, \\t) are NOT double-escaped."""
+    raw = r'{"command": "echo \"hello\\nworld\""}'
+    sanitized = sanitize_json_control_chars(raw)
+    # Already-valid escape sequences should parse correctly
+    parsed = json.loads(sanitized)
+    assert "hello\\nworld" in parsed["command"]
+
+
+def test_empty_string():
+    """Empty input returns empty output."""
+    assert sanitize_json_control_chars("") == ""
+
+
+def test_realistic_tool_call_arguments():
+    """Simulates a realistic malformed tool_call.arguments from an LLM."""
+    # The LLM emitted a backslash plus a literal newline inside the "command" value
+    raw = '{"command": "cd /workspace && \\\npython test.py", "path": "/workspace"}'
+    sanitized = sanitize_json_control_chars(raw)
+    parsed = json.loads(sanitized)
+    assert "python test.py" in parsed["command"]
+    assert parsed["path"] == "/workspace"
diff --git a/tests/sdk/conversation/local/test_agent_status_transition.py b/tests/sdk/conversation/local/test_agent_status_transition.py
@@ -12,7 +12,9 @@
 - PAUSED -> RUNNING (when run() is called after pause)
 - WAITING_FOR_CONFIRMATION -> RUNNING (when run() is called to confirm)
 - FINISHED -> IDLE -> RUNNING (when new message sent after completion)
-- FINISHED/STUCK -> remain unchanged (run() exits immediately)
+- STUCK -> IDLE (when new message sent) -> RUNNING (when run() is called)
+- STUCK -> RUNNING (when run() is called directly)
+- FINISHED -> remain unchanged (run() exits immediately without new message)
 """
 
 import threading
@@ -357,23 +359,60 @@ def test_run_exits_immediately_when_already_finished():
     assert llm.call_count == initial_call_count
 
 
-def test_run_exits_immediately_when_stuck():
-    """Test that run() exits immediately when status is STUCK."""
-    # Use TestLLM with no scripted responses (should not be called)
-    llm = TestLLM.from_messages([])
+def test_run_recovers_from_stuck():
+    """Test that run() resets STUCK status and lets the agent continue.
+
+    When a conversation is STUCK (e.g. stuck detector triggered or
+    persisted STUCK state from a previous session), calling run() should
+    reset the status to RUNNING so the agent can retry.  Without this
+    reset, a persisted STUCK state would permanently kill the session.
+    """
+    # Provide a finish response so the agent can complete after unsticking.
+    llm = TestLLM.from_messages(
+        [Message(role="assistant", content=[TextContent(text="Recovered")])]
+    )
     agent = Agent(llm=llm, tools=[])
     conversation = Conversation(agent=agent)
 
-    # Manually set status to STUCK (simulating stuck detection)
+    # Seed a user message so the agent has context to work with
+    conversation.send_message(
+        Message(role="user", content=[TextContent(text="Please continue")])
+    )
+
+    # Simulate stuck detection persisted from previous session
     conversation._state.execution_status = ConversationExecutionStatus.STUCK
 
-    # Call run - should exit immediately
     conversation.run()
 
-    # Status should still be STUCK
-    assert conversation.state.execution_status == ConversationExecutionStatus.STUCK
-    # LLM should not be called
-    assert llm.call_count == 0
+    # Agent should have recovered and finished normally
+    assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED
+    assert llm.call_count == 1
+
+
+def test_send_message_resets_stuck_to_idle():
+    """Test STUCK → IDLE transition when a new user message arrives.
+
+    A new user message is an implicit signal to unstick the conversation,
+    analogous to how FINISHED → IDLE works.
+    """
+    llm = TestLLM.from_messages(
+        [Message(role="assistant", content=[TextContent(text="Done")])]
+    )
+    agent = Agent(llm=llm, tools=[])
+    conversation = Conversation(agent=agent)
+
+    # Simulate stuck state
+    conversation._state.execution_status = ConversationExecutionStatus.STUCK
+
+    # Sending a new message should reset STUCK → IDLE
+    conversation.send_message(
+        Message(role="user", content=[TextContent(text="Try again")])
+    )
+    assert conversation.state.execution_status == ConversationExecutionStatus.IDLE
+
+    # Running should proceed normally: IDLE → RUNNING → FINISHED
+    conversation.run()
+    assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED
 
 
 def test_execution_status_error_on_max_iterations():
diff --git a/tests/sdk/conversation/test_event_store.py b/tests/sdk/conversation/test_event_store.py
@@ -402,3 +402,42 @@ def test_event_log_concurrent_writes_serialized():
 
         files = [f for f in fs.list("events") if not f.endswith(".lock")]
         assert len(files) == 2
+
+
+def test_get_single_item_recovers_from_stale_index():
+    """_get_single_item rebuilds the index when _idx_to_id is stale."""
+    fs = InMemoryFileStore()
+    log = EventLog(fs)
+
+    # Use UUID-like IDs to match EVENT_NAME_RE pattern
+    evt_id = "00000000-0000-0000-0000-000000000001"
+    event = create_test_event(evt_id, "Should recover")
+    log.append(event)
+    assert log[0].id == evt_id
+
+    # Simulate a stale in-memory index (e.g., external file modification)
+    log._idx_to_id.clear()
+    log._id_to_idx.clear()
+
+    # Access should rebuild the index transparently and succeed
+    recovered = log[0]
+    assert recovered.id == evt_id
+
+
+def test_get_single_item_stale_index_out_of_range():
+    """After index rebuild, raise IndexError if the index no longer exists."""
+    fs = InMemoryFileStore()
+    log = EventLog(fs)
+
+    evt_id = "00000000-0000-0000-0000-000000000002"
+    event = create_test_event(evt_id, "Only one")
+    log.append(event)
+
+    # Clear index AND artificially inflate length to simulate stale state
+    log._idx_to_id.clear()
+    log._id_to_idx.clear()
+    log._length = 5  # pretend there are 5 events
+
+    # Index 3 doesn't exist on disk; should raise IndexError after rebuild
+    with pytest.raises(IndexError, match="Event index out of range"):
+        log[3]