test(sdk): reproduce delegate resume compatibility regression (#2382)

neubig · openhands-agent · VascoSch92 · web-flow · commit 472328d94653 · 2026-03-16T15:57:54.000+01:00
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Vasco Schiavo <115561717+VascoSch92@users.noreply.github.com>
diff --git a/openhands-sdk/openhands/sdk/agent/base.py b/openhands-sdk/openhands/sdk/agent/base.py
@@ -415,11 +415,11 @@ def verify(
 
         Compatibility requirements:
         - Agent class/type must match.
-        - Tools must match exactly (same tool names).
+        - Tools may only be added, never removed.
 
-        Tools are part of the system prompt and cannot be changed mid-conversation.
-        To use different tools, start a new conversation or use conversation forking
-        (see https://github.com/OpenHands/OpenHands/issues/8560).
+        Removing tools breaks backward compatibility because the LLM may have
+        already been told about them.  Adding new tools is safe — the LLM
+        simply gains new capabilities on the next turn.
 
         All other configuration (LLM, agent_context, condenser, etc.) can be
         freely changed between sessions.
@@ -457,24 +457,18 @@ def verify(
             if tool_class is not None:
                 persisted_names.add(tool_class.name)
 
-        if runtime_names == persisted_names:
-            return self
-
-        # Tools don't match - this is not allowed
+        # Removing tools breaks backward compatibility because the LLM may
+        # have already been told about them.  Adding new tools is safe — the
+        # LLM simply gains new capabilities on the next turn.
         missing_in_runtime = persisted_names - runtime_names
-        added_in_runtime = runtime_names - persisted_names
-
-        details: list[str] = []
         if missing_in_runtime:
-            details.append(f"removed: {sorted(missing_in_runtime)}")
-        if added_in_runtime:
-            details.append(f"added: {sorted(added_in_runtime)}")
-
-        raise ValueError(
-            f"Cannot resume conversation: tools cannot be changed mid-conversation "
-            f"({'; '.join(details)}). "
-            f"To use different tools, start a new conversation."
-        )
+            raise ValueError(
+                f"Cannot resume conversation: tools were removed mid-conversation "
+                f"(removed: {sorted(missing_in_runtime)}). "
+                f"To use different tools, start a new conversation."
+            )
+
+        return self
 
     def model_dump_succint(self, **kwargs):
         """Like model_dump, but excludes None fields by default."""
diff --git a/tests/cross/test_agent_loading.py b/tests/cross/test_agent_loading.py
@@ -160,17 +160,15 @@ def test_conversation_fails_when_removing_tools():
                 visualizer=None,
             )
 
-        assert "tools cannot be changed mid-conversation" in str(exc_info.value)
+        assert "tools were removed mid-conversation" in str(exc_info.value)
         assert "removed:" in str(exc_info.value)
         assert "FileEditorTool" in str(exc_info.value)
 
 
-def test_conversation_fails_when_adding_tools():
-    """Test that adding new tools fails.
+def test_conversation_succeeds_when_adding_tools():
+    """Test that adding new tools succeeds on resume.
 
-    Tools are part of the system prompt and cannot be changed mid-conversation.
-    To use different tools, start a new conversation or use conversation forking.
-    See: https://github.com/OpenHands/OpenHands/issues/8560
+    Adding tools is allowed — only removing tools is rejected.
     """
     with tempfile.TemporaryDirectory() as temp_dir:
         # Create conversation with only one tool
@@ -194,7 +192,7 @@ def test_conversation_fails_when_adding_tools():
         conversation_id = conversation.state.id
         del conversation
 
-        # Resume with additional tools - should FAIL (tools must match exactly)
+        # Resume with additional tools - should SUCCEED (adding tools is allowed)
         expanded_tools = [
             Tool(name="TerminalTool"),
             Tool(name="FileEditorTool"),  # New tool added
@@ -204,18 +202,14 @@ def test_conversation_fails_when_adding_tools():
         )
         expanded_agent = Agent(llm=llm2, tools=expanded_tools)
 
-        with pytest.raises(ValueError) as exc_info:
-            LocalConversation(
-                agent=expanded_agent,
-                workspace=temp_dir,
-                persistence_dir=temp_dir,
-                conversation_id=conversation_id,
-                visualizer=None,
-            )
-
-        assert "tools cannot be changed mid-conversation" in str(exc_info.value)
-        assert "added:" in str(exc_info.value)
-        assert "FileEditorTool" in str(exc_info.value)
+        conversation = LocalConversation(
+            agent=expanded_agent,
+            workspace=temp_dir,
+            persistence_dir=temp_dir,
+            conversation_id=conversation_id,
+            visualizer=None,
+        )
+        assert conversation is not None
 
 
 def test_conversation_fails_when_used_tool_is_missing():
@@ -274,10 +268,8 @@ def test_conversation_fails_when_used_tool_is_missing():
         )
         reduced_agent = Agent(llm=llm2, tools=reduced_tools)
 
-        # This should raise - tools cannot be changed mid-conversation
-        with pytest.raises(
-            ValueError, match="tools cannot be changed mid-conversation"
-        ):
+        # This should raise - tools were removed mid-conversation
+        with pytest.raises(ValueError, match="tools were removed mid-conversation"):
             LocalConversation(
                 agent=reduced_agent,
                 workspace=temp_dir,
diff --git a/tests/cross/test_conversation_restore_behavior.py b/tests/cross/test_conversation_restore_behavior.py
@@ -231,7 +231,7 @@ def test_conversation_restore_fails_when_removing_tools(mock_completion):
         )
 
         with pytest.raises(
-            ValueError, match="tools cannot be changed mid-conversation"
+            ValueError, match="tools were removed mid-conversation"
         ) as exc:
             lifecycle.restore(runtime_agent)
 
@@ -240,8 +240,11 @@ def test_conversation_restore_fails_when_removing_tools(mock_completion):
 
 
 @patch("openhands.sdk.llm.llm.litellm_completion")
-def test_conversation_restore_fails_when_adding_tools(mock_completion):
-    """Restore must fail when runtime tools add a new tool."""
+def test_conversation_restore_succeeds_when_adding_tools(mock_completion):
+    """Restore must succeed when runtime tools add a new tool.
+
+    Adding tools is allowed — only removing tools is rejected.
+    """
 
     mock_completion.return_value = create_mock_litellm_response(
         content="I'll help you with that.", finish_reason="stop"
@@ -274,13 +277,8 @@ def test_conversation_restore_fails_when_adding_tools(mock_completion):
             skill_keyword="alpha",
         )
 
-        with pytest.raises(
-            ValueError, match="tools cannot be changed mid-conversation"
-        ) as exc:
-            lifecycle.restore(runtime_agent)
-
-        assert "added:" in str(exc.value)
-        assert "FileEditorTool" in str(exc.value)
+        conversation = lifecycle.restore(runtime_agent)
+        assert conversation is not None
 
 
 @patch("openhands.sdk.llm.llm.litellm_completion")
@@ -364,7 +362,7 @@ def test_conversation_restore_fails_when_default_tools_removed(mock_completion):
         )
 
         with pytest.raises(
-            ValueError, match="tools cannot be changed mid-conversation"
+            ValueError, match="tools were removed mid-conversation"
         ) as exc:
             lifecycle.restore(runtime_agent)
 
@@ -373,8 +371,11 @@ def test_conversation_restore_fails_when_default_tools_removed(mock_completion):
 
 
 @patch("openhands.sdk.llm.llm.litellm_completion")
-def test_conversation_restore_fails_when_default_tools_added(mock_completion):
-    """Restore must fail if include_default_tools adds a built-in tool."""
+def test_conversation_restore_succeeds_when_default_tools_added(mock_completion):
+    """Restore must succeed if include_default_tools adds a built-in tool.
+
+    Adding tools is allowed — only removing tools is rejected.
+    """
 
     mock_completion.return_value = create_mock_litellm_response(
         content="I'll help you with that.", finish_reason="stop"
@@ -409,13 +410,8 @@ def test_conversation_restore_fails_when_default_tools_added(mock_completion):
             include_default_tools=["FinishTool", "ThinkTool"],
         )
 
-        with pytest.raises(
-            ValueError, match="tools cannot be changed mid-conversation"
-        ) as exc:
-            lifecycle.restore(runtime_agent)
-
-        assert "added:" in str(exc.value)
-        assert "think" in str(exc.value)
+        conversation = lifecycle.restore(runtime_agent)
+        assert conversation is not None
 
 
 @patch("openhands.sdk.llm.llm.litellm_completion")
diff --git a/tests/fixtures/conversations/v1_11_5_cli_default/base_state.json b/tests/fixtures/conversations/v1_11_5_cli_default/base_state.json
@@ -0,0 +1,120 @@
+{
+  "id": "11111111-2222-3333-4444-555555555555",
+  "agent": {
+    "llm": {
+      "model": "gpt-4o-mini",
+      "api_key": "**********",
+      "openrouter_site_url": "https://docs.all-hands.dev/",
+      "openrouter_app_name": "OpenHands",
+      "num_retries": 5,
+      "retry_multiplier": 8.0,
+      "retry_min_wait": 8,
+      "retry_max_wait": 64,
+      "timeout": 300,
+      "max_message_chars": 30000,
+      "temperature": 0.0,
+      "top_p": 1.0,
+      "max_input_tokens": 128000,
+      "max_output_tokens": 16384,
+      "stream": false,
+      "drop_params": true,
+      "modify_params": true,
+      "disable_stop_word": false,
+      "caching_prompt": true,
+      "log_completions": false,
+      "log_completions_folder": "logs/completions",
+      "native_tool_calling": true,
+      "reasoning_effort": "high",
+      "enable_encrypted_reasoning": true,
+      "prompt_cache_retention": "24h",
+      "extended_thinking_budget": 200000,
+      "usage_id": "test-llm",
+      "litellm_extra_body": {}
+    },
+    "tools": [
+      {
+        "name": "terminal",
+        "params": {}
+      },
+      {
+        "name": "file_editor",
+        "params": {}
+      },
+      {
+        "name": "task_tracker",
+        "params": {}
+      }
+    ],
+    "mcp_config": {},
+    "include_default_tools": [
+      "FinishTool",
+      "ThinkTool"
+    ],
+    "system_prompt_filename": "system_prompt.j2",
+    "security_policy_filename": "security_policy.j2",
+    "system_prompt_kwargs": {
+      "cli_mode": true,
+      "llm_security_analyzer": true
+    },
+    "condenser": {
+      "llm": {
+        "model": "gpt-4o-mini",
+        "api_key": "**********",
+        "openrouter_site_url": "https://docs.all-hands.dev/",
+        "openrouter_app_name": "OpenHands",
+        "num_retries": 5,
+        "retry_multiplier": 8.0,
+        "retry_min_wait": 8,
+        "retry_max_wait": 64,
+        "timeout": 300,
+        "max_message_chars": 30000,
+        "temperature": 0.0,
+        "top_p": 1.0,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 16384,
+        "stream": false,
+        "drop_params": true,
+        "modify_params": true,
+        "disable_stop_word": false,
+        "caching_prompt": true,
+        "log_completions": false,
+        "log_completions_folder": "logs/completions",
+        "native_tool_calling": true,
+        "reasoning_effort": "high",
+        "enable_encrypted_reasoning": true,
+        "prompt_cache_retention": "24h",
+        "extended_thinking_budget": 200000,
+        "usage_id": "condenser",
+        "litellm_extra_body": {}
+      },
+      "max_size": 80,
+      "keep_first": 4,
+      "minimum_progress": 0.1,
+      "hard_context_reset_max_retries": 5,
+      "hard_context_reset_context_scaling": 0.8,
+      "kind": "LLMSummarizingCondenser"
+    },
+    "kind": "Agent"
+  },
+  "workspace": {
+    "working_dir": "/workspace/project/software-agent-sdk/.agent_tmp/repro/persistence",
+    "kind": "LocalWorkspace"
+  },
+  "persistence_dir": "/workspace/project/software-agent-sdk/.agent_tmp/repro/persistence/11111111222233334444555555555555",
+  "max_iterations": 500,
+  "stuck_detection": true,
+  "execution_status": "idle",
+  "confirmation_policy": {
+    "kind": "NeverConfirm"
+  },
+  "activated_knowledge_skills": [],
+  "blocked_actions": {},
+  "blocked_messages": {},
+  "stats": {
+    "usage_to_metrics": {}
+  },
+  "secret_registry": {
+    "secret_sources": {}
+  },
+  "agent_state": {}
+}
diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py