Skip to content

Commit c3bb8e7

Browse files
Add gpt-5.4 to resolve_model_config.py (#2374)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent f0f323e commit c3bb8e7

File tree

6 files changed

+40
-11
lines changed

6 files changed

+40
-11
lines changed

.github/run-eval/resolve_model_config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,14 @@
140140
"reasoning_effort": "high",
141141
},
142142
},
143+
"gpt-5.4": {
144+
"id": "gpt-5.4",
145+
"display_name": "GPT-5.4",
146+
"llm_config": {
147+
"model": "litellm_proxy/openai/gpt-5.4",
148+
"reasoning_effort": "high",
149+
},
150+
},
143151
"minimax-m2": {
144152
"id": "minimax-m2",
145153
"display_name": "MiniMax M2",

openhands-sdk/openhands/sdk/llm/utils/model_features.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class ModelFeatures:
6969
"gemini-3.1-pro-preview",
7070
# OpenAI GPT-5 family (includes mini variants)
7171
"gpt-5",
72+
"gpt-5.4",
7273
# Anthropic Opus 4.5 and 4.6
7374
"claude-opus-4-5",
7475
"claude-opus-4-6",

openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class ModelPromptSpec(BaseModel):
4040
"gpt-5-codex",
4141
("gpt-5-codex", "gpt-5.1-codex", "gpt-5.2-codex", "gpt-5.3-codex"),
4242
),
43-
("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2")),
43+
("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4")),
4444
),
4545
}
4646

tests/github_workflows/test_resolve_model_config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,3 +491,13 @@ def test_models_importable_without_litellm():
491491
f"stderr: {result.stderr}"
492492
)
493493
assert "SUCCESS" in result.stdout
494+
495+
496+
def test_gpt_5_4_config():
497+
"""Test that gpt-5.4 has correct configuration."""
498+
model = MODELS["gpt-5.4"]
499+
500+
assert model["id"] == "gpt-5.4"
501+
assert model["display_name"] == "GPT-5.4"
502+
assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4"
503+
assert model["llm_config"]["reasoning_effort"] == "high"

tests/integration/tests/c01_thinking_block_condenser.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
"""
22
Integration test for thinking block handling during condensation.
33
4-
This test validates that Claude Opus's thinking blocks are properly handled
4+
This test validates that Anthropic Claude's thinking blocks are properly handled
55
during conversation condensation, preventing malformed signature errors that
66
can occur when thinking blocks are included in conversation history.
7+
8+
Note: This test only applies to models that support extended_thinking (Anthropic
9+
Claude models). Models with reasoning_effort (like OpenAI o-series and GPT-5.x)
10+
produce reasoning items instead of thinking blocks, and are skipped.
711
"""
812

913
from openhands.sdk import LLM, Message, TextContent, Tool
1014
from openhands.sdk.context.condenser.base import CondenserBase
1115
from openhands.sdk.context.view import View
1216
from openhands.sdk.conversation.impl.local_conversation import LocalConversation
1317
from openhands.sdk.event import ActionEvent, Condensation
18+
from openhands.sdk.llm.utils.model_features import get_features
1419
from openhands.sdk.tool import register_tool
1520
from openhands.tools.terminal import TerminalTool
1621
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
@@ -135,16 +140,16 @@ def setup(self) -> None:
135140
"""
136141
Validate that the model supports extended thinking.
137142
138-
Thinking blocks are primarily supported by:
139-
- Anthropic Claude models (extended_thinking)
140-
- Some Gemini models (extended_thinking)
141-
- Some other models (reasoning_effort)
143+
Thinking blocks are specifically supported by Anthropic Claude models
144+
with extended_thinking enabled. Models that only support reasoning_effort
145+
(like OpenAI o-series and GPT-5.x) produce reasoning items instead of
146+
thinking blocks, so they should be skipped.
142147
"""
143148
model = self.llm_config.get("model", "")
149+
features = get_features(model)
144150

145-
# Check if model has extended thinking or reasoning effort configured
151+
# Check if model has extended thinking configured
146152
has_extended_thinking = self.llm_config.get("extended_thinking", False)
147-
has_reasoning_effort = "reasoning_effort" in self.llm_config
148153

149154
# For Claude Opus, automatically enable extended thinking if not set
150155
if "opus" in model.lower() and not has_extended_thinking:
@@ -154,11 +159,15 @@ def setup(self) -> None:
154159
**{**self.llm.model_dump(), **self.llm_config}
155160
)
156161
self.agent.llm = self.llm
162+
has_extended_thinking = True
157163

158-
# Skip test if model doesn't support thinking blocks
159-
if not has_extended_thinking and not has_reasoning_effort:
164+
# Skip test if model doesn't support extended thinking (which produces
165+
# thinking_blocks). Models that only support reasoning_effort produce
166+
# responses_reasoning_item instead, which is a different mechanism.
167+
if not has_extended_thinking and not features.supports_extended_thinking:
160168
raise SkipTest(
161-
f"Model {model} does not support extended thinking or reasoning effort"
169+
f"Model {model} does not support extended thinking "
170+
"(produces reasoning items instead of thinking blocks)"
162171
)
163172

164173
def conversation_callback(self, event):

tests/sdk/llm/test_model_features.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def test_model_matches(name, pattern, expected):
3939
# GPT-5 family
4040
("gpt-5.2", True),
4141
("gpt-5.2-codex", True),
42+
("gpt-5.4", True),
4243
("gpt-4o", False),
4344
("claude-3-5-sonnet", False),
4445
("gemini-1.5-pro", False),

0 commit comments

Comments (0)