Skip to content

Commit f34dd4c

Browse files
fix(llm): cap auto-detected max_output_tokens when it fills the entire context window
When litellm's model registry reports max_output_tokens >= max_input_tokens (e.g. Nemotron: both 262144), the SDK would request the entire context window for output, leaving zero tokens for input. Every provider call was rejected, the condenser misinterpreted this as context overflow, and crashed on the near-empty history with NoCondensationAvailableException. Cap auto-detected max_output_tokens to half the context window when it would otherwise consume the full window. Explicitly user-set values are not affected.

Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 0b4be6f commit f34dd4c

File tree

2 files changed

+40
-11
lines changed

openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,25 +1210,30 @@ def _init_model_info_and_caps(self) -> None:
12101210
elif self._model_info is not None:
12111211
if isinstance(self._model_info.get("max_output_tokens"), int):
12121212
self.max_output_tokens = self._model_info.get("max_output_tokens")
1213-
# Guard: if the registry reports max_output_tokens >= the
1214-
# context window, requesting that many output tokens would
1215-
# leave zero room for input and the provider will reject
1216-
# every call. Cap to half the context window so input has
1217-
# headroom (e.g. Nemotron 262 144 / 262 144).
1213+
# Guard: if max_output_tokens >= the context window,
1214+
# requesting that many output tokens would leave zero
1215+
# room for input and strict providers (e.g. AWS Bedrock,
1216+
# Nemotron 262 144 / 262 144) will reject every call.
1217+
# Halve it so input has headroom. We check both
1218+
# max_input_tokens and max_tokens since either may
1219+
# represent the context window depending on the provider.
1220+
context_window = self.max_input_tokens or self._model_info.get(
1221+
"max_tokens"
1222+
)
12181223
if (
1219-
self.max_input_tokens is not None
1224+
context_window is not None
12201225
and self.max_output_tokens is not None
1221-
and self.max_output_tokens >= self.max_input_tokens
1226+
and self.max_output_tokens >= context_window
12221227
):
1223-
capped = self.max_input_tokens // 2
1228+
capped = self.max_output_tokens // 2
12241229
logger.debug(
12251230
"Capping max_output_tokens from %s to %s "
12261231
"for %s (max_output_tokens >= context "
12271232
"window %s)",
12281233
self.max_output_tokens,
12291234
capped,
12301235
self.model,
1231-
self.max_input_tokens,
1236+
context_window,
12321237
)
12331238
self.max_output_tokens = capped
12341239
elif isinstance(self._model_info.get("max_tokens"), int):

tests/sdk/llm/test_llm.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,8 +1168,8 @@ def test_max_output_tokens_capped_when_equal_to_context_window(
11681168
"""max_output_tokens == context window leaves zero input headroom.
11691169
11701170
Nemotron reports max_output_tokens = max_input_tokens = 262144.
1171-
Without capping, every LLM call is rejected because the entire context
1172-
window is reserved for output.
1171+
Strict providers (e.g. AWS Bedrock) reject every call when
1172+
max_output_tokens fills the entire context window.
11731173
"""
11741174
mock_get_model_info.return_value = {
11751175
"max_output_tokens": 262144,
@@ -1186,6 +1186,30 @@ def test_max_output_tokens_capped_when_equal_to_context_window(
11861186
assert llm.max_input_tokens == 262144
11871187

11881188

1189+
@patch("openhands.sdk.llm.llm.get_litellm_model_info")
1190+
def test_max_output_tokens_capped_when_equal_to_max_tokens(
1191+
mock_get_model_info,
1192+
):
1193+
"""max_output_tokens == max_tokens should also be halved.
1194+
1195+
Some registries only provide max_tokens (context window) without
1196+
max_input_tokens. The guard should still fire.
1197+
"""
1198+
mock_get_model_info.return_value = {
1199+
"max_output_tokens": 131072,
1200+
"max_tokens": 131072,
1201+
"max_input_tokens": None,
1202+
}
1203+
1204+
llm = LLM(
1205+
model="litellm_proxy/test-model-max-tokens-only",
1206+
api_key=SecretStr("test-key"),
1207+
usage_id="test-llm",
1208+
)
1209+
1210+
assert llm.max_output_tokens == 131072 // 2
1211+
1212+
11891213
@patch("openhands.sdk.llm.llm.get_litellm_model_info")
11901214
def test_max_output_tokens_not_capped_when_below_context_window(
11911215
mock_get_model_info,

0 commit comments

Comments
 (0)