Skip to content

Commit f34dd4c

Browse files
fix(llm): cap auto-detected max_output_tokens when it fills the entire context window
When litellm's model registry reports max_output_tokens >= max_input_tokens (e.g. Nemotron: both 262144), the SDK would request the entire context window for output, leaving zero tokens for input. Every provider call was rejected, the condenser misinterpreted this as context overflow, and crashed on the near-empty history with NoCondensationAvailableException. Cap auto-detected max_output_tokens to half the context window when it would otherwise consume the full window. Explicitly user-set values are not affected.

Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 0b4be6f commit f34dd4c

File tree

2 files changed

+40
-11
lines changed

openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,25 +1210,30 @@ def _init_model_info_and_caps(self) -> None:
12101210
elif self._model_info is not None:
12111211
if isinstance(self._model_info.get("max_output_tokens"), int):
12121212
self.max_output_tokens = self._model_info.get("max_output_tokens")
1213-
# Guard: if the registry reports max_output_tokens >= the
1214-
# context window, requesting that many output tokens would
1215-
# leave zero room for input and the provider will reject
1216-
# every call. Cap to half the context window so input has
1217-
# headroom (e.g. Nemotron 262 144 / 262 144).
1213+
# Guard: if max_output_tokens >= the context window,
1214+
# requesting that many output tokens would leave zero
1215+
# room for input and strict providers (e.g. AWS Bedrock,
1216+
# Nemotron 262 144 / 262 144) will reject every call.
1217+
# Halve it so input has headroom. We check both
1218+
# max_input_tokens and max_tokens since either may
1219+
# represent the context window depending on the provider.
1220+
context_window = self.max_input_tokens or self._model_info.get(
1221+
"max_tokens"
1222+
)
12181223
if (
1219-
self.max_input_tokens is not None
1224+
context_window is not None
12201225
and self.max_output_tokens is not None
1221-
and self.max_output_tokens >= self.max_input_tokens
1226+
and self.max_output_tokens >= context_window
12221227
):
1223-
capped = self.max_input_tokens // 2
1228+
capped = self.max_output_tokens // 2
12241229
logger.debug(
12251230
"Capping max_output_tokens from %s to %s "
12261231
"for %s (max_output_tokens >= context "
12271232
"window %s)",
12281233
self.max_output_tokens,
12291234
capped,
12301235
self.model,
1231-
self.max_input_tokens,
1236+
context_window,
12321237
)
12331238
self.max_output_tokens = capped
12341239
elif isinstance(self._model_info.get("max_tokens"), int):

tests/sdk/llm/test_llm.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,8 +1168,8 @@ def test_max_output_tokens_capped_when_equal_to_context_window(
11681168
"""max_output_tokens == context window leaves zero input headroom.
11691169
11701170
Nemotron reports max_output_tokens = max_input_tokens = 262144.
1171-
Without capping, every LLM call is rejected because the entire context
1172-
window is reserved for output.
1171+
Strict providers (e.g. AWS Bedrock) reject every call when
1172+
max_output_tokens fills the entire context window.
11731173
"""
11741174
mock_get_model_info.return_value = {
11751175
"max_output_tokens": 262144,
@@ -1186,6 +1186,30 @@ def test_max_output_tokens_capped_when_equal_to_context_window(
11861186
assert llm.max_input_tokens == 262144
11871187

11881188

1189+
@patch("openhands.sdk.llm.llm.get_litellm_model_info")
1190+
def test_max_output_tokens_capped_when_equal_to_max_tokens(
1191+
mock_get_model_info,
1192+
):
1193+
"""max_output_tokens == max_tokens should also be halved.
1194+
1195+
Some registries only provide max_tokens (context window) without
1196+
max_input_tokens. The guard should still fire.
1197+
"""
1198+
mock_get_model_info.return_value = {
1199+
"max_output_tokens": 131072,
1200+
"max_tokens": 131072,
1201+
"max_input_tokens": None,
1202+
}
1203+
1204+
llm = LLM(
1205+
model="litellm_proxy/test-model-max-tokens-only",
1206+
api_key=SecretStr("test-key"),
1207+
usage_id="test-llm",
1208+
)
1209+
1210+
assert llm.max_output_tokens == 131072 // 2
1211+
1212+
11891213
@patch("openhands.sdk.llm.llm.get_litellm_model_info")
11901214
def test_max_output_tokens_not_capped_when_below_context_window(
11911215
mock_get_model_info,

0 commit comments

Comments
 (0)