
Commit f5fcef8

csmith49, openhands-agent, and juanmichelini authored
fix(llm): cap auto-detected max_output_tokens when it fills the entire context window (#2747)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Juan Michelini <juan@juan.com.uy>
1 parent 07bd007 commit f5fcef8
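
The heart of the change is a halving rule: when a registry-reported max_output_tokens is at least as large as the context window, the SDK halves it so the input still has headroom. Below is a minimal standalone sketch of that rule mirroring the diff that follows; the cap_max_output_tokens helper name is hypothetical, since in the commit the logic lives inline in LLM._init_model_info_and_caps:

    # Hypothetical helper for illustration only; the real logic is inline
    # in LLM._init_model_info_and_caps (see the diff below).
    def cap_max_output_tokens(
        max_output_tokens: int | None,
        context_window: int | None,
    ) -> int | None:
        """Halve max_output_tokens when it would fill the whole context window."""
        if (
            context_window is not None
            and max_output_tokens is not None
            and max_output_tokens >= context_window
        ):
            return max_output_tokens // 2
        return max_output_tokens

    # Values taken from the new tests in this commit:
    assert cap_max_output_tokens(262144, 262144) == 131072  # capped to half
    assert cap_max_output_tokens(8192, 200000) == 8192      # small limit kept as-is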

File tree

2 files changed, +93 -0 lines changed


openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 26 additions & 0 deletions
@@ -1210,6 +1210,32 @@ def _init_model_info_and_caps(self) -> None:
         elif self._model_info is not None:
             if isinstance(self._model_info.get("max_output_tokens"), int):
                 self.max_output_tokens = self._model_info.get("max_output_tokens")
+                # Guard: if max_output_tokens >= the context window,
+                # requesting that many output tokens would leave zero
+                # room for input and strict providers (e.g. AWS Bedrock)
+                # will reject every call. Halve it so input has
+                # headroom. We check both max_input_tokens and
+                # max_tokens since either may represent the context
+                # window depending on the provider.
+                context_window = self.max_input_tokens or self._model_info.get(
+                    "max_tokens"
+                )
+                if (
+                    context_window is not None
+                    and self.max_output_tokens is not None
+                    and self.max_output_tokens >= context_window
+                ):
+                    capped = self.max_output_tokens // 2
+                    logger.debug(
+                        "Capping max_output_tokens from %s to %s "
+                        "for %s (max_output_tokens >= context "
+                        "window %s)",
+                        self.max_output_tokens,
+                        capped,
+                        self.model,
+                        context_window,
+                    )
+                    self.max_output_tokens = capped
             elif isinstance(self._model_info.get("max_tokens"), int):
                 # 'max_tokens' is ambiguous: some providers use it for total
                 # context window, not output limit. Cap it to avoid requesting
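
Note that this guard only touches the auto-detected value: per the existing test test_explicit_max_output_tokens_not_overridden (visible at the top of the test diff below), a max_output_tokens passed explicitly by the caller is left untouched. A hedged usage sketch, assuming LLM is importable from openhands.sdk.llm (the module patched in the tests) and that the constructor accepts max_output_tokens alongside the fields shown in the new tests:

    from pydantic import SecretStr

    from openhands.sdk.llm import LLM  # import path assumed from the patched module

    # An explicitly configured output limit bypasses auto-detection,
    # so the halving guard above does not apply to it.
    llm = LLM(
        model="litellm_proxy/some-model",
        api_key=SecretStr("test-key"),
        usage_id="my-llm",
        max_output_tokens=32768,
    )
    assert llm.max_output_tokens == 32768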

tests/sdk/llm/test_llm.py

Lines changed: 67 additions & 0 deletions
@@ -1161,4 +1161,71 @@ def test_explicit_max_output_tokens_not_overridden():
     assert llm.max_output_tokens == 32768
 
 
+@patch("openhands.sdk.llm.llm.get_litellm_model_info")
+def test_max_output_tokens_capped_when_equal_to_context_window(
+    mock_get_model_info,
+):
+    """max_output_tokens == context window leaves zero input headroom.
+
+    Strict providers (e.g. AWS Bedrock) reject every call when
+    max_output_tokens fills the entire context window.
+    """
+    mock_get_model_info.return_value = {
+        "max_output_tokens": 262144,
+        "max_input_tokens": 262144,
+    }
+
+    llm = LLM(
+        model="litellm_proxy/test-model-equal-windows",
+        api_key=SecretStr("test-key"),
+        usage_id="test-llm",
+    )
+
+    assert llm.max_output_tokens == 262144 // 2
+    assert llm.max_input_tokens == 262144
+
+
+@patch("openhands.sdk.llm.llm.get_litellm_model_info")
+def test_max_output_tokens_capped_when_equal_to_max_tokens(
+    mock_get_model_info,
+):
+    """max_output_tokens == max_tokens should also be halved.
+
+    Some registries only provide max_tokens (context window) without
+    max_input_tokens. The guard should still fire.
+    """
+    mock_get_model_info.return_value = {
+        "max_output_tokens": 131072,
+        "max_tokens": 131072,
+        "max_input_tokens": None,
+    }
+
+    llm = LLM(
+        model="litellm_proxy/test-model-max-tokens-only",
+        api_key=SecretStr("test-key"),
+        usage_id="test-llm",
+    )
+
+    assert llm.max_output_tokens == 131072 // 2
+
+
+@patch("openhands.sdk.llm.llm.get_litellm_model_info")
+def test_max_output_tokens_not_capped_when_below_context_window(
+    mock_get_model_info,
+):
+    """max_output_tokens < context window should be used as-is."""
+    mock_get_model_info.return_value = {
+        "max_output_tokens": 8192,
+        "max_input_tokens": 200000,
+    }
+
+    llm = LLM(
+        model="anthropic/claude-3-5-sonnet-latest",
+        api_key=SecretStr("test-key"),
+        usage_id="test-llm",
+    )
+
+    assert llm.max_output_tokens == 8192
+
+
 # LLM Registry Tests
