openhands-sdk/openhands/sdk/llm/llm.py (21 additions, 0 deletions)
@@ -1210,6 +1210,27 @@ def _init_model_info_and_caps(self) -> None:
        elif self._model_info is not None:
            if isinstance(self._model_info.get("max_output_tokens"), int):
                self.max_output_tokens = self._model_info.get("max_output_tokens")
                # Guard: if the registry reports max_output_tokens >= the
                # context window, requesting that many output tokens would
                # leave zero room for input and the provider will reject
                # every call. Cap to half the context window so input has
                # headroom (e.g. Nemotron 262144 / 262144).
                if (
                    self.max_input_tokens is not None
                    and self.max_output_tokens is not None
                    and self.max_output_tokens >= self.max_input_tokens
                ):
                    capped = self.max_input_tokens // 2
                    logger.debug(
                        "Capping max_output_tokens from %s to %s "
                        "for %s (max_output_tokens >= context "
                        "window %s)",
                        self.max_output_tokens,
                        capped,
                        self.model,
                        self.max_input_tokens,
                    )
                    self.max_output_tokens = capped
            elif isinstance(self._model_info.get("max_tokens"), int):
                # 'max_tokens' is ambiguous: some providers use it for total
                # context window, not output limit. Cap it to avoid requesting
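The guard's arithmetic, shown as a standalone sketch (plain Python with hypothetical variable names `window` and `out_limit`, not the SDK's API): when the registry's reported output limit equals the context window, the cap halves it, leaving the other half of the window for input.

# Standalone sketch of the capping rule above; `window` and `out_limit`
# are hypothetical names standing in for max_input_tokens and
# max_output_tokens as reported by the model registry.
window = 262144     # reported context window
out_limit = 262144  # reported output-token limit (Nemotron-style)

if out_limit >= window:
    out_limit = window // 2  # cap to 131072, leaving input headroom

print(out_limit)  # 131072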
tests/sdk/llm/test_llm.py (44 additions, 0 deletions)
@@ -1161,4 +1161,48 @@ def test_explicit_max_output_tokens_not_overridden():
    assert llm.max_output_tokens == 32768


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_capped_when_equal_to_context_window(
    mock_get_model_info,
):
    """max_output_tokens == context window leaves zero input headroom.

    Nemotron reports max_output_tokens = max_input_tokens = 262144.
    Without capping, every LLM call is rejected because the entire context
    window is reserved for output.
    """
    mock_get_model_info.return_value = {
        "max_output_tokens": 262144,
        "max_input_tokens": 262144,
    }

    llm = LLM(
        model="litellm_proxy/converse-nemotron-super-3-120b",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    assert llm.max_output_tokens == 262144 // 2
    assert llm.max_input_tokens == 262144


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_not_capped_when_below_context_window(
    mock_get_model_info,
):
    """max_output_tokens < context window should be used as-is."""
    mock_get_model_info.return_value = {
        "max_output_tokens": 8192,
        "max_input_tokens": 200000,
    }

    llm = LLM(
        model="anthropic/claude-3-5-sonnet-latest",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    assert llm.max_output_tokens == 8192

# LLM Registry Tests
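To run just these cases, assuming the repository's usual pytest layout, a selector like pytest tests/sdk/llm/test_llm.py -k capped should pick up both new tests, since both names contain "capped".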