Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions openhands-sdk/openhands/sdk/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,6 +1210,32 @@ def _init_model_info_and_caps(self) -> None:
elif self._model_info is not None:
if isinstance(self._model_info.get("max_output_tokens"), int):
self.max_output_tokens = self._model_info.get("max_output_tokens")
# Guard: if max_output_tokens >= the context window,
# requesting that many output tokens would leave zero
# room for input and strict providers (e.g. AWS Bedrock)
# will reject every call. Halve it so input has
# headroom. We check both max_input_tokens and
# max_tokens since either may represent the context
# window depending on the provider.
context_window = self.max_input_tokens or self._model_info.get(
"max_tokens"
)
if (
context_window is not None
and self.max_output_tokens is not None
and self.max_output_tokens >= context_window
):
capped = self.max_output_tokens // 2
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's why we sometimes had 4096 or something like that, output tokens are not typically all that much in a single call. This works though! 🤔

It just means that when we reach the context error, the history is smaller than it would be with a fixed value like 4096, because half the window is larger.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does setting the max like that encourage models to generate more? Honestly I'm not sure. I'd expect we'll end up with very similarly-sized events as if we had set it at 4096.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤷 I don't know, I'm thinking about the reverse: setting half means that the LLM API provider will error sooner, because it adds that value to the prompt I think? So to the input tokens at the time of the request.

At least, I'm pretty sure Anthropic and OpenAI do that, and I thought the error message suggested it... I could be wrong though

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The relevant error message here is:

You passed 2468 input characters and requested 262144 output tokens.
However, the model's context length is only 262144 tokens, resulting in
a maximum input length of 0 tokens (at most 0 characters). Please reduce
the length of the input prompt.

Maybe the "requested" suggests that behavior? In which case this is probably worth escalating to LiteLLM, considering it's their registry that sets the output tokens the way it is.

logger.debug(
"Capping max_output_tokens from %s to %s "
"for %s (max_output_tokens >= context "
"window %s)",
self.max_output_tokens,
capped,
self.model,
context_window,
)
self.max_output_tokens = capped
elif isinstance(self._model_info.get("max_tokens"), int):
# 'max_tokens' is ambiguous: some providers use it for total
# context window, not output limit. Cap it to avoid requesting
Expand Down
67 changes: 67 additions & 0 deletions tests/sdk/llm/test_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1161,4 +1161,71 @@ def test_explicit_max_output_tokens_not_overridden():
assert llm.max_output_tokens == 32768


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_capped_when_equal_to_context_window(
    mock_get_model_info,
):
    """Halve max_output_tokens when it equals the context window.

    Strict providers (e.g. AWS Bedrock) reject every call when
    max_output_tokens fills the entire context window, since that
    leaves zero headroom for input tokens.
    """
    window = 262144
    mock_get_model_info.return_value = {
        "max_output_tokens": window,
        "max_input_tokens": window,
    }

    llm = LLM(
        model="litellm_proxy/test-model-equal-windows",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    # The output cap is halved; the input window itself stays untouched.
    assert llm.max_output_tokens == window // 2
    assert llm.max_input_tokens == window


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_capped_when_equal_to_max_tokens(
    mock_get_model_info,
):
    """Halve max_output_tokens when it equals the registry's max_tokens.

    Some registries supply only max_tokens (the context window) and no
    max_input_tokens; the capping guard must still fire in that case.
    """
    window = 131072
    mock_get_model_info.return_value = {
        "max_output_tokens": window,
        "max_tokens": window,
        "max_input_tokens": None,
    }

    llm = LLM(
        model="litellm_proxy/test-model-max-tokens-only",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    assert llm.max_output_tokens == window // 2


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_not_capped_when_below_context_window(
    mock_get_model_info,
):
    """Keep max_output_tokens as-is when it fits inside the window."""
    output_limit = 8192
    mock_get_model_info.return_value = {
        "max_output_tokens": output_limit,
        "max_input_tokens": 200000,
    }

    llm = LLM(
        model="anthropic/claude-3-5-sonnet-latest",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    # Well below the 200k window, so the value passes through unchanged.
    assert llm.max_output_tokens == output_limit


# LLM Registry Tests
Loading