Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion openhands-sdk/openhands/sdk/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@
# Environment variable to override the minimum context window check
ENV_ALLOW_SHORT_CONTEXT_WINDOWS: Final[str] = "ALLOW_SHORT_CONTEXT_WINDOWS"

# Default cap applied to max_output_tokens when model info exposes only the
# ambiguous 'max_tokens' field. Some providers report the *total context
# window* under 'max_tokens' rather than the output limit, so using the raw
# fallback value could request more output than the context window holds.
# 16384 is a safe default that works for most models (GPT-4o: 16k, Claude: 8k).
DEFAULT_MAX_OUTPUT_TOKENS_CAP: Final[int] = 16384
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟢 Pragmatic: 16384 is a sensible default that works for common models (GPT-4o: 16k, Claude: 8k). This solves the real problem where uncapped values cause OpenRouter errors.



class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
"""Language model interface for OpenHands agents.
Expand Down Expand Up @@ -187,7 +193,7 @@
"or higher values (0.7-1.0) for more creative responses."
),
)
top_p: float | None = Field(

Check warning on line 196 in openhands-sdk/openhands/sdk/llm/llm.py

View workflow job for this annotation

GitHub Actions / SDK programmatic API (Griffe)

LLM.top_p

Attribute value was changed: `Field(default=1.0, ge=0, le=1)` -> `Field(default=None, ge=0, le=1, description='Nucleus sampling parameter. Defaults to None (uses provider default). Set to a value between 0 and 1 to control diversity of outputs.')`
default=None,
ge=0,
le=1,
Expand Down Expand Up @@ -1112,7 +1118,22 @@
if isinstance(self._model_info.get("max_output_tokens"), int):
self.max_output_tokens = self._model_info.get("max_output_tokens")
elif isinstance(self._model_info.get("max_tokens"), int):
self.max_output_tokens = self._model_info.get("max_tokens")
# 'max_tokens' is ambiguous: some providers use it for total
# context window, not output limit. Cap it to avoid requesting
# output that exceeds the context window.
max_tokens_value = self._model_info.get("max_tokens")
assert isinstance(max_tokens_value, int) # for type checker
self.max_output_tokens = min(
max_tokens_value, DEFAULT_MAX_OUTPUT_TOKENS_CAP
)
if max_tokens_value > DEFAULT_MAX_OUTPUT_TOKENS_CAP:
logger.debug(
"Capping max_output_tokens from %s to %s for %s "
"(max_tokens may be context window, not output)",
max_tokens_value,
self.max_output_tokens,
self.model,
)
Comment on lines +1121 to +1136
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟢 Good Design: The capping logic is clean and preserves backward compatibility. Explicitly set max_output_tokens values are still respected (tested in test_explicit_max_output_tokens_not_overridden), so users can override if needed. Debug logging helps troubleshoot edge cases.


if "o3" in self.model:
o3_limit = 100000
Expand Down
91 changes: 91 additions & 0 deletions tests/sdk/llm/test_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,4 +1070,95 @@ def test_llm_reset_metrics():
assert llm.metrics.accumulated_cost == 0.0


# max_output_tokens Capping Tests


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_capped_when_using_max_tokens_fallback(mock_get_model_info):
    """Verify the 'max_tokens' fallback path clamps to the default output cap.

    Providers such as OpenRouter report the full context window under
    'max_tokens' instead of the output limit. Without the cap, the fallback
    would request more output tokens than the context window can hold.
    """
    from openhands.sdk.llm.llm import DEFAULT_MAX_OUTPUT_TOKENS_CAP

    # Model info with no usable max_output_tokens: 'max_tokens' here is the
    # 200k context window masquerading as an output limit.
    context_window = 200000
    mock_get_model_info.return_value = {
        "max_tokens": context_window,
        "max_output_tokens": None,
        "max_input_tokens": context_window,
    }

    llm = LLM(
        model="openrouter/anthropic/claude-3-haiku",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    # The fallback must be clamped to the cap, never used verbatim.
    assert llm.max_output_tokens is not None
    assert llm.max_output_tokens == DEFAULT_MAX_OUTPUT_TOKENS_CAP
    assert llm.max_output_tokens < context_window


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_uses_actual_value_when_available(mock_get_model_info):
    """When model info carries a genuine max_output_tokens, it is used as-is."""
    expected_output_limit = 8192

    # Model info where max_output_tokens is explicitly provided, so the
    # ambiguous 'max_tokens' fallback (and its cap) never comes into play.
    mock_get_model_info.return_value = {
        "max_tokens": expected_output_limit,
        "max_output_tokens": expected_output_limit,
        "max_input_tokens": 200000,
    }

    llm = LLM(
        model="anthropic/claude-3-5-sonnet-latest",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    # The reported output limit is taken verbatim — no capping applied.
    assert llm.max_output_tokens == expected_output_limit


@patch("openhands.sdk.llm.llm.get_litellm_model_info")
def test_max_output_tokens_small_max_tokens_not_capped(mock_get_model_info):
    """A 'max_tokens' fallback below the cap passes through unchanged."""
    from openhands.sdk.llm.llm import DEFAULT_MAX_OUTPUT_TOKENS_CAP

    # Here 'max_tokens' plausibly is the real output limit: it is small,
    # and no max_output_tokens is reported at all.
    reported_limit = 4096
    mock_get_model_info.return_value = {
        "max_tokens": reported_limit,
        "max_output_tokens": None,
        "max_input_tokens": None,
    }

    llm = LLM(
        model="openrouter/test/small-model",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
    )

    # min(value, cap) leaves sub-cap values untouched.
    assert llm.max_output_tokens == reported_limit
    assert llm.max_output_tokens < DEFAULT_MAX_OUTPUT_TOKENS_CAP


def test_explicit_max_output_tokens_not_overridden():
    """An explicit max_output_tokens survives, even when it exceeds the cap."""
    # Deliberately pass a value above DEFAULT_MAX_OUTPUT_TOKENS_CAP: the cap
    # only governs the ambiguous model-info fallback, not user-supplied values.
    requested = 32768

    llm = LLM(
        model="gpt-4o",
        api_key=SecretStr("test-key"),
        usage_id="test-llm",
        max_output_tokens=requested,
    )

    # The explicit setting is honored verbatim.
    assert llm.max_output_tokens == requested


Comment on lines +1073 to +1163
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟢 Solid Tests: Comprehensive coverage of the capping logic with real behavior tests (not mocks). Tests verify:

  1. ✅ Capping when fallback > cap
  2. ✅ Using actual max_output_tokens when available
  3. ✅ Not capping when fallback < cap
  4. ✅ Explicit values override the cap

These tests would catch regressions.

# LLM Registry Tests
Loading