-
Notifications
You must be signed in to change notification settings - Fork 219
fix: cap max_output_tokens when using max_tokens fallback #2264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -122,6 +122,12 @@ | |
| # Environment variable to override the minimum context window check | ||
| ENV_ALLOW_SHORT_CONTEXT_WINDOWS: Final[str] = "ALLOW_SHORT_CONTEXT_WINDOWS" | ||
|
|
||
| # Default max output tokens when model info only provides 'max_tokens' (ambiguous). | ||
| # Some providers use 'max_tokens' for the total context window, not output limit. | ||
| # This cap prevents requesting output that exceeds the context window. | ||
| # 16384 is a safe default that works for most models (GPT-4o: 16k, Claude: 8k). | ||
| DEFAULT_MAX_OUTPUT_TOKENS_CAP: Final[int] = 16384 | ||
|
|
||
|
|
||
| class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): | ||
| """Language model interface for OpenHands agents. | ||
|
|
@@ -187,7 +193,7 @@ | |
| "or higher values (0.7-1.0) for more creative responses." | ||
| ), | ||
| ) | ||
| top_p: float | None = Field( | ||
|
Check warning on line 196 in openhands-sdk/openhands/sdk/llm/llm.py
|
||
| default=None, | ||
| ge=0, | ||
| le=1, | ||
|
|
@@ -1112,7 +1118,22 @@ | |
| if isinstance(self._model_info.get("max_output_tokens"), int): | ||
| self.max_output_tokens = self._model_info.get("max_output_tokens") | ||
| elif isinstance(self._model_info.get("max_tokens"), int): | ||
| self.max_output_tokens = self._model_info.get("max_tokens") | ||
| # 'max_tokens' is ambiguous: some providers use it for total | ||
| # context window, not output limit. Cap it to avoid requesting | ||
| # output that exceeds the context window. | ||
| max_tokens_value = self._model_info.get("max_tokens") | ||
| assert isinstance(max_tokens_value, int) # for type checker | ||
| self.max_output_tokens = min( | ||
| max_tokens_value, DEFAULT_MAX_OUTPUT_TOKENS_CAP | ||
| ) | ||
| if max_tokens_value > DEFAULT_MAX_OUTPUT_TOKENS_CAP: | ||
| logger.debug( | ||
| "Capping max_output_tokens from %s to %s for %s " | ||
| "(max_tokens may be context window, not output)", | ||
| max_tokens_value, | ||
| self.max_output_tokens, | ||
| self.model, | ||
| ) | ||
|
Comment on lines
+1121
to
+1136
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟢 Good Design: The capping logic is clean and preserves backward compatibility. Explicitly set |
||
|
|
||
| if "o3" in self.model: | ||
| o3_limit = 100000 | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1070,4 +1070,95 @@ def test_llm_reset_metrics(): | |
| assert llm.metrics.accumulated_cost == 0.0 | ||
|
|
||
|
|
||
| # max_output_tokens Capping Tests | ||
|
|
||
|
|
||
| @patch("openhands.sdk.llm.llm.get_litellm_model_info") | ||
| def test_max_output_tokens_capped_when_using_max_tokens_fallback(mock_get_model_info): | ||
| """Test that max_output_tokens is capped when falling back to max_tokens. | ||
|
|
||
| Some providers (e.g., OpenRouter) set max_tokens to the context window size | ||
| rather than the output limit. Without capping, this could request output | ||
| that exceeds the context window. | ||
|
|
||
| See: https://github.com/OpenHands/software-agent-sdk/issues/XXX | ||
| """ | ||
| from openhands.sdk.llm.llm import DEFAULT_MAX_OUTPUT_TOKENS_CAP | ||
|
|
||
| # Simulate a model where max_tokens = context window (200k) but | ||
| # max_output_tokens is not set | ||
| mock_get_model_info.return_value = { | ||
| "max_tokens": 200000, # This is the context window, not output limit | ||
| "max_output_tokens": None, | ||
| "max_input_tokens": 200000, | ||
| } | ||
|
|
||
| llm = LLM( | ||
| model="openrouter/anthropic/claude-3-haiku", | ||
| api_key=SecretStr("test-key"), | ||
| usage_id="test-llm", | ||
| ) | ||
|
|
||
| # max_output_tokens should be capped, not set to 200000 | ||
| assert llm.max_output_tokens is not None | ||
| assert llm.max_output_tokens == DEFAULT_MAX_OUTPUT_TOKENS_CAP | ||
| assert llm.max_output_tokens < 200000 | ||
|
|
||
|
|
||
| @patch("openhands.sdk.llm.llm.get_litellm_model_info") | ||
| def test_max_output_tokens_uses_actual_value_when_available(mock_get_model_info): | ||
| """Test that actual max_output_tokens is used when available.""" | ||
| # Simulate a model with proper max_output_tokens | ||
| mock_get_model_info.return_value = { | ||
| "max_tokens": 8192, | ||
| "max_output_tokens": 8192, | ||
| "max_input_tokens": 200000, | ||
| } | ||
|
|
||
| llm = LLM( | ||
| model="anthropic/claude-3-5-sonnet-latest", | ||
| api_key=SecretStr("test-key"), | ||
| usage_id="test-llm", | ||
| ) | ||
|
|
||
| # Should use the actual max_output_tokens, not capped | ||
| assert llm.max_output_tokens == 8192 | ||
|
|
||
|
|
||
| @patch("openhands.sdk.llm.llm.get_litellm_model_info") | ||
| def test_max_output_tokens_small_max_tokens_not_capped(mock_get_model_info): | ||
| """Test that small max_tokens fallback is not unnecessarily capped.""" | ||
| from openhands.sdk.llm.llm import DEFAULT_MAX_OUTPUT_TOKENS_CAP | ||
|
|
||
| # Simulate a model where max_tokens is small (actual output limit) | ||
| mock_get_model_info.return_value = { | ||
| "max_tokens": 4096, # This is the actual output limit | ||
| "max_output_tokens": None, | ||
| "max_input_tokens": None, | ||
| } | ||
|
|
||
| llm = LLM( | ||
| model="openrouter/test/small-model", | ||
| api_key=SecretStr("test-key"), | ||
| usage_id="test-llm", | ||
| ) | ||
|
|
||
| # Should use the actual value since it's below the cap | ||
| assert llm.max_output_tokens == 4096 | ||
| assert llm.max_output_tokens < DEFAULT_MAX_OUTPUT_TOKENS_CAP | ||
|
|
||
|
|
||
| def test_explicit_max_output_tokens_not_overridden(): | ||
| """Test that explicitly set max_output_tokens is respected.""" | ||
| llm = LLM( | ||
| model="gpt-4o", | ||
| api_key=SecretStr("test-key"), | ||
| usage_id="test-llm", | ||
| max_output_tokens=32768, # Explicitly set higher than cap | ||
| ) | ||
|
|
||
| # Should respect the explicit value | ||
| assert llm.max_output_tokens == 32768 | ||
|
|
||
|
|
||
|
Comment on lines
+1073
to
+1163
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟢 Solid Tests: Comprehensive coverage of the capping logic with real behavior tests (not mocks). Tests verify:
These tests would catch regressions. |
||
| # LLM Registry Tests | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
🟢 Pragmatic: 16384 is a sensible default that works for common models (GPT-4o: 16k, Claude: 8k). This solves the real problem where uncapped values cause OpenRouter errors.