-
Notifications
You must be signed in to change notification settings - Fork 2.6k
feat: simplify browser use detection to use image support #6867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,39 +15,6 @@ export const litellmDefaultModelInfo: ModelInfo = { | |
| cacheReadsPrice: 0.3, | ||
| } | ||
|
|
||
| export const LITELLM_COMPUTER_USE_MODELS = new Set([ | ||
| "claude-3-5-sonnet-latest", | ||
| "claude-opus-4-1-20250805", | ||
| "claude-opus-4-20250514", | ||
| "claude-sonnet-4-20250514", | ||
| "claude-3-7-sonnet-latest", | ||
| "claude-3-7-sonnet-20250219", | ||
| "claude-3-5-sonnet-20241022", | ||
| "vertex_ai/claude-3-5-sonnet", | ||
| "vertex_ai/claude-3-5-sonnet-v2", | ||
| "vertex_ai/claude-3-5-sonnet-v2@20241022", | ||
| "vertex_ai/claude-3-7-sonnet@20250219", | ||
| "vertex_ai/claude-opus-4-1@20250805", | ||
| "vertex_ai/claude-opus-4@20250514", | ||
| "vertex_ai/claude-sonnet-4@20250514", | ||
| "openrouter/anthropic/claude-3.5-sonnet", | ||
| "openrouter/anthropic/claude-3.5-sonnet:beta", | ||
| "openrouter/anthropic/claude-3.7-sonnet", | ||
| "openrouter/anthropic/claude-3.7-sonnet:beta", | ||
| "anthropic.claude-opus-4-1-20250805-v1:0", | ||
| "anthropic.claude-opus-4-20250514-v1:0", | ||
| "anthropic.claude-sonnet-4-20250514-v1:0", | ||
| "anthropic.claude-3-7-sonnet-20250219-v1:0", | ||
| "anthropic.claude-3-5-sonnet-20241022-v2:0", | ||
| "us.anthropic.claude-3-5-sonnet-20241022-v2:0", | ||
| "us.anthropic.claude-3-7-sonnet-20250219-v1:0", | ||
| "us.anthropic.claude-opus-4-1-20250805-v1:0", | ||
| "us.anthropic.claude-opus-4-20250514-v1:0", | ||
| "us.anthropic.claude-sonnet-4-20250514-v1:0", | ||
| "eu.anthropic.claude-3-5-sonnet-20241022-v2:0", | ||
| "eu.anthropic.claude-3-7-sonnet-20250219-v1:0", | ||
| "eu.anthropic.claude-opus-4-1-20250805-v1:0", | ||
| "eu.anthropic.claude-opus-4-20250514-v1:0", | ||
| "eu.anthropic.claude-sonnet-4-20250514-v1:0", | ||
| "snowflake/claude-3-5-sonnet", | ||
| ]) | ||
| // Computer use capability is now determined by image support | ||
| // Any model that supports images can theoretically use browser tools | ||
|
Contributor
Author
There was a problem hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this approach perhaps too permissive? We're enabling browser use for ALL models with image support, including models that may not have been designed or tested for browser automation (e.g., image generation models, basic vision models). Could we consider adding a denylist for known incompatible models, or requiring models to opt in rather than being automatically enabled? |
||
| // This approach is simpler and more inclusive than maintaining hardcoded lists | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,5 @@ | ||
| import axios from "axios" | ||
|
|
||
| import { LITELLM_COMPUTER_USE_MODELS } from "@roo-code/types" | ||
|
|
||
| import type { ModelRecord } from "../../../shared/api" | ||
|
|
||
| import { DEFAULT_HEADERS } from "../constants" | ||
|
|
@@ -33,33 +31,28 @@ export async function getLiteLLMModels(apiKey: string, baseUrl: string): Promise | |
| const response = await axios.get(url, { headers, timeout: 5000 }) | ||
| const models: ModelRecord = {} | ||
|
|
||
| const computerModels = Array.from(LITELLM_COMPUTER_USE_MODELS) | ||
|
|
||
| // Process the model info from the response | ||
| if (response.data && response.data.data && Array.isArray(response.data.data)) { | ||
| for (const model of response.data.data) { | ||
| const modelName = model.model_name | ||
| const modelInfo = model.model_info | ||
| const litellmModelName = model?.litellm_params?.model as string | undefined | ||
|
|
||
| if (!modelName || !modelInfo || !litellmModelName) continue | ||
| if (!modelName || !modelInfo) continue | ||
|
|
||
| // Use explicit supports_computer_use if available, otherwise fall back to hardcoded list | ||
| // Use explicit supports_computer_use if available, otherwise use image support | ||
| let supportsComputerUse: boolean | ||
| if (modelInfo.supports_computer_use !== undefined) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good implementation of the fallback logic. The explicit `supports_computer_use` field takes precedence, with image support used only as the fallback. |
||
| supportsComputerUse = Boolean(modelInfo.supports_computer_use) | ||
| } else { | ||
| // Fallback for older LiteLLM versions that don't have supports_computer_use field | ||
| supportsComputerUse = computerModels.some((computer_model) => | ||
| litellmModelName.endsWith(computer_model), | ||
| ) | ||
| // Browser automation requires screenshot analysis, which requires image/vision capabilities | ||
| // Any model that can process images can theoretically use the browser tool | ||
| supportsComputerUse = Boolean(modelInfo.supports_vision) | ||
| } | ||
|
|
||
| models[modelName] = { | ||
| maxTokens: modelInfo.max_tokens || 8192, | ||
| contextWindow: modelInfo.max_input_tokens || 200000, | ||
| supportsImages: Boolean(modelInfo.supports_vision), | ||
| // litellm_params.model may have a prefix like openrouter/ | ||
| supportsComputerUse, | ||
| supportsPromptCache: Boolean(modelInfo.supports_prompt_caching), | ||
| inputPrice: modelInfo.input_cost_per_token ? modelInfo.input_cost_per_token * 1000000 : undefined, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These comments are identical in both files. Could we make them slightly more specific to each context? For example, here we could mention that LiteLLM can override this with an explicit
`supports_computer_use` field.