Skip to content

Commit 7ba48a1

Browse files
committed
feat: simplify browser use detection to use image support
- Remove hardcoded OPEN_ROUTER_COMPUTER_USE_MODELS and LITELLM_COMPUTER_USE_MODELS lists
- Update logic to enable browser/computer use for any model that supports images
- This approach is simpler and more inclusive, as browser automation requires screenshot analysis, which needs image/vision capabilities
- Update tests to reflect the new image-based detection logic

This change aligns with Cline's approach, where any model with image support can theoretically use browser tools, making the system more maintainable and avoiding the need to constantly update hardcoded model lists.
1 parent ad0e33e commit 7ba48a1

File tree

6 files changed

+61
-130
lines changed

6 files changed

+61
-130
lines changed

packages/types/src/providers/lite-llm.ts

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,39 +15,6 @@ export const litellmDefaultModelInfo: ModelInfo = {
1515
cacheReadsPrice: 0.3,
1616
}
1717

18-
export const LITELLM_COMPUTER_USE_MODELS = new Set([
19-
"claude-3-5-sonnet-latest",
20-
"claude-opus-4-1-20250805",
21-
"claude-opus-4-20250514",
22-
"claude-sonnet-4-20250514",
23-
"claude-3-7-sonnet-latest",
24-
"claude-3-7-sonnet-20250219",
25-
"claude-3-5-sonnet-20241022",
26-
"vertex_ai/claude-3-5-sonnet",
27-
"vertex_ai/claude-3-5-sonnet-v2",
28-
"vertex_ai/claude-3-5-sonnet-v2@20241022",
29-
"vertex_ai/claude-3-7-sonnet@20250219",
30-
"vertex_ai/claude-opus-4-1@20250805",
31-
"vertex_ai/claude-opus-4@20250514",
32-
"vertex_ai/claude-sonnet-4@20250514",
33-
"openrouter/anthropic/claude-3.5-sonnet",
34-
"openrouter/anthropic/claude-3.5-sonnet:beta",
35-
"openrouter/anthropic/claude-3.7-sonnet",
36-
"openrouter/anthropic/claude-3.7-sonnet:beta",
37-
"anthropic.claude-opus-4-1-20250805-v1:0",
38-
"anthropic.claude-opus-4-20250514-v1:0",
39-
"anthropic.claude-sonnet-4-20250514-v1:0",
40-
"anthropic.claude-3-7-sonnet-20250219-v1:0",
41-
"anthropic.claude-3-5-sonnet-20241022-v2:0",
42-
"us.anthropic.claude-3-5-sonnet-20241022-v2:0",
43-
"us.anthropic.claude-3-7-sonnet-20250219-v1:0",
44-
"us.anthropic.claude-opus-4-1-20250805-v1:0",
45-
"us.anthropic.claude-opus-4-20250514-v1:0",
46-
"us.anthropic.claude-sonnet-4-20250514-v1:0",
47-
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0",
48-
"eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
49-
"eu.anthropic.claude-opus-4-1-20250805-v1:0",
50-
"eu.anthropic.claude-opus-4-20250514-v1:0",
51-
"eu.anthropic.claude-sonnet-4-20250514-v1:0",
52-
"snowflake/claude-3-5-sonnet",
53-
])
18+
// Computer use capability is now determined by image support
19+
// Any model that supports images can theoretically use browser tools
20+
// This approach is simpler and more inclusive than maintaining hardcoded lists

packages/types/src/providers/openrouter.ts

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,9 @@ export const OPEN_ROUTER_PROMPT_CACHING_MODELS = new Set([
5151
"google/gemini-flash-1.5-8b",
5252
])
5353

54-
// https://www.anthropic.com/news/3-5-models-and-computer-use
55-
export const OPEN_ROUTER_COMPUTER_USE_MODELS = new Set([
56-
"anthropic/claude-3.5-sonnet",
57-
"anthropic/claude-3.5-sonnet:beta",
58-
"anthropic/claude-3.7-sonnet",
59-
"anthropic/claude-3.7-sonnet:beta",
60-
"anthropic/claude-3.7-sonnet:thinking",
61-
"anthropic/claude-sonnet-4",
62-
"anthropic/claude-opus-4",
63-
"anthropic/claude-opus-4.1",
64-
])
54+
// Computer use capability is now determined by image support
55+
// Any model that supports images can theoretically use browser tools
56+
// This approach is simpler and more inclusive than maintaining hardcoded lists
6557

6658
// When we first launched these models we didn't have support for
6759
// enabling/disabling the reasoning budget for hybrid models. Now that we

src/api/providers/fetchers/__tests__/litellm.spec.ts

Lines changed: 32 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -404,35 +404,29 @@ describe("getLiteLLMModels", () => {
404404
expect(result).toEqual({})
405405
})
406406

407-
it("uses fallback computer use detection when supports_computer_use is not available", async () => {
407+
it("uses image support as fallback for computer use when supports_computer_use is not available", async () => {
408408
const mockResponse = {
409409
data: {
410410
data: [
411411
{
412-
model_name: "claude-3-5-sonnet-latest",
412+
model_name: "model-with-vision",
413413
model_info: {
414414
max_tokens: 4096,
415415
max_input_tokens: 200000,
416416
supports_vision: true,
417417
supports_prompt_caching: false,
418418
// Note: no supports_computer_use field
419419
},
420-
litellm_params: {
421-
model: "anthropic/claude-3-5-sonnet-latest", // This should match the fallback list
422-
},
423420
},
424421
{
425-
model_name: "gpt-4-turbo",
422+
model_name: "model-without-vision",
426423
model_info: {
427424
max_tokens: 8192,
428425
max_input_tokens: 128000,
429426
supports_vision: false,
430427
supports_prompt_caching: false,
431428
// Note: no supports_computer_use field
432429
},
433-
litellm_params: {
434-
model: "openai/gpt-4-turbo", // This should NOT match the fallback list
435-
},
436430
},
437431
],
438432
},
@@ -442,71 +436,62 @@ describe("getLiteLLMModels", () => {
442436

443437
const result = await getLiteLLMModels("test-api-key", "http://localhost:4000")
444438

445-
expect(result["claude-3-5-sonnet-latest"]).toEqual({
439+
expect(result["model-with-vision"]).toEqual({
446440
maxTokens: 4096,
447441
contextWindow: 200000,
448442
supportsImages: true,
449-
supportsComputerUse: true, // Should be true due to fallback
443+
supportsComputerUse: true, // Should be true because supports_vision is true
450444
supportsPromptCache: false,
451445
inputPrice: undefined,
452446
outputPrice: undefined,
453-
description: "claude-3-5-sonnet-latest via LiteLLM proxy",
447+
description: "model-with-vision via LiteLLM proxy",
454448
})
455449

456-
expect(result["gpt-4-turbo"]).toEqual({
450+
expect(result["model-without-vision"]).toEqual({
457451
maxTokens: 8192,
458452
contextWindow: 128000,
459453
supportsImages: false,
460-
supportsComputerUse: false, // Should be false as it's not in fallback list
454+
supportsComputerUse: false, // Should be false because supports_vision is false
461455
supportsPromptCache: false,
462456
inputPrice: undefined,
463457
outputPrice: undefined,
464-
description: "gpt-4-turbo via LiteLLM proxy",
458+
description: "model-without-vision via LiteLLM proxy",
465459
})
466460
})
467461

468-
it("prioritizes explicit supports_computer_use over fallback detection", async () => {
462+
it("prioritizes explicit supports_computer_use over image-based fallback", async () => {
469463
const mockResponse = {
470464
data: {
471465
data: [
472466
{
473-
model_name: "claude-3-5-sonnet-latest",
467+
model_name: "model-with-vision-but-no-computer",
474468
model_info: {
475469
max_tokens: 4096,
476470
max_input_tokens: 200000,
477471
supports_vision: true,
478472
supports_prompt_caching: false,
479-
supports_computer_use: false, // Explicitly set to false
480-
},
481-
litellm_params: {
482-
model: "anthropic/claude-3-5-sonnet-latest", // This matches fallback list but should be ignored
473+
supports_computer_use: false, // Explicitly set to false despite vision support
483474
},
484475
},
485476
{
486-
model_name: "custom-model",
477+
model_name: "model-without-vision-but-computer",
487478
model_info: {
488479
max_tokens: 8192,
489480
max_input_tokens: 128000,
490481
supports_vision: false,
491482
supports_prompt_caching: false,
492-
supports_computer_use: true, // Explicitly set to true
493-
},
494-
litellm_params: {
495-
model: "custom/custom-model", // This would NOT match fallback list
483+
supports_computer_use: true, // Explicitly set to true despite no vision support
496484
},
497485
},
498486
{
499-
model_name: "another-custom-model",
487+
model_name: "model-with-both-false",
500488
model_info: {
501489
max_tokens: 8192,
502490
max_input_tokens: 128000,
503491
supports_vision: false,
504492
supports_prompt_caching: false,
505493
supports_computer_use: false, // Explicitly set to false
506494
},
507-
litellm_params: {
508-
model: "custom/another-custom-model", // This would NOT match fallback list
509-
},
510495
},
511496
],
512497
},
@@ -516,79 +501,70 @@ describe("getLiteLLMModels", () => {
516501

517502
const result = await getLiteLLMModels("test-api-key", "http://localhost:4000")
518503

519-
expect(result["claude-3-5-sonnet-latest"]).toEqual({
504+
expect(result["model-with-vision-but-no-computer"]).toEqual({
520505
maxTokens: 4096,
521506
contextWindow: 200000,
522507
supportsImages: true,
523-
supportsComputerUse: false, // False because explicitly set to false (fallback ignored)
508+
supportsComputerUse: false, // False because explicitly set to false (image fallback ignored)
524509
supportsPromptCache: false,
525510
inputPrice: undefined,
526511
outputPrice: undefined,
527-
description: "claude-3-5-sonnet-latest via LiteLLM proxy",
512+
description: "model-with-vision-but-no-computer via LiteLLM proxy",
528513
})
529514

530-
expect(result["custom-model"]).toEqual({
515+
expect(result["model-without-vision-but-computer"]).toEqual({
531516
maxTokens: 8192,
532517
contextWindow: 128000,
533518
supportsImages: false,
534519
supportsComputerUse: true, // True because explicitly set to true
535520
supportsPromptCache: false,
536521
inputPrice: undefined,
537522
outputPrice: undefined,
538-
description: "custom-model via LiteLLM proxy",
523+
description: "model-without-vision-but-computer via LiteLLM proxy",
539524
})
540525

541-
expect(result["another-custom-model"]).toEqual({
526+
expect(result["model-with-both-false"]).toEqual({
542527
maxTokens: 8192,
543528
contextWindow: 128000,
544529
supportsImages: false,
545530
supportsComputerUse: false, // False because explicitly set to false
546531
supportsPromptCache: false,
547532
inputPrice: undefined,
548533
outputPrice: undefined,
549-
description: "another-custom-model via LiteLLM proxy",
534+
description: "model-with-both-false via LiteLLM proxy",
550535
})
551536
})
552537

553-
it("handles fallback detection with various model name formats", async () => {
538+
it("handles image-based computer use detection for various models", async () => {
554539
const mockResponse = {
555540
data: {
556541
data: [
557542
{
558-
model_name: "vertex-claude",
543+
model_name: "vertex-model",
559544
model_info: {
560545
max_tokens: 4096,
561546
max_input_tokens: 200000,
562547
supports_vision: true,
563548
supports_prompt_caching: false,
564549
},
565-
litellm_params: {
566-
model: "vertex_ai/claude-3-5-sonnet", // Should match fallback list
567-
},
568550
},
569551
{
570-
model_name: "openrouter-claude",
552+
model_name: "openrouter-model",
571553
model_info: {
572554
max_tokens: 4096,
573555
max_input_tokens: 200000,
574556
supports_vision: true,
575557
supports_prompt_caching: false,
576558
},
577-
litellm_params: {
578-
model: "openrouter/anthropic/claude-3.5-sonnet", // Should match fallback list
579-
},
580559
},
581560
{
582-
model_name: "bedrock-claude",
561+
model_name: "bedrock-model",
583562
model_info: {
584563
max_tokens: 4096,
585564
max_input_tokens: 200000,
586-
supports_vision: true,
565+
supports_vision: false,
587566
supports_prompt_caching: false,
588567
},
589-
litellm_params: {
590-
model: "anthropic.claude-3-5-sonnet-20241022-v2:0", // Should match fallback list
591-
},
592568
},
593569
],
594570
},
@@ -598,8 +574,10 @@ describe("getLiteLLMModels", () => {
598574

599575
const result = await getLiteLLMModels("test-api-key", "http://localhost:4000")
600576

601-
expect(result["vertex-claude"].supportsComputerUse).toBe(true)
602-
expect(result["openrouter-claude"].supportsComputerUse).toBe(true)
603-
expect(result["bedrock-claude"].supportsComputerUse).toBe(true)
577+
// Models with vision support should have computer use enabled
578+
expect(result["vertex-model"].supportsComputerUse).toBe(true)
579+
expect(result["openrouter-model"].supportsComputerUse).toBe(true)
580+
// Model without vision support should not have computer use enabled
581+
expect(result["bedrock-model"].supportsComputerUse).toBe(false)
604582
})
605583
})

src/api/providers/fetchers/__tests__/openrouter.spec.ts

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import { back as nockBack } from "nock"
66

77
import {
88
OPEN_ROUTER_PROMPT_CACHING_MODELS,
9-
OPEN_ROUTER_COMPUTER_USE_MODELS,
109
OPEN_ROUTER_REASONING_BUDGET_MODELS,
1110
OPEN_ROUTER_REQUIRED_REASONING_BUDGET_MODELS,
1211
} from "@roo-code/types"
@@ -49,20 +48,20 @@ describe("OpenRouter API", () => {
4948

5049
expect(ourCachingModels.sort()).toEqual(expectedCachingModels)
5150

52-
const excludedComputerUseModels = new Set([
53-
"anthropic/claude-opus-4.1", // Not yet available in OpenRouter API
54-
])
51+
// Computer use is now determined by image support
52+
// Verify that models with image support have computer use enabled
53+
const modelsWithImages = Object.entries(models)
54+
.filter(([_, model]) => model.supportsImages)
55+
.map(([id, _]) => id)
5556

56-
const expectedComputerUseModels = Array.from(OPEN_ROUTER_COMPUTER_USE_MODELS)
57-
.filter((id) => !excludedComputerUseModels.has(id))
58-
.sort()
57+
const modelsWithComputerUse = Object.entries(models)
58+
.filter(([_, model]) => model.supportsComputerUse)
59+
.map(([id, _]) => id)
5960

60-
expect(
61-
Object.entries(models)
62-
.filter(([_, model]) => model.supportsComputerUse)
63-
.map(([id, _]) => id)
64-
.sort(),
65-
).toEqual(expectedComputerUseModels)
61+
// All models with image support should have computer use enabled
62+
for (const modelId of modelsWithImages) {
63+
expect(modelsWithComputerUse).toContain(modelId)
64+
}
6665

6766
expect(
6867
Object.entries(models)
@@ -233,6 +232,7 @@ describe("OpenRouter API", () => {
233232
maxTokens: 65535,
234233
contextWindow: 1048576,
235234
supportsImages: true,
235+
supportsComputerUse: true, // Added because supportsImages is true
236236
supportsPromptCache: true,
237237
supportsReasoningBudget: true,
238238
inputPrice: 1.25,
@@ -247,6 +247,7 @@ describe("OpenRouter API", () => {
247247
maxTokens: 65536,
248248
contextWindow: 1048576,
249249
supportsImages: true,
250+
supportsComputerUse: true, // Added because supportsImages is true
250251
supportsPromptCache: true,
251252
supportsReasoningBudget: true,
252253
inputPrice: 1.25,

src/api/providers/fetchers/litellm.ts

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
import axios from "axios"
22

3-
import { LITELLM_COMPUTER_USE_MODELS } from "@roo-code/types"
4-
53
import type { ModelRecord } from "../../../shared/api"
64

75
import { DEFAULT_HEADERS } from "../constants"
@@ -33,33 +31,28 @@ export async function getLiteLLMModels(apiKey: string, baseUrl: string): Promise
3331
const response = await axios.get(url, { headers, timeout: 5000 })
3432
const models: ModelRecord = {}
3533

36-
const computerModels = Array.from(LITELLM_COMPUTER_USE_MODELS)
37-
3834
// Process the model info from the response
3935
if (response.data && response.data.data && Array.isArray(response.data.data)) {
4036
for (const model of response.data.data) {
4137
const modelName = model.model_name
4238
const modelInfo = model.model_info
43-
const litellmModelName = model?.litellm_params?.model as string | undefined
4439

45-
if (!modelName || !modelInfo || !litellmModelName) continue
40+
if (!modelName || !modelInfo) continue
4641

47-
// Use explicit supports_computer_use if available, otherwise fall back to hardcoded list
42+
// Use explicit supports_computer_use if available, otherwise use image support
4843
let supportsComputerUse: boolean
4944
if (modelInfo.supports_computer_use !== undefined) {
5045
supportsComputerUse = Boolean(modelInfo.supports_computer_use)
5146
} else {
52-
// Fallback for older LiteLLM versions that don't have supports_computer_use field
53-
supportsComputerUse = computerModels.some((computer_model) =>
54-
litellmModelName.endsWith(computer_model),
55-
)
47+
// Browser automation requires screenshot analysis, which requires image/vision capabilities
48+
// Any model that can process images can theoretically use the browser tool
49+
supportsComputerUse = Boolean(modelInfo.supports_vision)
5650
}
5751

5852
models[modelName] = {
5953
maxTokens: modelInfo.max_tokens || 8192,
6054
contextWindow: modelInfo.max_input_tokens || 200000,
6155
supportsImages: Boolean(modelInfo.supports_vision),
62-
// litellm_params.model may have a prefix like openrouter/
6356
supportsComputerUse,
6457
supportsPromptCache: Boolean(modelInfo.supports_prompt_caching),
6558
inputPrice: modelInfo.input_cost_per_token ? modelInfo.input_cost_per_token * 1000000 : undefined,

0 commit comments

Comments (0)