
Commit 15a4c4c

Merge pull request RooCodeInc#1305 from RooVetGit/cte/max-tokens-fix
Custom max tokens fix for non-thinking models
2 parents 108e978 + 8a8319d commit 15a4c4c

7 files changed, +242 -146 lines changed

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"roo-cline": patch
+---
+
+Don't honor custom max tokens for non thinking models
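
In effect, after this change the per-request max-tokens override only applies to models that support extended thinking; non-thinking models fall back to the model's own default. A minimal TypeScript sketch of that rule (the helper name resolveMaxTokens and its parameter shape are illustrative, not the actual implementation):

// Sketch only: resolve the effective max_tokens for a request.
// `info.thinking` flags models that support an extended thinking budget.
function resolveMaxTokens(info: { maxTokens?: number; thinking?: boolean }, customMaxTokens?: number): number {
	const defaultMaxTokens = info.maxTokens ?? 8192
	// Only thinking models honor the user-configured override.
	return info.thinking ? (customMaxTokens ?? defaultMaxTokens) : defaultMaxTokens
}

The anthropic.ts change below implements this rule inside getModel(), and the OpenRouter and Vertex test updates assert the same behavior.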

src/api/providers/__tests__/anthropic.test.ts

Lines changed: 28 additions & 0 deletions
@@ -194,5 +194,33 @@ describe("AnthropicHandler", () => {
 			expect(model.info.supportsImages).toBe(true)
 			expect(model.info.supportsPromptCache).toBe(true)
 		})
+
+		it("honors custom maxTokens for thinking models", () => {
+			const handler = new AnthropicHandler({
+				apiKey: "test-api-key",
+				apiModelId: "claude-3-7-sonnet-20250219:thinking",
+				modelMaxTokens: 32_768,
+				modelMaxThinkingTokens: 16_384,
+			})
+
+			const result = handler.getModel()
+			expect(result.maxTokens).toBe(32_768)
+			expect(result.thinking).toEqual({ type: "enabled", budget_tokens: 16_384 })
+			expect(result.temperature).toBe(1.0)
+		})
+
+		it("does not honor custom maxTokens for non-thinking models", () => {
+			const handler = new AnthropicHandler({
+				apiKey: "test-api-key",
+				apiModelId: "claude-3-7-sonnet-20250219",
+				modelMaxTokens: 32_768,
+				modelMaxThinkingTokens: 16_384,
+			})
+
+			const result = handler.getModel()
+			expect(result.maxTokens).toBe(16_384)
+			expect(result.thinking).toBeUndefined()
+			expect(result.temperature).toBe(0)
+		})
 	})
 })

src/api/providers/__tests__/openrouter.test.ts

Lines changed: 55 additions & 25 deletions
@@ -1,29 +1,30 @@
 // npx jest src/api/providers/__tests__/openrouter.test.ts
 
-import { OpenRouterHandler } from "../openrouter"
-import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
-import OpenAI from "openai"
 import axios from "axios"
 import { Anthropic } from "@anthropic-ai/sdk"
+import OpenAI from "openai"
+
+import { OpenRouterHandler } from "../openrouter"
+import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
 
 // Mock dependencies
 jest.mock("openai")
 jest.mock("axios")
 jest.mock("delay", () => jest.fn(() => Promise.resolve()))
 
+const mockOpenRouterModelInfo: ModelInfo = {
+	maxTokens: 1000,
+	contextWindow: 2000,
+	supportsPromptCache: true,
+	inputPrice: 0.01,
+	outputPrice: 0.02,
+}
+
 describe("OpenRouterHandler", () => {
 	const mockOptions: ApiHandlerOptions = {
 		openRouterApiKey: "test-key",
 		openRouterModelId: "test-model",
-		openRouterModelInfo: {
-			name: "Test Model",
-			description: "Test Description",
-			maxTokens: 1000,
-			contextWindow: 2000,
-			supportsPromptCache: true,
-			inputPrice: 0.01,
-			outputPrice: 0.02,
-		} as ModelInfo,
+		openRouterModelInfo: mockOpenRouterModelInfo,
 	}
 
 	beforeEach(() => {
@@ -50,6 +51,10 @@ describe("OpenRouterHandler", () => {
 		expect(result).toEqual({
 			id: mockOptions.openRouterModelId,
 			info: mockOptions.openRouterModelInfo,
+			maxTokens: 1000,
+			temperature: 0,
+			thinking: undefined,
+			topP: undefined,
 		})
 	})
 
@@ -61,6 +66,38 @@ describe("OpenRouterHandler", () => {
 		expect(result.info.supportsPromptCache).toBe(true)
 	})
 
+	test("getModel honors custom maxTokens for thinking models", () => {
+		const handler = new OpenRouterHandler({
+			openRouterApiKey: "test-key",
+			openRouterModelId: "test-model",
+			openRouterModelInfo: {
+				...mockOpenRouterModelInfo,
+				maxTokens: 64_000,
+				thinking: true,
+			},
+			modelMaxTokens: 32_768,
+			modelMaxThinkingTokens: 16_384,
+		})
+
+		const result = handler.getModel()
+		expect(result.maxTokens).toBe(32_768)
+		expect(result.thinking).toEqual({ type: "enabled", budget_tokens: 16_384 })
+		expect(result.temperature).toBe(1.0)
+	})
+
+	test("getModel does not honor custom maxTokens for non-thinking models", () => {
+		const handler = new OpenRouterHandler({
+			...mockOptions,
+			modelMaxTokens: 32_768,
+			modelMaxThinkingTokens: 16_384,
+		})
+
+		const result = handler.getModel()
+		expect(result.maxTokens).toBe(1000)
+		expect(result.thinking).toBeUndefined()
+		expect(result.temperature).toBe(0)
+	})
+
 	test("createMessage generates correct stream chunks", async () => {
 		const handler = new OpenRouterHandler(mockOptions)
 		const mockStream = {
@@ -242,15 +279,7 @@ describe("OpenRouterHandler", () => {
 
 	test("completePrompt returns correct response", async () => {
 		const handler = new OpenRouterHandler(mockOptions)
-		const mockResponse = {
-			choices: [
-				{
-					message: {
-						content: "test completion",
-					},
-				},
-			],
-		}
+		const mockResponse = { choices: [{ message: { content: "test completion" } }] }
 
 		const mockCreate = jest.fn().mockResolvedValue(mockResponse)
 		;(OpenAI as jest.MockedClass<typeof OpenAI>).prototype.chat = {
@@ -260,10 +289,13 @@ describe("OpenRouterHandler", () => {
 		const result = await handler.completePrompt("test prompt")
 
 		expect(result).toBe("test completion")
+
 		expect(mockCreate).toHaveBeenCalledWith({
 			model: mockOptions.openRouterModelId,
-			messages: [{ role: "user", content: "test prompt" }],
+			max_tokens: 1000,
+			thinking: undefined,
 			temperature: 0,
+			messages: [{ role: "user", content: "test prompt" }],
 			stream: false,
 		})
 	})
@@ -292,8 +324,6 @@ describe("OpenRouterHandler", () => {
 			completions: { create: mockCreate },
 		} as any
 
-		await expect(handler.completePrompt("test prompt")).rejects.toThrow(
-			"OpenRouter completion error: Unexpected error",
-		)
+		await expect(handler.completePrompt("test prompt")).rejects.toThrow("Unexpected error")
 	})
 })

src/api/providers/__tests__/vertex.test.ts

Lines changed: 28 additions & 0 deletions
@@ -890,6 +890,34 @@ describe("VertexHandler", () => {
 			expect(modelInfo.info.maxTokens).toBe(8192)
 			expect(modelInfo.info.contextWindow).toBe(1048576)
 		})
+
+		it("honors custom maxTokens for thinking models", () => {
+			const handler = new VertexHandler({
+				apiKey: "test-api-key",
+				apiModelId: "claude-3-7-sonnet@20250219:thinking",
+				modelMaxTokens: 32_768,
+				modelMaxThinkingTokens: 16_384,
+			})
+
+			const result = handler.getModel()
+			expect(result.maxTokens).toBe(32_768)
+			expect(result.thinking).toEqual({ type: "enabled", budget_tokens: 16_384 })
+			expect(result.temperature).toBe(1.0)
+		})
+
+		it("does not honor custom maxTokens for non-thinking models", () => {
+			const handler = new VertexHandler({
+				apiKey: "test-api-key",
+				apiModelId: "claude-3-7-sonnet@20250219",
+				modelMaxTokens: 32_768,
+				modelMaxThinkingTokens: 16_384,
+			})
+
+			const result = handler.getModel()
+			expect(result.maxTokens).toBe(16_384)
+			expect(result.thinking).toBeUndefined()
+			expect(result.temperature).toBe(0)
+		})
 	})
 
 	describe("thinking model configuration", () => {

src/api/providers/anthropic.ts

Lines changed: 30 additions & 35 deletions
@@ -12,8 +12,6 @@ import {
 import { ApiHandler, SingleCompletionHandler } from "../index"
 import { ApiStream } from "../transform/stream"
 
-const ANTHROPIC_DEFAULT_TEMPERATURE = 0
-
 export class AnthropicHandler implements ApiHandler, SingleCompletionHandler {
 	private options: ApiHandlerOptions
 	private client: Anthropic
@@ -30,7 +28,7 @@ export class AnthropicHandler implements ApiHandler, SingleCompletionHandler {
 	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		let stream: AnthropicStream<Anthropic.Messages.RawMessageStreamEvent>
 		const cacheControl: CacheControlEphemeral = { type: "ephemeral" }
-		let { id: modelId, temperature, maxTokens, thinking } = this.getModel()
+		let { id: modelId, maxTokens, thinking, temperature } = this.getModel()
 
 		switch (modelId) {
 			case "claude-3-7-sonnet-20250219":
@@ -182,55 +180,52 @@ export class AnthropicHandler implements ApiHandler, SingleCompletionHandler {
 
 	getModel() {
 		const modelId = this.options.apiModelId
-		let temperature = this.options.modelTemperature ?? ANTHROPIC_DEFAULT_TEMPERATURE
-		let thinking: BetaThinkingConfigParam | undefined = undefined
 
-		if (modelId && modelId in anthropicModels) {
-			let id = modelId as AnthropicModelId
-			const info: ModelInfo = anthropicModels[id]
+		const {
+			modelMaxTokens: customMaxTokens,
+			modelMaxThinkingTokens: customMaxThinkingTokens,
+			modelTemperature: customTemperature,
+		} = this.options
 
-			// The `:thinking` variant is a virtual identifier for the
-			// `claude-3-7-sonnet-20250219` model with a thinking budget.
-			// We can handle this more elegantly in the future.
-			if (id === "claude-3-7-sonnet-20250219:thinking") {
-				id = "claude-3-7-sonnet-20250219"
-			}
+		let id = modelId && modelId in anthropicModels ? (modelId as AnthropicModelId) : anthropicDefaultModelId
+		const info: ModelInfo = anthropicModels[id]
 
-			const maxTokens = this.options.modelMaxTokens || info.maxTokens || 8192
+		// The `:thinking` variant is a virtual identifier for the
+		// `claude-3-7-sonnet-20250219` model with a thinking budget.
+		// We can handle this more elegantly in the future.
+		if (id === "claude-3-7-sonnet-20250219:thinking") {
+			id = "claude-3-7-sonnet-20250219"
+		}
 
-			if (info.thinking) {
-				// Anthropic "Thinking" models require a temperature of 1.0.
-				temperature = 1.0
+		let maxTokens = info.maxTokens ?? 8192
+		let thinking: BetaThinkingConfigParam | undefined = undefined
+		let temperature = customTemperature ?? 0
 
-				// Clamp the thinking budget to be at most 80% of max tokens and at
-				// least 1024 tokens.
-				const maxBudgetTokens = Math.floor(maxTokens * 0.8)
-				const budgetTokens = Math.max(
-					Math.min(this.options.modelMaxThinkingTokens ?? maxBudgetTokens, maxBudgetTokens),
-					1024,
-				)
+		if (info.thinking) {
+			// Only honor `customMaxTokens` for thinking models.
+			maxTokens = customMaxTokens ?? maxTokens
 
-				thinking = { type: "enabled", budget_tokens: budgetTokens }
-			}
+			// Clamp the thinking budget to be at most 80% of max tokens and at
+			// least 1024 tokens.
+			const maxBudgetTokens = Math.floor(maxTokens * 0.8)
+			const budgetTokens = Math.max(Math.min(customMaxThinkingTokens ?? maxBudgetTokens, maxBudgetTokens), 1024)
+			thinking = { type: "enabled", budget_tokens: budgetTokens }
 
-			return { id, info, temperature, maxTokens, thinking }
+			// Anthropic "Thinking" models require a temperature of 1.0.
+			temperature = 1.0
 		}
 
-		const id = anthropicDefaultModelId
-		const info: ModelInfo = anthropicModels[id]
-		const maxTokens = this.options.modelMaxTokens || info.maxTokens || 8192
-
-		return { id, info, temperature, maxTokens, thinking }
+		return { id, info, maxTokens, thinking, temperature }
	}
 
 	async completePrompt(prompt: string) {
-		let { id: modelId, temperature, maxTokens, thinking } = this.getModel()
+		let { id: modelId, maxTokens, thinking, temperature } = this.getModel()
 
 		const message = await this.client.messages.create({
 			model: modelId,
 			max_tokens: maxTokens,
-			temperature,
 			thinking,
+			temperature,
 			messages: [{ role: "user", content: prompt }],
 			stream: false,
 		})
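
For the thinking path, the budget clamp in getModel() keeps budget_tokens between 1024 and 80% of maxTokens. A small worked sketch using the values from the tests (the standalone helper clampThinkingBudget is illustrative only; the real code inlines this logic):

// Sketch: clamp the thinking budget to [1024, 80% of maxTokens].
function clampThinkingBudget(maxTokens: number, customBudget?: number): number {
	const maxBudgetTokens = Math.floor(maxTokens * 0.8) // e.g. Math.floor(32_768 * 0.8) === 26_214
	return Math.max(Math.min(customBudget ?? maxBudgetTokens, maxBudgetTokens), 1024)
}

// With the test values, clampThinkingBudget(32_768, 16_384) === 16_384 (since 16_384 < 26_214),
// matching the expected thinking config { type: "enabled", budget_tokens: 16_384 }.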
