
Commit c52fdc4

Clamp default model max tokens to 20% of context window (RooCodeInc#6761)
1 parent 2b647ed commit c52fdc4

4 files changed: +80, −14 lines


src/api/providers/__tests__/openrouter.spec.ts

Lines changed: 5 additions & 3 deletions
@@ -98,9 +98,11 @@ describe("OpenRouterHandler", () => {
 		})

 		const result = await handler.fetchModel()
-		expect(result.maxTokens).toBe(128000) // Use actual implementation value
-		expect(result.reasoningBudget).toBeUndefined() // Use actual implementation value
-		expect(result.temperature).toBe(0) // Use actual implementation value
+		// With the new clamping logic, 128000 tokens (64% of 200000 context window)
+		// gets clamped to 20% of context window: 200000 * 0.2 = 40000
+		expect(result.maxTokens).toBe(40000)
+		expect(result.reasoningBudget).toBeUndefined()
+		expect(result.temperature).toBe(0)
 	})

 	it("does not honor custom maxTokens for non-thinking models", async () => {

src/api/transform/__tests__/model-params.spec.ts

Lines changed: 6 additions & 5 deletions
@@ -293,12 +293,12 @@ describe("getModelParams", () => {
 	it("should not honor customMaxThinkingTokens for non-reasoning budget models", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 4000,
+			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 20% threshold
 		}

 		expect(getModelParams({ ...anthropicParams, settings: { modelMaxThinkingTokens: 1500 }, model })).toEqual({
 			format: anthropicParams.format,
-			maxTokens: 4000,
+			maxTokens: 3000, // Uses model.maxTokens since it's within 20% threshold
 			temperature: 0, // Using default temperature.
 			reasoningEffort: undefined,
 			reasoningBudget: undefined, // Should remain undefined despite customMaxThinkingTokens being set.
@@ -565,7 +565,7 @@ describe("getModelParams", () => {
 	it("should use reasoningEffort if supportsReasoningEffort is false but reasoningEffort is set", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 8000,
+			maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 20% threshold
 			supportsReasoningEffort: false,
 			reasoningEffort: "medium",
 		}
@@ -576,7 +576,7 @@ describe("getModelParams", () => {
 			model,
 		})

-		expect(result.maxTokens).toBe(8000)
+		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 20% threshold
 		expect(result.reasoningEffort).toBe("medium")
 	})
 })
@@ -595,7 +595,8 @@ describe("getModelParams", () => {
 			model,
 		})

-		// Should discard model's maxTokens and use default
+		// For hybrid models (supportsReasoningBudget) in Anthropic contexts,
+		// should discard model's maxTokens and use ANTHROPIC_DEFAULT_MAX_TOKENS
 		expect(result.maxTokens).toBe(ANTHROPIC_DEFAULT_MAX_TOKENS)
 		expect(result.reasoningBudget).toBeUndefined()
 	})

src/shared/__tests__/api.spec.ts

Lines changed: 66 additions & 3 deletions
@@ -25,11 +25,13 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(16384)
 	})

-	test("should return model maxTokens when not using claude-code provider", () => {
+	test("should return model maxTokens when not using claude-code provider and maxTokens is within 20% of context window", () => {
 		const settings: ProviderSettings = {
 			apiProvider: "anthropic",
 		}

+		// mockModel has maxTokens: 8192 and contextWindow: 200000
+		// 8192 is 4.096% of 200000, which is <= 20%, so it should use model.maxTokens
 		const result = getModelMaxOutputTokens({
 			modelId: "claude-3-5-sonnet-20241022",
 			model: mockModel,
@@ -115,7 +117,7 @@ describe("getModelMaxOutputTokens", () => {
 			contextWindow: 1_048_576,
 			supportsPromptCache: false,
 			supportsReasoningBudget: true,
-			maxTokens: 65_535,
+			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 20%
 		}

 		const settings: ProviderSettings = {
@@ -124,7 +126,68 @@ describe("getModelMaxOutputTokens", () => {
 		}

 		const result = getModelMaxOutputTokens({ modelId: geminiModelId, model, settings })
-		expect(result).toBe(65_535) // Should use model.maxTokens, not ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 20% threshold
+	})
+
+	test("should clamp maxTokens to 20% of context window when maxTokens exceeds threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "gpt-4",
+			model,
+			settings,
+			format: "openai",
+		})
+		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
+		expect(result).toBe(20_000)
+	})
+
+	test("should clamp maxTokens to 20% of context window for Anthropic models when maxTokens exceeds threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: true,
+			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "anthropic",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "claude-3-5-sonnet-20241022",
+			model,
+			settings,
+		})
+		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
+		expect(result).toBe(20_000)
+	})
+
+	test("should use model.maxTokens when exactly at 20% threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 20_000, // Exactly 20% of context window
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "gpt-4",
+			model,
+			settings,
+			format: "openai",
+		})
+		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
 	})

 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {

src/shared/api.ts

Lines changed: 3 additions & 3 deletions
@@ -90,9 +90,9 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If model has explicit maxTokens and it's not the full context window, use it
-	if (model.maxTokens && model.maxTokens !== model.contextWindow) {
-		return model.maxTokens
+	// If model has explicit maxTokens, clamp it to 20% of the context window
+	if (model.maxTokens) {
+		return Math.min(model.maxTokens, model.contextWindow * 0.2)
 	}

 	// For non-Anthropic formats without explicit maxTokens, return undefined
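Taken together with the tests above, the new behavior can be summarized in a small self-contained sketch (a simplified reconstruction for illustration; the real getModelMaxOutputTokens also handles the claude-code provider, reasoning budgets, and format-specific fallbacks that this hunk does not show):

	// Sketch of the clamping rule from this commit, isolated from the rest of
	// getModelMaxOutputTokens. Returns undefined when no explicit maxTokens is
	// set, mirroring the non-Anthropic fallback noted in the hunk above.
	function clampedMaxOutputTokens(maxTokens: number | undefined, contextWindow: number): number | undefined {
		if (!maxTokens) return undefined
		return Math.min(maxTokens, contextWindow * 0.2)
	}

	console.log(clampedMaxOutputTokens(50_000, 100_000)) // 20_000: clamped to the 20% ceiling
	console.log(clampedMaxOutputTokens(20_000, 100_000)) // 20_000: exactly at the threshold, used as-is
	console.log(clampedMaxOutputTokens(8_192, 200_000)) // 8_192: well under 20%, unchanged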
