Commit 8a3dcfb

Update the max-token calculation in model-params to use the shared logic (#5720)
1 parent 301977b · commit 8a3dcfb

7 files changed: +96 -93 lines changed

packages/types/src/providers/groq.ts

Lines changed: 7 additions & 7 deletions
@@ -17,7 +17,7 @@ export const groqDefaultModelId: GroqModelId = "llama-3.3-70b-versatile" // Defa
 export const groqModels = {
 	// Models based on API response: https://api.groq.com/openai/v1/models
 	"llama-3.1-8b-instant": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -26,7 +26,7 @@ export const groqModels = {
 		description: "Meta Llama 3.1 8B Instant model, 128K context.",
 	},
 	"llama-3.3-70b-versatile": {
-		maxTokens: 32768,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -53,7 +53,7 @@ export const groqModels = {
 		description: "Meta Llama 4 Maverick 17B Instruct model, 128K context.",
 	},
 	"mistral-saba-24b": {
-		maxTokens: 32768,
+		maxTokens: 8192,
 		contextWindow: 32768,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -62,7 +62,7 @@ export const groqModels = {
 		description: "Mistral Saba 24B model, 32K context.",
 	},
 	"qwen-qwq-32b": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -71,7 +71,7 @@ export const groqModels = {
 		description: "Alibaba Qwen QwQ 32B model, 128K context.",
 	},
 	"qwen/qwen3-32b": {
-		maxTokens: 40960,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -80,7 +80,7 @@ export const groqModels = {
 		description: "Alibaba Qwen 3 32B model, 128K context.",
 	},
 	"deepseek-r1-distill-llama-70b": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -89,7 +89,7 @@ export const groqModels = {
 		description: "DeepSeek R1 Distill Llama 70B model, 128K context.",
 	},
 	"moonshotai/kimi-k2-instruct": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
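
The pattern above is uniform: every Groq entry's maxTokens drops from the full context window (or another oversized value) to a flat 8192, so the shared helper updated below becomes the single source of truth for the effective output cap. A minimal sketch of how one trimmed entry flows through that helper — the ModelInfo shape here is abbreviated for illustration, not the package's full type:

// Sketch only: ModelInfo abbreviated to the fields this commit touches.
interface ModelInfo {
	maxTokens?: number
	contextWindow: number
	supportsImages: boolean
	supportsPromptCache: boolean
}

const kimiK2: ModelInfo = {
	maxTokens: 8192, // was 131072, i.e. the entire context window
	contextWindow: 131072,
	supportsImages: false,
	supportsPromptCache: false,
}

// Because maxTokens is now explicit and smaller than contextWindow, the shared
// logic in src/shared/api.ts (below) returns it directly:
// getModelMaxOutputTokens({ modelId: "moonshotai/kimi-k2-instruct", model: kimiK2 }) === 8192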

src/api/transform/model-params.ts

Lines changed: 11 additions & 23 deletions
@@ -5,6 +5,7 @@ import {
 	DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS,
 	shouldUseReasoningBudget,
 	shouldUseReasoningEffort,
+	getModelMaxOutputTokens,
 } from "../../shared/api"

 import {
@@ -76,20 +77,25 @@ export function getModelParams({
 		reasoningEffort: customReasoningEffort,
 	} = settings

-	let maxTokens = model.maxTokens ?? undefined
+	// Use the centralized logic for computing maxTokens
+	const maxTokens = getModelMaxOutputTokens({
+		modelId,
+		model,
+		settings,
+		format,
+	})
+
 	let temperature = customTemperature ?? defaultTemperature
 	let reasoningBudget: ModelParams["reasoningBudget"] = undefined
 	let reasoningEffort: ModelParams["reasoningEffort"] = undefined

 	if (shouldUseReasoningBudget({ model, settings })) {
-		// If `customMaxTokens` is not specified use the default.
-		maxTokens = customMaxTokens ?? DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
-
 		// If `customMaxThinkingTokens` is not specified use the default.
 		reasoningBudget = customMaxThinkingTokens ?? DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS

 		// Reasoning cannot exceed 80% of the `maxTokens` value.
-		if (reasoningBudget > Math.floor(maxTokens * 0.8)) {
+		// maxTokens should always be defined for reasoning budget models, but add a guard just in case
+		if (maxTokens && reasoningBudget > Math.floor(maxTokens * 0.8)) {
 			reasoningBudget = Math.floor(maxTokens * 0.8)
 		}

@@ -106,24 +112,6 @@ export function getModelParams({
 		reasoningEffort = customReasoningEffort ?? model.reasoningEffort
 	}

-	// TODO: We should consolidate this logic to compute `maxTokens` with
-	// `getModelMaxOutputTokens` in order to maintain a single source of truth.
-
-	const isAnthropic = format === "anthropic" || (format === "openrouter" && modelId.startsWith("anthropic/"))
-
-	// For "Hybrid" reasoning models, we should discard the model's actual
-	// `maxTokens` value if we're not using reasoning. We do this for Anthropic
-	// models only for now. Should we do this for Gemini too?
-	if (model.supportsReasoningBudget && !reasoningBudget && isAnthropic) {
-		maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
-	}
-
-	// For Anthropic models we should always make sure a `maxTokens` value is
-	// set.
-	if (!maxTokens && isAnthropic) {
-		maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
-	}
-
 	const params: BaseModelParams = { maxTokens, temperature, reasoningEffort, reasoningBudget }

 	if (format === "anthropic") {
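
After this refactor, getModelParams no longer duplicates the Anthropic special cases: maxTokens always comes from getModelMaxOutputTokens, and the only local adjustment left is the 80% clamp on the reasoning budget. A rough sketch of that surviving clamp under simplified types (the real function takes more parameters):

// Sketch: the reasoning-budget clamp that remains in getModelParams.
// maxTokens is now `number | undefined` because it comes from getModelMaxOutputTokens.
function clampReasoningBudget(maxTokens: number | undefined, requestedBudget: number): number {
	// Reasoning cannot exceed 80% of maxTokens; maxTokens should always be
	// defined for reasoning-budget models, but guard anyway.
	if (maxTokens && requestedBudget > Math.floor(maxTokens * 0.8)) {
		return Math.floor(maxTokens * 0.8)
	}
	return requestedBudget
}

console.log(clampReasoningBudget(16384, 20000)) // 13107, i.e. floor(16384 * 0.8)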

src/shared/__tests__/api.spec.ts

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(32000)
 	})

-	test("should return 20% of context window when maxTokens is undefined", () => {
+	test("should return default of 8192 when maxTokens is undefined", () => {
 		const modelWithoutMaxTokens: ModelInfo = {
 			contextWindow: 100000,
 			supportsPromptCache: true,
@@ -88,7 +88,7 @@ describe("getModelMaxOutputTokens", () => {
 			settings: {},
 		})

-		expect(result).toBe(20000) // 20% of 100000
+		expect(result).toBe(8192)
 	})

 	test("should return ANTHROPIC_DEFAULT_MAX_TOKENS for Anthropic models that support reasoning budget but aren't using it", () => {

src/shared/api.ts

Lines changed: 22 additions & 9 deletions
@@ -58,36 +58,49 @@ export const getModelMaxOutputTokens = ({
 	modelId,
 	model,
 	settings,
+	format,
 }: {
 	modelId: string
 	model: ModelInfo
 	settings?: ProviderSettings
+	format?: "anthropic" | "openai" | "gemini" | "openrouter"
 }): number | undefined => {
 	// Check for Claude Code specific max output tokens setting
 	if (settings?.apiProvider === "claude-code") {
-		// Return the configured value or default to CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
 		return settings.claudeCodeMaxOutputTokens || CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
 	}

 	if (shouldUseReasoningBudget({ model, settings })) {
 		return settings?.modelMaxTokens || DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
 	}

-	const isAnthropicModel = modelId.includes("claude")
+	const isAnthropicContext =
+		modelId.includes("claude") ||
+		format === "anthropic" ||
+		(format === "openrouter" && modelId.startsWith("anthropic/"))

-	// For "Hybrid" reasoning models, we should discard the model's actual
-	// `maxTokens` value if we're not using reasoning. We do this for Anthropic
-	// models only for now. Should we do this for Gemini too?
-	if (model.supportsReasoningBudget && isAnthropicModel) {
+	// For "Hybrid" reasoning models, discard the model's actual maxTokens for Anthropic contexts
+	if (model.supportsReasoningBudget && isAnthropicContext) {
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If maxTokens is 0 or undefined or the full context window, fall back to 20% of context window
+	// For Anthropic contexts, always ensure a maxTokens value is set
+	if (isAnthropicContext && (!model.maxTokens || model.maxTokens === 0)) {
+		return ANTHROPIC_DEFAULT_MAX_TOKENS
+	}
+
+	// If model has explicit maxTokens and it's not the full context window, use it
 	if (model.maxTokens && model.maxTokens !== model.contextWindow) {
 		return model.maxTokens
-	} else {
-		return Math.ceil(model.contextWindow * 0.2)
 	}
+
+	// For non-Anthropic formats without explicit maxTokens, return undefined
+	if (format) {
+		return undefined
+	}
+
+	// Default fallback
+	return ANTHROPIC_DEFAULT_MAX_TOKENS
 }

 // GetModelsOptions
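
Taken together, the resolution order is now: claude-code override → reasoning-budget setting → Anthropic-context default → explicit model maxTokens (when below the context window) → undefined when a format is supplied → ANTHROPIC_DEFAULT_MAX_TOKENS (8192, per the tests above) as the final fallback. A few illustrative calls, with abbreviated model shapes and hypothetical model ids where noted:

// Illustrative expectations only, assuming ANTHROPIC_DEFAULT_MAX_TOKENS === 8192.
const base = { contextWindow: 131072, supportsImages: false, supportsPromptCache: false }

// Explicit maxTokens below the context window is returned as-is.
getModelMaxOutputTokens({ modelId: "llama-3.1-8b-instant", model: { ...base, maxTokens: 8192 } }) // 8192

// maxTokens equal to the context window is treated as unset; with no format
// given, the default fallback applies.
getModelMaxOutputTokens({ modelId: "some-model", model: { ...base, maxTokens: 131072 } }) // 8192

// With a format but no usable maxTokens, the caller now gets undefined.
getModelMaxOutputTokens({ modelId: "some-model", model: base, format: "openai" }) // undefined

// Anthropic contexts always get a value (the "anthropic/..." id is hypothetical).
getModelMaxOutputTokens({ modelId: "anthropic/claude-3.7-sonnet", model: base, format: "openrouter" }) // 8192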

webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts

Lines changed: 38 additions & 38 deletions
@@ -7,41 +7,41 @@ export {} // This makes the file a proper TypeScript module
 describe("ContextWindowProgress Logic", () => {
 	// Using the shared utility function from model-utils.ts instead of reimplementing it

-	test("calculates correct token distribution with default 20% reservation", () => {
-		const contextWindow = 4000
+	test("calculates correct token distribution with default 8192 reservation", () => {
+		const contextWindow = 10000
 		const contextTokens = 1000

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

 		// Expected calculations:
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = 4000 - 1000 - 800 = 2200
-		// total = 1000 + 800 + 2200 = 4000
-		expect(result.reservedForOutput).toBe(800)
-		expect(result.availableSize).toBe(2200)
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = 10000 - 1000 - 8192 = 808
+		// total = 1000 + 8192 + 808 = 10000
+		expect(result.reservedForOutput).toBe(8192)
+		expect(result.availableSize).toBe(808)

 		// Check percentages
-		expect(result.currentPercent).toBeCloseTo(25) // 1000/4000 * 100 = 25%
-		expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(result.availablePercent).toBeCloseTo(55) // 2200/4000 * 100 = 55%
+		expect(result.currentPercent).toBeCloseTo(10) // 1000/10000 * 100 = 10%
+		expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(result.availablePercent).toBeCloseTo(8.08) // 808/10000 * 100 = 8.08%

 		// Verify percentages sum to 100%
 		expect(result.currentPercent + result.reservedPercent + result.availablePercent).toBeCloseTo(100)
 	})

 	test("uses provided maxTokens when available instead of default calculation", () => {
-		const contextWindow = 4000
+		const contextWindow = 10000
 		const contextTokens = 1000

-		// First calculate with default 20% reservation (no maxTokens provided)
+		// First calculate with default 8192 reservation (no maxTokens provided)
 		const defaultResult = calculateTokenDistribution(contextWindow, contextTokens)

 		// Then calculate with custom maxTokens value
-		const customMaxTokens = 1500 // Custom maxTokens instead of default 20%
+		const customMaxTokens = 1500 // Custom maxTokens instead of default 8192
 		const customResult = calculateTokenDistribution(contextWindow, contextTokens, customMaxTokens)

-		// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 20% calculation
-		const defaultReserved = Math.ceil(contextWindow * 0.2) // 800 tokens (20% of 4000)
+		// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 8192 calculation
+		const defaultReserved = 8192 // ANTHROPIC_DEFAULT_MAX_TOKENS
 		expect(defaultResult.reservedForOutput).toBe(defaultReserved)
 		expect(customResult.reservedForOutput).toBe(customMaxTokens) // Should use exact provided value

@@ -51,13 +51,13 @@ describe("ContextWindowProgress Logic", () => {
 		expect(defaultTooltip).not.toBe(customTooltip)

 		// Verify the effect on available space
-		expect(customResult.availableSize).toBe(4000 - 1000 - 1500) // 1500 tokens available
-		expect(defaultResult.availableSize).toBe(4000 - 1000 - 800) // 2200 tokens available
+		expect(customResult.availableSize).toBe(10000 - 1000 - 1500) // 7500 tokens available
+		expect(defaultResult.availableSize).toBe(10000 - 1000 - 8192) // 808 tokens available

 		// Verify the effect on percentages
-		// With custom maxTokens (1500), the reserved percentage should be higher
-		expect(defaultResult.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(customResult.reservedPercent).toBeCloseTo(37.5) // 1500/4000 * 100 = 37.5%
+		// With custom maxTokens (1500), the reserved percentage should be lower than default
+		expect(defaultResult.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(customResult.reservedPercent).toBeCloseTo(15) // 1500/10000 * 100 = 15%

 		// Verify percentages still sum to 100%
 		expect(customResult.currentPercent + customResult.reservedPercent + customResult.availablePercent).toBeCloseTo(
@@ -66,19 +66,19 @@ describe("ContextWindowProgress Logic", () => {
 	})

 	test("handles negative input values", () => {
-		const contextWindow = 4000
+		const contextWindow = 10000
 		const contextTokens = -500 // Negative tokens should be handled gracefully

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

 		// Expected calculations:
 		// safeContextTokens = Math.max(0, -500) = 0
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = 4000 - 0 - 800 = 3200
-		// total = 0 + 800 + 3200 = 4000
-		expect(result.currentPercent).toBeCloseTo(0) // 0/4000 * 100 = 0%
-		expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(result.availablePercent).toBeCloseTo(80) // 3200/4000 * 100 = 80%
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = 10000 - 0 - 8192 = 1808
+		// total = 0 + 8192 + 1808 = 10000
+		expect(result.currentPercent).toBeCloseTo(0) // 0/10000 * 100 = 0%
+		expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(result.availablePercent).toBeCloseTo(18.08) // 1808/10000 * 100 = 18.08%
 	})

 	test("handles zero context window gracefully", () => {
@@ -87,9 +87,9 @@ describe("ContextWindowProgress Logic", () => {

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

-		// With zero context window, everything should be zero
-		expect(result.reservedForOutput).toBe(0)
-		expect(result.availableSize).toBe(0)
+		// With zero context window, the function uses ANTHROPIC_DEFAULT_MAX_TOKENS but available size becomes 0
+		expect(result.reservedForOutput).toBe(8192) // ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result.availableSize).toBe(0) // max(0, 0 - 1000 - 8192) = 0

 		// The percentages maintain total of 100% even with zero context window
 		// due to how the division handles this edge case
@@ -98,20 +98,20 @@ describe("ContextWindowProgress Logic", () => {
 	})

 	test("handles case where tokens exceed context window", () => {
-		const contextWindow = 4000
-		const contextTokens = 5000 // More tokens than the window size
+		const contextWindow = 10000
+		const contextTokens = 12000 // More tokens than the window size

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

 		// Expected calculations:
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = Math.max(0, 4000 - 5000 - 800) = 0
-		expect(result.reservedForOutput).toBe(800)
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = Math.max(0, 10000 - 12000 - 8192) = 0
+		expect(result.reservedForOutput).toBe(8192)
 		expect(result.availableSize).toBe(0)

-		// Percentages should be calculated based on total (5000 + 800 + 0 = 5800)
-		expect(result.currentPercent).toBeCloseTo((5000 / 5800) * 100)
-		expect(result.reservedPercent).toBeCloseTo((800 / 5800) * 100)
+		// Percentages should be calculated based on total (12000 + 8192 + 0 = 20192)
+		expect(result.currentPercent).toBeCloseTo((12000 / 20192) * 100)
+		expect(result.reservedPercent).toBeCloseTo((8192 / 20192) * 100)
 		expect(result.availablePercent).toBeCloseTo(0)

 		// Verify percentages sum to 100%
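
The calculateTokenDistribution utility these tests exercise lives in webview-ui/src/utils/model-utils.ts and is not itself part of this diff. Below is a minimal sketch consistent with the updated expectations, assuming the flat-8192 fallback; the real implementation may differ in details such as tooltip formatting:

const ANTHROPIC_DEFAULT_MAX_TOKENS = 8192

interface TokenDistribution {
	currentPercent: number
	reservedPercent: number
	availablePercent: number
	reservedForOutput: number
	availableSize: number
}

function calculateTokenDistribution(
	contextWindow: number,
	contextTokens: number,
	maxTokens?: number,
): TokenDistribution {
	// Clamp inputs so negative values behave like zero.
	const safeContextWindow = Math.max(0, contextWindow)
	const safeContextTokens = Math.max(0, contextTokens)

	// Reserve the caller-provided maxTokens, else the flat 8192 default
	// (previously 20% of the context window).
	const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS

	const availableSize = Math.max(0, safeContextWindow - safeContextTokens - reservedForOutput)

	// Percentages are taken over the sum of the three parts, so they always
	// add up to 100 — even when contextTokens exceeds the window — and the
	// total can never be zero because reservedForOutput is always positive.
	const total = safeContextTokens + reservedForOutput + availableSize
	return {
		currentPercent: (safeContextTokens / total) * 100,
		reservedPercent: (reservedForOutput / total) * 100,
		availablePercent: (availableSize / total) * 100,
		reservedForOutput,
		availableSize,
	}
}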

webview-ui/src/utils/__tests__/model-utils.spec.ts

Lines changed: 11 additions & 11 deletions
@@ -17,33 +17,33 @@ describe("calculateTokenDistribution", () => {
 		expect(Math.round(result.currentPercent + result.reservedPercent + result.availablePercent)).toBe(100)
 	})

-	it("should default to 20% of context window when maxTokens not provided", () => {
-		const contextWindow = 10000
+	it("should default to 8192 when maxTokens not provided", () => {
+		const contextWindow = 20000
 		const contextTokens = 5000

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

-		expect(result.reservedForOutput).toBe(2000) // 20% of 10000
-		expect(result.availableSize).toBe(3000) // 10000 - 5000 - 2000
+		expect(result.reservedForOutput).toBe(8192)
+		expect(result.availableSize).toBe(6808) // 20000 - 5000 - 8192
 	})

 	it("should handle negative or zero inputs by using positive fallbacks", () => {
 		const result = calculateTokenDistribution(-1000, -500)

 		expect(result.currentPercent).toBe(0)
-		expect(result.reservedPercent).toBe(0)
+		expect(result.reservedPercent).toBe(100) // 8192 / 8192 = 100%
 		expect(result.availablePercent).toBe(0)
-		expect(result.reservedForOutput).toBe(0) // With negative inputs, both context window and tokens become 0, so 20% of 0 is 0
-		expect(result.availableSize).toBe(0)
+		expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result.availableSize).toBe(0) // max(0, 0 - 0 - 8192) = 0
 	})

-	it("should handle zero total tokens without division by zero errors", () => {
-		const result = calculateTokenDistribution(0, 0, 0)
+	it("should handle zero context window without division by zero errors", () => {
+		const result = calculateTokenDistribution(0, 0)

 		expect(result.currentPercent).toBe(0)
-		expect(result.reservedPercent).toBe(0)
+		expect(result.reservedPercent).toBe(100) // When contextWindow is 0, reserved gets 100%
 		expect(result.availablePercent).toBe(0)
-		expect(result.reservedForOutput).toBe(0)
+		expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS when no maxTokens provided
 		expect(result.availableSize).toBe(0)
 	})
 })
