Commit 8a3dcfb

Update the max-token calculation in model-params to use the shared logic (#5720)
1 parent 301977b · commit 8a3dcfb

7 files changed: +96 -93 lines changed

packages/types/src/providers/groq.ts

Lines changed: 7 additions & 7 deletions
@@ -17,7 +17,7 @@ export const groqDefaultModelId: GroqModelId = "llama-3.3-70b-versatile" // Defa
 export const groqModels = {
 	// Models based on API response: https://api.groq.com/openai/v1/models
 	"llama-3.1-8b-instant": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -26,7 +26,7 @@ export const groqModels = {
 		description: "Meta Llama 3.1 8B Instant model, 128K context.",
 	},
 	"llama-3.3-70b-versatile": {
-		maxTokens: 32768,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -53,7 +53,7 @@ export const groqModels = {
 		description: "Meta Llama 4 Maverick 17B Instruct model, 128K context.",
 	},
 	"mistral-saba-24b": {
-		maxTokens: 32768,
+		maxTokens: 8192,
 		contextWindow: 32768,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -62,7 +62,7 @@ export const groqModels = {
 		description: "Mistral Saba 24B model, 32K context.",
 	},
 	"qwen-qwq-32b": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -71,7 +71,7 @@ export const groqModels = {
 		description: "Alibaba Qwen QwQ 32B model, 128K context.",
 	},
 	"qwen/qwen3-32b": {
-		maxTokens: 40960,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -80,7 +80,7 @@ export const groqModels = {
 		description: "Alibaba Qwen 3 32B model, 128K context.",
 	},
 	"deepseek-r1-distill-llama-70b": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -89,7 +89,7 @@ export const groqModels = {
 		description: "DeepSeek R1 Distill Llama 70B model, 128K context.",
 	},
 	"moonshotai/kimi-k2-instruct": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
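
The pattern above is uniform: every Groq entry's maxTokens drops from the full context window (or another oversized value) to a flat 8192, so the shared helper updated below becomes the single source of truth for the effective output cap. A minimal sketch of how one trimmed entry flows through that helper — the ModelInfo shape here is abbreviated for illustration, not the package's full type:

// Sketch only: ModelInfo abbreviated to the fields this commit touches.
interface ModelInfo {
	maxTokens?: number
	contextWindow: number
	supportsImages: boolean
	supportsPromptCache: boolean
}

const kimiK2: ModelInfo = {
	maxTokens: 8192, // was 131072, i.e. the entire context window
	contextWindow: 131072,
	supportsImages: false,
	supportsPromptCache: false,
}

// Because maxTokens is now explicit and smaller than contextWindow, the shared
// logic in src/shared/api.ts (below) returns it directly:
// getModelMaxOutputTokens({ modelId: "moonshotai/kimi-k2-instruct", model: kimiK2 }) === 8192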

src/api/transform/model-params.ts

Lines changed: 11 additions & 23 deletions
@@ -5,6 +5,7 @@ import {
 	DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS,
 	shouldUseReasoningBudget,
 	shouldUseReasoningEffort,
+	getModelMaxOutputTokens,
 } from "../../shared/api"

 import {
@@ -76,20 +77,25 @@ export function getModelParams({
 		reasoningEffort: customReasoningEffort,
 	} = settings

-	let maxTokens = model.maxTokens ?? undefined
+	// Use the centralized logic for computing maxTokens
+	const maxTokens = getModelMaxOutputTokens({
+		modelId,
+		model,
+		settings,
+		format,
+	})
+
 	let temperature = customTemperature ?? defaultTemperature
 	let reasoningBudget: ModelParams["reasoningBudget"] = undefined
 	let reasoningEffort: ModelParams["reasoningEffort"] = undefined

 	if (shouldUseReasoningBudget({ model, settings })) {
-		// If `customMaxTokens` is not specified use the default.
-		maxTokens = customMaxTokens ?? DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
-
 		// If `customMaxThinkingTokens` is not specified use the default.
 		reasoningBudget = customMaxThinkingTokens ?? DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS

 		// Reasoning cannot exceed 80% of the `maxTokens` value.
-		if (reasoningBudget > Math.floor(maxTokens * 0.8)) {
+		// maxTokens should always be defined for reasoning budget models, but add a guard just in case
+		if (maxTokens && reasoningBudget > Math.floor(maxTokens * 0.8)) {
 			reasoningBudget = Math.floor(maxTokens * 0.8)
 		}

@@ -106,24 +112,6 @@ export function getModelParams({
 		reasoningEffort = customReasoningEffort ?? model.reasoningEffort
 	}

-	// TODO: We should consolidate this logic to compute `maxTokens` with
-	// `getModelMaxOutputTokens` in order to maintain a single source of truth.
-
-	const isAnthropic = format === "anthropic" || (format === "openrouter" && modelId.startsWith("anthropic/"))
-
-	// For "Hybrid" reasoning models, we should discard the model's actual
-	// `maxTokens` value if we're not using reasoning. We do this for Anthropic
-	// models only for now. Should we do this for Gemini too?
-	if (model.supportsReasoningBudget && !reasoningBudget && isAnthropic) {
-		maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
-	}
-
-	// For Anthropic models we should always make sure a `maxTokens` value is
-	// set.
-	if (!maxTokens && isAnthropic) {
-		maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
-	}
-
 	const params: BaseModelParams = { maxTokens, temperature, reasoningEffort, reasoningBudget }

 	if (format === "anthropic") {
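
After this refactor, getModelParams no longer duplicates the Anthropic special cases: maxTokens always comes from getModelMaxOutputTokens, and the only local adjustment left is the 80% clamp on the reasoning budget. A rough sketch of that surviving clamp under simplified types (the real function takes more parameters):

// Sketch: the reasoning-budget clamp that remains in getModelParams.
// maxTokens is now `number | undefined` because it comes from getModelMaxOutputTokens.
function clampReasoningBudget(maxTokens: number | undefined, requestedBudget: number): number {
	// Reasoning cannot exceed 80% of maxTokens; maxTokens should always be
	// defined for reasoning-budget models, but guard anyway.
	if (maxTokens && requestedBudget > Math.floor(maxTokens * 0.8)) {
		return Math.floor(maxTokens * 0.8)
	}
	return requestedBudget
}

console.log(clampReasoningBudget(16384, 20000)) // 13107, i.e. floor(16384 * 0.8)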

src/shared/__tests__/api.spec.ts

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(32000)
 	})

-	test("should return 20% of context window when maxTokens is undefined", () => {
+	test("should return default of 8192 when maxTokens is undefined", () => {
 		const modelWithoutMaxTokens: ModelInfo = {
 			contextWindow: 100000,
 			supportsPromptCache: true,
@@ -88,7 +88,7 @@ describe("getModelMaxOutputTokens", () => {
 			settings: {},
 		})

-		expect(result).toBe(20000) // 20% of 100000
+		expect(result).toBe(8192)
 	})

 	test("should return ANTHROPIC_DEFAULT_MAX_TOKENS for Anthropic models that support reasoning budget but aren't using it", () => {

src/shared/api.ts

Lines changed: 22 additions & 9 deletions
@@ -58,36 +58,49 @@ export const getModelMaxOutputTokens = ({
 	modelId,
 	model,
 	settings,
+	format,
 }: {
 	modelId: string
 	model: ModelInfo
 	settings?: ProviderSettings
+	format?: "anthropic" | "openai" | "gemini" | "openrouter"
 }): number | undefined => {
 	// Check for Claude Code specific max output tokens setting
 	if (settings?.apiProvider === "claude-code") {
-		// Return the configured value or default to CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
 		return settings.claudeCodeMaxOutputTokens || CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
 	}

 	if (shouldUseReasoningBudget({ model, settings })) {
 		return settings?.modelMaxTokens || DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
 	}

-	const isAnthropicModel = modelId.includes("claude")
+	const isAnthropicContext =
+		modelId.includes("claude") ||
+		format === "anthropic" ||
+		(format === "openrouter" && modelId.startsWith("anthropic/"))

-	// For "Hybrid" reasoning models, we should discard the model's actual
-	// `maxTokens` value if we're not using reasoning. We do this for Anthropic
-	// models only for now. Should we do this for Gemini too?
-	if (model.supportsReasoningBudget && isAnthropicModel) {
+	// For "Hybrid" reasoning models, discard the model's actual maxTokens for Anthropic contexts
+	if (model.supportsReasoningBudget && isAnthropicContext) {
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If maxTokens is 0 or undefined or the full context window, fall back to 20% of context window
+	// For Anthropic contexts, always ensure a maxTokens value is set
+	if (isAnthropicContext && (!model.maxTokens || model.maxTokens === 0)) {
+		return ANTHROPIC_DEFAULT_MAX_TOKENS
+	}
+
+	// If model has explicit maxTokens and it's not the full context window, use it
 	if (model.maxTokens && model.maxTokens !== model.contextWindow) {
 		return model.maxTokens
-	} else {
-		return Math.ceil(model.contextWindow * 0.2)
 	}
+
+	// For non-Anthropic formats without explicit maxTokens, return undefined
+	if (format) {
+		return undefined
+	}
+
+	// Default fallback
+	return ANTHROPIC_DEFAULT_MAX_TOKENS
 }

 // GetModelsOptions
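
Taken together, the resolution order is now: claude-code override → reasoning-budget setting → Anthropic-context default → explicit model maxTokens (when below the context window) → undefined when a format is supplied → ANTHROPIC_DEFAULT_MAX_TOKENS (8192, per the tests above) as the final fallback. A few illustrative calls, with abbreviated model shapes and hypothetical model ids where noted:

// Illustrative expectations only, assuming ANTHROPIC_DEFAULT_MAX_TOKENS === 8192.
const base = { contextWindow: 131072, supportsImages: false, supportsPromptCache: false }

// Explicit maxTokens below the context window is returned as-is.
getModelMaxOutputTokens({ modelId: "llama-3.1-8b-instant", model: { ...base, maxTokens: 8192 } }) // 8192

// maxTokens equal to the context window is treated as unset; with no format
// given, the default fallback applies.
getModelMaxOutputTokens({ modelId: "some-model", model: { ...base, maxTokens: 131072 } }) // 8192

// With a format but no usable maxTokens, the caller now gets undefined.
getModelMaxOutputTokens({ modelId: "some-model", model: base, format: "openai" }) // undefined

// Anthropic contexts always get a value (the "anthropic/..." id is hypothetical).
getModelMaxOutputTokens({ modelId: "anthropic/claude-3.7-sonnet", model: base, format: "openrouter" }) // 8192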

webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts

Lines changed: 38 additions & 38 deletions
@@ -7,41 +7,41 @@ export {} // This makes the file a proper TypeScript module
 describe("ContextWindowProgress Logic", () => {
 	// Using the shared utility function from model-utils.ts instead of reimplementing it

-	test("calculates correct token distribution with default 20% reservation", () => {
-		const contextWindow = 4000
+	test("calculates correct token distribution with default 8192 reservation", () => {
+		const contextWindow = 10000
 		const contextTokens = 1000

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

 		// Expected calculations:
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = 4000 - 1000 - 800 = 2200
-		// total = 1000 + 800 + 2200 = 4000
-		expect(result.reservedForOutput).toBe(800)
-		expect(result.availableSize).toBe(2200)
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = 10000 - 1000 - 8192 = 808
+		// total = 1000 + 8192 + 808 = 10000
+		expect(result.reservedForOutput).toBe(8192)
+		expect(result.availableSize).toBe(808)

 		// Check percentages
-		expect(result.currentPercent).toBeCloseTo(25) // 1000/4000 * 100 = 25%
-		expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(result.availablePercent).toBeCloseTo(55) // 2200/4000 * 100 = 55%
+		expect(result.currentPercent).toBeCloseTo(10) // 1000/10000 * 100 = 10%
+		expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(result.availablePercent).toBeCloseTo(8.08) // 808/10000 * 100 = 8.08%

 		// Verify percentages sum to 100%
 		expect(result.currentPercent + result.reservedPercent + result.availablePercent).toBeCloseTo(100)
 	})

 	test("uses provided maxTokens when available instead of default calculation", () => {
-		const contextWindow = 4000
+		const contextWindow = 10000
 		const contextTokens = 1000

-		// First calculate with default 20% reservation (no maxTokens provided)
+		// First calculate with default 8192 reservation (no maxTokens provided)
 		const defaultResult = calculateTokenDistribution(contextWindow, contextTokens)

 		// Then calculate with custom maxTokens value
-		const customMaxTokens = 1500 // Custom maxTokens instead of default 20%
+		const customMaxTokens = 1500 // Custom maxTokens instead of default 8192
 		const customResult = calculateTokenDistribution(contextWindow, contextTokens, customMaxTokens)

-		// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 20% calculation
-		const defaultReserved = Math.ceil(contextWindow * 0.2) // 800 tokens (20% of 4000)
+		// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 8192 calculation
+		const defaultReserved = 8192 // ANTHROPIC_DEFAULT_MAX_TOKENS
 		expect(defaultResult.reservedForOutput).toBe(defaultReserved)
 		expect(customResult.reservedForOutput).toBe(customMaxTokens) // Should use exact provided value

@@ -51,13 +51,13 @@ describe("ContextWindowProgress Logic", () => {
 		expect(defaultTooltip).not.toBe(customTooltip)

 		// Verify the effect on available space
-		expect(customResult.availableSize).toBe(4000 - 1000 - 1500) // 1500 tokens available
-		expect(defaultResult.availableSize).toBe(4000 - 1000 - 800) // 2200 tokens available
+		expect(customResult.availableSize).toBe(10000 - 1000 - 1500) // 7500 tokens available
+		expect(defaultResult.availableSize).toBe(10000 - 1000 - 8192) // 808 tokens available

 		// Verify the effect on percentages
-		// With custom maxTokens (1500), the reserved percentage should be higher
-		expect(defaultResult.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(customResult.reservedPercent).toBeCloseTo(37.5) // 1500/4000 * 100 = 37.5%
+		// With custom maxTokens (1500), the reserved percentage should be lower than default
+		expect(defaultResult.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(customResult.reservedPercent).toBeCloseTo(15) // 1500/10000 * 100 = 15%

 		// Verify percentages still sum to 100%
 		expect(customResult.currentPercent + customResult.reservedPercent + customResult.availablePercent).toBeCloseTo(
@@ -66,19 +66,19 @@ describe("ContextWindowProgress Logic", () => {
 	})

 	test("handles negative input values", () => {
-		const contextWindow = 4000
+		const contextWindow = 10000
 		const contextTokens = -500 // Negative tokens should be handled gracefully

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

 		// Expected calculations:
 		// safeContextTokens = Math.max(0, -500) = 0
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = 4000 - 0 - 800 = 3200
-		// total = 0 + 800 + 3200 = 4000
-		expect(result.currentPercent).toBeCloseTo(0) // 0/4000 * 100 = 0%
-		expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(result.availablePercent).toBeCloseTo(80) // 3200/4000 * 100 = 80%
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = 10000 - 0 - 8192 = 1808
+		// total = 0 + 8192 + 1808 = 10000
+		expect(result.currentPercent).toBeCloseTo(0) // 0/10000 * 100 = 0%
+		expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(result.availablePercent).toBeCloseTo(18.08) // 1808/10000 * 100 = 18.08%
 	})

 	test("handles zero context window gracefully", () => {
@@ -87,9 +87,9 @@ describe("ContextWindowProgress Logic", () => {

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

-		// With zero context window, everything should be zero
-		expect(result.reservedForOutput).toBe(0)
-		expect(result.availableSize).toBe(0)
+		// With zero context window, the function uses ANTHROPIC_DEFAULT_MAX_TOKENS but available size becomes 0
+		expect(result.reservedForOutput).toBe(8192) // ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result.availableSize).toBe(0) // max(0, 0 - 1000 - 8192) = 0

 		// The percentages maintain total of 100% even with zero context window
 		// due to how the division handles this edge case
@@ -98,20 +98,20 @@ describe("ContextWindowProgress Logic", () => {
 	})

 	test("handles case where tokens exceed context window", () => {
-		const contextWindow = 4000
-		const contextTokens = 5000 // More tokens than the window size
+		const contextWindow = 10000
+		const contextTokens = 12000 // More tokens than the window size

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

 		// Expected calculations:
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = Math.max(0, 4000 - 5000 - 800) = 0
-		expect(result.reservedForOutput).toBe(800)
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = Math.max(0, 10000 - 12000 - 8192) = 0
+		expect(result.reservedForOutput).toBe(8192)
 		expect(result.availableSize).toBe(0)

-		// Percentages should be calculated based on total (5000 + 800 + 0 = 5800)
-		expect(result.currentPercent).toBeCloseTo((5000 / 5800) * 100)
-		expect(result.reservedPercent).toBeCloseTo((800 / 5800) * 100)
+		// Percentages should be calculated based on total (12000 + 8192 + 0 = 20192)
+		expect(result.currentPercent).toBeCloseTo((12000 / 20192) * 100)
+		expect(result.reservedPercent).toBeCloseTo((8192 / 20192) * 100)
 		expect(result.availablePercent).toBeCloseTo(0)

 		// Verify percentages sum to 100%
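
The calculateTokenDistribution utility these tests exercise lives in webview-ui/src/utils/model-utils.ts and is not itself part of this diff. Below is a minimal sketch consistent with the updated expectations, assuming the flat-8192 fallback; the real implementation may differ in details such as tooltip formatting:

const ANTHROPIC_DEFAULT_MAX_TOKENS = 8192

interface TokenDistribution {
	currentPercent: number
	reservedPercent: number
	availablePercent: number
	reservedForOutput: number
	availableSize: number
}

function calculateTokenDistribution(
	contextWindow: number,
	contextTokens: number,
	maxTokens?: number,
): TokenDistribution {
	// Clamp inputs so negative values behave like zero.
	const safeContextWindow = Math.max(0, contextWindow)
	const safeContextTokens = Math.max(0, contextTokens)

	// Reserve the caller-provided maxTokens, else the flat 8192 default
	// (previously 20% of the context window).
	const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS

	const availableSize = Math.max(0, safeContextWindow - safeContextTokens - reservedForOutput)

	// Percentages are taken over the sum of the three parts, so they always
	// add up to 100 — even when contextTokens exceeds the window — and the
	// total can never be zero because reservedForOutput is always positive.
	const total = safeContextTokens + reservedForOutput + availableSize
	return {
		currentPercent: (safeContextTokens / total) * 100,
		reservedPercent: (reservedForOutput / total) * 100,
		availablePercent: (availableSize / total) * 100,
		reservedForOutput,
		availableSize,
	}
}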

webview-ui/src/utils/__tests__/model-utils.spec.ts

Lines changed: 11 additions & 11 deletions
@@ -17,33 +17,33 @@ describe("calculateTokenDistribution", () => {
 		expect(Math.round(result.currentPercent + result.reservedPercent + result.availablePercent)).toBe(100)
 	})

-	it("should default to 20% of context window when maxTokens not provided", () => {
-		const contextWindow = 10000
+	it("should default to 8192 when maxTokens not provided", () => {
+		const contextWindow = 20000
 		const contextTokens = 5000

 		const result = calculateTokenDistribution(contextWindow, contextTokens)

-		expect(result.reservedForOutput).toBe(2000) // 20% of 10000
-		expect(result.availableSize).toBe(3000) // 10000 - 5000 - 2000
+		expect(result.reservedForOutput).toBe(8192)
+		expect(result.availableSize).toBe(6808) // 20000 - 5000 - 8192
 	})

 	it("should handle negative or zero inputs by using positive fallbacks", () => {
 		const result = calculateTokenDistribution(-1000, -500)

 		expect(result.currentPercent).toBe(0)
-		expect(result.reservedPercent).toBe(0)
+		expect(result.reservedPercent).toBe(100) // 8192 / 8192 = 100%
 		expect(result.availablePercent).toBe(0)
-		expect(result.reservedForOutput).toBe(0) // With negative inputs, both context window and tokens become 0, so 20% of 0 is 0
-		expect(result.availableSize).toBe(0)
+		expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result.availableSize).toBe(0) // max(0, 0 - 0 - 8192) = 0
 	})

-	it("should handle zero total tokens without division by zero errors", () => {
-		const result = calculateTokenDistribution(0, 0, 0)
+	it("should handle zero context window without division by zero errors", () => {
+		const result = calculateTokenDistribution(0, 0)

 		expect(result.currentPercent).toBe(0)
-		expect(result.reservedPercent).toBe(0)
+		expect(result.reservedPercent).toBe(100) // When contextWindow is 0, reserved gets 100%
 		expect(result.availablePercent).toBe(0)
-		expect(result.reservedForOutput).toBe(0)
+		expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS when no maxTokens provided
 		expect(result.availableSize).toBe(0)
 	})
 })
