
Commit 1fb46fc

fix: adjust token clamping threshold from 20% to 80% for GLM-4.5 compatibility
The previous 20% clamping threshold was too restrictive for models like GLM-4.5 that have legitimately high output-token requirements (98,304 output tokens out of a 131,072-token context window = 75%). Clamping is now applied only when maxTokens exceeds 80% of the context window, which still prevents a model from using the entire context for output while allowing models with high output requirements to function properly.

Fixes #6806
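As a quick check of the arithmetic above, here is a standalone sketch of the old and new rules (the variable names are illustrative only; the actual change is in getModelMaxOutputTokens, shown in the src/shared/api.ts diff below):

// Worked example using the GLM-4.5 numbers from the commit message.
const contextWindow = 131_072
const requestedOutput = 98_304 // 75% of the context window

// Old rule: always clamp maxTokens to 20% of the context window.
const oldCap = Math.min(requestedOutput, contextWindow * 0.2)
console.log(oldCap) // 26214.4 — far below what GLM-4.5 needs

// New rule: clamp only when maxTokens exceeds 80% of the context window.
const newCap =
	requestedOutput > contextWindow * 0.8
		? Math.floor(contextWindow * 0.8)
		: requestedOutput
console.log(newCap) // 98304 — 75% is under the 80% threshold, so it is left alone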
1 parent 15b0f50

4 files changed: +30 -23 lines changed


src/api/providers/__tests__/openrouter.spec.ts

Lines changed: 2 additions & 2 deletions
@@ -99,8 +99,8 @@ describe("OpenRouterHandler", () => {

 		const result = await handler.fetchModel()
 		// With the new clamping logic, 128000 tokens (64% of 200000 context window)
-		// gets clamped to 20% of context window: 200000 * 0.2 = 40000
-		expect(result.maxTokens).toBe(40000)
+		// is below the 80% threshold, so it should not be clamped
+		expect(result.maxTokens).toBe(128000)
 		expect(result.reasoningBudget).toBeUndefined()
 		expect(result.temperature).toBe(0)
 	})

src/api/transform/__tests__/model-params.spec.ts

Lines changed: 4 additions & 4 deletions
@@ -293,12 +293,12 @@ describe("getModelParams", () => {
 	it("should not honor customMaxThinkingTokens for non-reasoning budget models", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 20% threshold
+			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 80% threshold
 		}

 		expect(getModelParams({ ...anthropicParams, settings: { modelMaxThinkingTokens: 1500 }, model })).toEqual({
 			format: anthropicParams.format,
-			maxTokens: 3000, // Uses model.maxTokens since it's within 20% threshold
+			maxTokens: 3000, // Uses model.maxTokens since it's within 80% threshold
 			temperature: 0, // Using default temperature.
 			reasoningEffort: undefined,
 			reasoningBudget: undefined, // Should remain undefined despite customMaxThinkingTokens being set.
@@ -565,7 +565,7 @@ describe("getModelParams", () => {
 	it("should use reasoningEffort if supportsReasoningEffort is false but reasoningEffort is set", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 20% threshold
+			maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 80% threshold
 			supportsReasoningEffort: false,
 			reasoningEffort: "medium",
 		}
@@ -576,7 +576,7 @@ describe("getModelParams", () => {
 			model,
 		})

-		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 20% threshold
+		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 80% threshold
 		expect(result.reasoningEffort).toBe("medium")
 	})
 })

src/shared/__tests__/api.spec.ts

Lines changed: 15 additions & 15 deletions
@@ -25,13 +25,13 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(16384)
 	})

-	test("should return model maxTokens when not using claude-code provider and maxTokens is within 20% of context window", () => {
+	test("should return model maxTokens when not using claude-code provider and maxTokens is within 80% of context window", () => {
 		const settings: ProviderSettings = {
 			apiProvider: "anthropic",
 		}

 		// mockModel has maxTokens: 8192 and contextWindow: 200000
-		// 8192 is 4.096% of 200000, which is <= 20%, so it should use model.maxTokens
+		// 8192 is 4.096% of 200000, which is <= 80%, so it should use model.maxTokens
 		const result = getModelMaxOutputTokens({
 			modelId: "claude-3-5-sonnet-20241022",
 			model: mockModel,
@@ -117,7 +117,7 @@ describe("getModelMaxOutputTokens", () => {
 			contextWindow: 1_048_576,
 			supportsPromptCache: false,
 			supportsReasoningBudget: true,
-			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 20%
+			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 80%
 		}

 		const settings: ProviderSettings = {
@@ -126,14 +126,14 @@ describe("getModelMaxOutputTokens", () => {
 		}

 		const result = getModelMaxOutputTokens({ modelId: geminiModelId, model, settings })
-		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 20% threshold
+		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 80% threshold
 	})

-	test("should clamp maxTokens to 20% of context window when maxTokens exceeds threshold", () => {
+	test("should clamp maxTokens to 80% of context window when maxTokens exceeds threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: false,
-			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+			maxTokens: 90_000, // 90% of context window, exceeds 80% threshold
 		}

 		const settings: ProviderSettings = {
@@ -146,15 +146,15 @@ describe("getModelMaxOutputTokens", () => {
 			settings,
 			format: "openai",
 		})
-		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
-		expect(result).toBe(20_000)
+		// Should clamp to 80% of context window: 100_000 * 0.8 = 80_000
+		expect(result).toBe(80_000)
 	})

-	test("should clamp maxTokens to 20% of context window for Anthropic models when maxTokens exceeds threshold", () => {
+	test("should clamp maxTokens to 80% of context window for Anthropic models when maxTokens exceeds threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: true,
-			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+			maxTokens: 90_000, // 90% of context window, exceeds 80% threshold
 		}

 		const settings: ProviderSettings = {
@@ -166,15 +166,15 @@ describe("getModelMaxOutputTokens", () => {
 			model,
 			settings,
 		})
-		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
-		expect(result).toBe(20_000)
+		// Should clamp to 80% of context window: 100_000 * 0.8 = 80_000
+		expect(result).toBe(80_000)
 	})

-	test("should use model.maxTokens when exactly at 20% threshold", () => {
+	test("should use model.maxTokens when at or below 80% threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: false,
-			maxTokens: 20_000, // Exactly 20% of context window
+			maxTokens: 80_000, // Exactly 80% of context window
 		}

 		const settings: ProviderSettings = {
@@ -187,7 +187,7 @@ describe("getModelMaxOutputTokens", () => {
 			settings,
 			format: "openai",
 		})
-		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
+		expect(result).toBe(80_000) // Should use model.maxTokens since it's at 80%
 	})

 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {

src/shared/api.ts

Lines changed: 9 additions & 2 deletions
@@ -90,9 +90,16 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If model has explicit maxTokens, clamp it to 20% of the context window
+	// If model has explicit maxTokens, only clamp it if it exceeds 80% of the context window
+	// This prevents models from using the entire context for output while still allowing
+	// models with legitimately high output requirements (like GLM-4.5) to function
 	if (model.maxTokens) {
-		return Math.min(model.maxTokens, model.contextWindow * 0.2)
+		// Only apply clamping if maxTokens is more than 80% of context window
+		if (model.maxTokens > model.contextWindow * 0.8) {
+			// Clamp to 80% to leave room for input
+			return Math.floor(model.contextWindow * 0.8)
+		}
+		return model.maxTokens
 	}

 	// For non-Anthropic formats without explicit maxTokens, return undefined

0 commit comments