
Commit 1fb46fc

fix: adjust token clamping threshold from 20% to 80% for GLM-4.5 compatibility
The previous 20% clamping threshold was too restrictive for models like GLM-4.5 that have legitimately high output-token requirements (98,304 output tokens out of a 131,072-token context window = 75%). Clamping is now applied only when maxTokens exceeds 80% of the context window, which still prevents a model from using the entire context for output while allowing models with high output requirements to function properly.

Fixes #6806
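As a quick check of the arithmetic above, here is a standalone sketch of the old and new rules (the variable names are illustrative only; the actual change is in getModelMaxOutputTokens, shown in the src/shared/api.ts diff below):

// Worked example using the GLM-4.5 numbers from the commit message.
const contextWindow = 131_072
const requestedOutput = 98_304 // 75% of the context window

// Old rule: always clamp maxTokens to 20% of the context window.
const oldCap = Math.min(requestedOutput, contextWindow * 0.2)
console.log(oldCap) // 26214.4 — far below what GLM-4.5 needs

// New rule: clamp only when maxTokens exceeds 80% of the context window.
const newCap =
	requestedOutput > contextWindow * 0.8
		? Math.floor(contextWindow * 0.8)
		: requestedOutput
console.log(newCap) // 98304 — 75% is under the 80% threshold, so it is left alone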
1 parent 15b0f50

4 files changed: +30 -23 lines changed


src/api/providers/__tests__/openrouter.spec.ts

Lines changed: 2 additions & 2 deletions
@@ -99,8 +99,8 @@ describe("OpenRouterHandler", () => {

 		const result = await handler.fetchModel()
 		// With the new clamping logic, 128000 tokens (64% of 200000 context window)
-		// gets clamped to 20% of context window: 200000 * 0.2 = 40000
-		expect(result.maxTokens).toBe(40000)
+		// is below the 80% threshold, so it should not be clamped
+		expect(result.maxTokens).toBe(128000)
 		expect(result.reasoningBudget).toBeUndefined()
 		expect(result.temperature).toBe(0)
 	})

src/api/transform/__tests__/model-params.spec.ts

Lines changed: 4 additions & 4 deletions
@@ -293,12 +293,12 @@ describe("getModelParams", () => {
 	it("should not honor customMaxThinkingTokens for non-reasoning budget models", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 20% threshold
+			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 80% threshold
 		}

 		expect(getModelParams({ ...anthropicParams, settings: { modelMaxThinkingTokens: 1500 }, model })).toEqual({
 			format: anthropicParams.format,
-			maxTokens: 3000, // Uses model.maxTokens since it's within 20% threshold
+			maxTokens: 3000, // Uses model.maxTokens since it's within 80% threshold
 			temperature: 0, // Using default temperature.
 			reasoningEffort: undefined,
 			reasoningBudget: undefined, // Should remain undefined despite customMaxThinkingTokens being set.
@@ -565,7 +565,7 @@ describe("getModelParams", () => {
 	it("should use reasoningEffort if supportsReasoningEffort is false but reasoningEffort is set", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 20% threshold
+			maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 80% threshold
 			supportsReasoningEffort: false,
 			reasoningEffort: "medium",
 		}
@@ -576,7 +576,7 @@ describe("getModelParams", () => {
 			model,
 		})

-		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 20% threshold
+		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 80% threshold
 		expect(result.reasoningEffort).toBe("medium")
 	})
 })

src/shared/__tests__/api.spec.ts

Lines changed: 15 additions & 15 deletions
@@ -25,13 +25,13 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(16384)
 	})

-	test("should return model maxTokens when not using claude-code provider and maxTokens is within 20% of context window", () => {
+	test("should return model maxTokens when not using claude-code provider and maxTokens is within 80% of context window", () => {
 		const settings: ProviderSettings = {
 			apiProvider: "anthropic",
 		}

 		// mockModel has maxTokens: 8192 and contextWindow: 200000
-		// 8192 is 4.096% of 200000, which is <= 20%, so it should use model.maxTokens
+		// 8192 is 4.096% of 200000, which is <= 80%, so it should use model.maxTokens
 		const result = getModelMaxOutputTokens({
 			modelId: "claude-3-5-sonnet-20241022",
 			model: mockModel,
@@ -117,7 +117,7 @@ describe("getModelMaxOutputTokens", () => {
 			contextWindow: 1_048_576,
 			supportsPromptCache: false,
 			supportsReasoningBudget: true,
-			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 20%
+			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 80%
 		}

 		const settings: ProviderSettings = {
@@ -126,14 +126,14 @@ describe("getModelMaxOutputTokens", () => {
 		}

 		const result = getModelMaxOutputTokens({ modelId: geminiModelId, model, settings })
-		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 20% threshold
+		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 80% threshold
 	})

-	test("should clamp maxTokens to 20% of context window when maxTokens exceeds threshold", () => {
+	test("should clamp maxTokens to 80% of context window when maxTokens exceeds threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: false,
-			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+			maxTokens: 90_000, // 90% of context window, exceeds 80% threshold
 		}

 		const settings: ProviderSettings = {
@@ -146,15 +146,15 @@ describe("getModelMaxOutputTokens", () => {
 			settings,
 			format: "openai",
 		})
-		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
-		expect(result).toBe(20_000)
+		// Should clamp to 80% of context window: 100_000 * 0.8 = 80_000
+		expect(result).toBe(80_000)
 	})

-	test("should clamp maxTokens to 20% of context window for Anthropic models when maxTokens exceeds threshold", () => {
+	test("should clamp maxTokens to 80% of context window for Anthropic models when maxTokens exceeds threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: true,
-			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+			maxTokens: 90_000, // 90% of context window, exceeds 80% threshold
 		}

 		const settings: ProviderSettings = {
@@ -166,15 +166,15 @@ describe("getModelMaxOutputTokens", () => {
 			model,
 			settings,
 		})
-		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
-		expect(result).toBe(20_000)
+		// Should clamp to 80% of context window: 100_000 * 0.8 = 80_000
+		expect(result).toBe(80_000)
 	})

-	test("should use model.maxTokens when exactly at 20% threshold", () => {
+	test("should use model.maxTokens when at or below 80% threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: false,
-			maxTokens: 20_000, // Exactly 20% of context window
+			maxTokens: 80_000, // Exactly 80% of context window
 		}

 		const settings: ProviderSettings = {
@@ -187,7 +187,7 @@ describe("getModelMaxOutputTokens", () => {
 			settings,
 			format: "openai",
 		})
-		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
+		expect(result).toBe(80_000) // Should use model.maxTokens since it's at 80%
 	})

 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {

src/shared/api.ts

Lines changed: 9 additions & 2 deletions
@@ -90,9 +90,16 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If model has explicit maxTokens, clamp it to 20% of the context window
+	// If model has explicit maxTokens, only clamp it if it exceeds 80% of the context window
+	// This prevents models from using the entire context for output while still allowing
+	// models with legitimately high output requirements (like GLM-4.5) to function
 	if (model.maxTokens) {
-		return Math.min(model.maxTokens, model.contextWindow * 0.2)
+		// Only apply clamping if maxTokens is more than 80% of context window
+		if (model.maxTokens > model.contextWindow * 0.8) {
+			// Clamp to 80% to leave room for input
+			return Math.floor(model.contextWindow * 0.8)
+		}
+		return model.maxTokens
 	}

 	// For non-Anthropic formats without explicit maxTokens, return undefined

0 commit comments