
Commit 98b8d5b

fix: adjust GLM-4.6-turbo max output tokens to prevent context limit errors (#8822)
Co-authored-by: Roo Code <[email protected]>
1 parent 97331bc · commit 98b8d5b
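
Why a cap like this prevents context limit errors (general reasoning; the numbers below are illustrative, not GLM-4.6-turbo's actual limits): OpenAI-compatible endpoints generally require that prompt tokens plus the requested max_tokens fit inside the model's context window, so a model whose advertised max output is close to the full window leaves almost no room for the prompt. Clamping the request to 20% of the window keeps the remaining ~80% available for input. A minimal sketch of the arithmetic:

// Illustrative numbers only, not GLM-4.6-turbo's real limits.
const contextWindow = 200_000
const advertisedMaxTokens = 180_000
const promptTokens = 60_000

// Without the cap the request can exceed the window: 60_000 + 180_000 = 240_000 > 200_000.
const uncappedTotal = promptTokens + advertisedMaxTokens

// With the 20% cap: min(180_000, ceil(200_000 * 0.2)) = 40_000, so 60_000 + 40_000 = 100_000 fits.
const cappedMaxTokens = Math.min(advertisedMaxTokens, Math.ceil(contextWindow * 0.2))
const cappedTotal = promptTokens + cappedMaxTokens

console.log({ uncappedTotal, cappedMaxTokens, cappedTotal })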

4 files changed · +29 −13 lines changed

packages/types/src/providers/chutes.ts

Lines changed: 10 additions & 6 deletions
@@ -88,7 +88,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.23,
 		outputPrice: 0.9,
-		description: "DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
+		description:
+			"DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
 	},
 	"deepseek-ai/DeepSeek-V3.1-turbo": {
 		maxTokens: 32768,
@@ -97,7 +98,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 1.0,
 		outputPrice: 3.0,
-		description: "DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
+		description:
+			"DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
 	},
 	"deepseek-ai/DeepSeek-V3.2-Exp": {
 		maxTokens: 163840,
@@ -106,7 +108,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.25,
 		outputPrice: 0.35,
-		description: "DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
+		description:
+			"DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
 	},
 	"unsloth/Llama-3.3-70B-Instruct": {
 		maxTokens: 32768, // From Groq
@@ -397,8 +400,9 @@ export const chutesModels = {
 		contextWindow: 262144,
 		supportsImages: true,
 		supportsPromptCache: false,
-		inputPrice: 0.1600,
-		outputPrice: 0.6500,
-		description: "Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
+		inputPrice: 0.16,
+		outputPrice: 0.65,
+		description:
+			"Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
 	},
 } as const satisfies Record<string, ModelInfo>

src/api/providers/__tests__/chutes.spec.ts

Lines changed: 4 additions & 1 deletion
@@ -460,10 +460,13 @@ describe("ChutesHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Chutes
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: 0.5,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
src/api/providers/__tests__/zai.spec.ts

Lines changed: 4 additions & 1 deletion
@@ -280,10 +280,13 @@ describe("ZAiHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Z AI
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: ZAI_DEFAULT_TEMPERATURE,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
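
Both test changes above assert the same expectation. As a standalone sketch of that computation (the model numbers here are hypothetical, chosen only to show the clamping; the formula mirrors the tests' expectedMaxTokens):

// Mirrors the expectation used in the Chutes and Z AI tests; the numbers are hypothetical.
interface ModelLimits {
	maxTokens: number
	contextWindow: number
}

function expectedMaxOutputTokens({ maxTokens, contextWindow }: ModelLimits): number {
	// Request at most 20% of the context window (rounded up), or the model's own maxTokens if that is smaller.
	return Math.min(maxTokens, Math.ceil(contextWindow * 0.2))
}

// A model advertising 98_304 output tokens over a 131_072-token window gets clamped:
// ceil(131_072 * 0.2) = 26_215, so the provider would send max_tokens: 26215.
console.log(expectedMaxOutputTokens({ maxTokens: 98_304, contextWindow: 131_072 })) // 26215

// A model whose maxTokens is already well below the cap is left alone:
// min(8_192, 26_215) = 8_192.
console.log(expectedMaxOutputTokens({ maxTokens: 8_192, contextWindow: 131_072 })) // 8192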

src/api/providers/base-openai-compatible-provider.ts

Lines changed: 11 additions & 5 deletions
@@ -3,7 +3,7 @@ import OpenAI from "openai"
 
 import type { ModelInfo } from "@roo-code/types"
 
-import type { ApiHandlerOptions } from "../../shared/api"
+import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
 import { ApiStream } from "../transform/stream"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 
@@ -69,10 +69,16 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
 		metadata?: ApiHandlerCreateMessageMetadata,
 		requestOptions?: OpenAI.RequestOptions,
 	) {
-		const {
-			id: model,
-			info: { maxTokens: max_tokens },
-		} = this.getModel()
+		const { id: model, info } = this.getModel()
+
+		// Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply)
+		const max_tokens =
+			getModelMaxOutputTokens({
+				modelId: model,
+				model: info,
+				settings: this.options,
+				format: "openai",
+			}) ?? undefined
 
 		const temperature = this.options.modelTemperature ?? this.defaultTemperature
 