
Commit 98b8d5b

fix: adjust GLM-4.6-turbo max output tokens to prevent context limit errors (#8822)
Co-authored-by: Roo Code <[email protected]>
1 parent 97331bc · commit 98b8d5b
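
Why a cap like this prevents context limit errors (general reasoning; the numbers below are illustrative, not GLM-4.6-turbo's actual limits): OpenAI-compatible endpoints generally require that prompt tokens plus the requested max_tokens fit inside the model's context window, so a model whose advertised max output is close to the full window leaves almost no room for the prompt. Clamping the request to 20% of the window keeps the remaining ~80% available for input. A minimal sketch of the arithmetic:

// Illustrative numbers only, not GLM-4.6-turbo's real limits.
const contextWindow = 200_000
const advertisedMaxTokens = 180_000
const promptTokens = 60_000

// Without the cap the request can exceed the window: 60_000 + 180_000 = 240_000 > 200_000.
const uncappedTotal = promptTokens + advertisedMaxTokens

// With the 20% cap: min(180_000, ceil(200_000 * 0.2)) = 40_000, so 60_000 + 40_000 = 100_000 fits.
const cappedMaxTokens = Math.min(advertisedMaxTokens, Math.ceil(contextWindow * 0.2))
const cappedTotal = promptTokens + cappedMaxTokens

console.log({ uncappedTotal, cappedMaxTokens, cappedTotal })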

4 files changed · +29 −13 lines changed

packages/types/src/providers/chutes.ts

Lines changed: 10 additions & 6 deletions
@@ -88,7 +88,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.23,
 		outputPrice: 0.9,
-		description: "DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
+		description:
+			"DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
 	},
 	"deepseek-ai/DeepSeek-V3.1-turbo": {
 		maxTokens: 32768,
@@ -97,7 +98,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 1.0,
 		outputPrice: 3.0,
-		description: "DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
+		description:
+			"DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
 	},
 	"deepseek-ai/DeepSeek-V3.2-Exp": {
 		maxTokens: 163840,
@@ -106,7 +108,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.25,
 		outputPrice: 0.35,
-		description: "DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
+		description:
+			"DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
 	},
 	"unsloth/Llama-3.3-70B-Instruct": {
 		maxTokens: 32768, // From Groq
@@ -397,8 +400,9 @@ export const chutesModels = {
 		contextWindow: 262144,
 		supportsImages: true,
 		supportsPromptCache: false,
-		inputPrice: 0.1600,
-		outputPrice: 0.6500,
-		description: "Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
+		inputPrice: 0.16,
+		outputPrice: 0.65,
+		description:
+			"Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
 	},
 } as const satisfies Record<string, ModelInfo>

src/api/providers/__tests__/chutes.spec.ts

Lines changed: 4 additions & 1 deletion
@@ -460,10 +460,13 @@ describe("ChutesHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Chutes
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: 0.5,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
src/api/providers/__tests__/zai.spec.ts

Lines changed: 4 additions & 1 deletion
@@ -280,10 +280,13 @@ describe("ZAiHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Z AI
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: ZAI_DEFAULT_TEMPERATURE,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
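
Both test changes above assert the same expectation. As a standalone sketch of that computation (the model numbers here are hypothetical, chosen only to show the clamping; the formula mirrors the tests' expectedMaxTokens):

// Mirrors the expectation used in the Chutes and Z AI tests; the numbers are hypothetical.
interface ModelLimits {
	maxTokens: number
	contextWindow: number
}

function expectedMaxOutputTokens({ maxTokens, contextWindow }: ModelLimits): number {
	// Request at most 20% of the context window (rounded up), or the model's own maxTokens if that is smaller.
	return Math.min(maxTokens, Math.ceil(contextWindow * 0.2))
}

// A model advertising 98_304 output tokens over a 131_072-token window gets clamped:
// ceil(131_072 * 0.2) = 26_215, so the provider would send max_tokens: 26215.
console.log(expectedMaxOutputTokens({ maxTokens: 98_304, contextWindow: 131_072 })) // 26215

// A model whose maxTokens is already well below the cap is left alone:
// min(8_192, 26_215) = 8_192.
console.log(expectedMaxOutputTokens({ maxTokens: 8_192, contextWindow: 131_072 })) // 8192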

src/api/providers/base-openai-compatible-provider.ts

Lines changed: 11 additions & 5 deletions
@@ -3,7 +3,7 @@ import OpenAI from "openai"
 
 import type { ModelInfo } from "@roo-code/types"
 
-import type { ApiHandlerOptions } from "../../shared/api"
+import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
 import { ApiStream } from "../transform/stream"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 
@@ -69,10 +69,16 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
 		metadata?: ApiHandlerCreateMessageMetadata,
 		requestOptions?: OpenAI.RequestOptions,
 	) {
-		const {
-			id: model,
-			info: { maxTokens: max_tokens },
-		} = this.getModel()
+		const { id: model, info } = this.getModel()
+
+		// Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply)
+		const max_tokens =
+			getModelMaxOutputTokens({
+				modelId: model,
+				model: info,
+				settings: this.options,
+				format: "openai",
+			}) ?? undefined
 
 		const temperature = this.options.modelTemperature ?? this.defaultTemperature
 