Commit be2ad23

feat: Add DeepSeek V3.1 variants and GLM-4.6 with reasoning support (#8256)
- Add DeepSeek-V3.1-Terminus and DeepSeek-V3.1-turbo models
- Add GLM-4.6-FP8 model with 200K context window
- Fix reasoning implementation to use chat_template_kwargs with thinking parameter
- Parse reasoning_content field for hybrid reasoning models (DeepSeek V3.1, GLM-4.5, GLM-4.6)
- Update tests to verify reasoning mode functionality
- Fix capitalization: DeepSeek-V3.1-Turbo -> DeepSeek-V3.1-turbo

Fixes #8256
1 parent d50edaf commit be2ad23
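For context on the chat_template_kwargs and reasoning_content bullets above, the exchange looks roughly like the sketch below. Only chat_template_kwargs, thinking, and reasoning_content come from this commit; the model id, prompt, and remaining fields are ordinary OpenAI-compatible boilerplate shown for illustration.

// Sketch of the request body once reasoning is enabled (assumptions noted above).
const requestBody = {
	model: "deepseek-ai/DeepSeek-V3.1", // illustrative choice of hybrid reasoning model
	messages: [{ role: "user", content: "Explain the change." }],
	stream: true,
	stream_options: { include_usage: true },
	chat_template_kwargs: { thinking: true }, // the flag this commit threads through
}

// Streamed chunks then keep the model's thinking separate from its answer:
//   { choices: [{ delta: { reasoning_content: "Let me think about this..." } }] }
//   { choices: [{ delta: { content: "Here's my response." } }] }
//   { choices: [], usage: { prompt_tokens: 100, completion_tokens: 50 } }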

File tree

3 files changed: +144 −28

- packages/types/src/providers/chutes.ts
- src/api/providers/__tests__/chutes.spec.ts
- src/api/providers/chutes.ts


packages/types/src/providers/chutes.ts

Lines changed: 19 additions & 2 deletions
@@ -7,7 +7,7 @@ export type ChutesModelId =
 	| "deepseek-ai/DeepSeek-V3"
 	| "deepseek-ai/DeepSeek-V3.1"
 	| "deepseek-ai/DeepSeek-V3.1-Terminus"
-	| "deepseek-ai/DeepSeek-V3.1-Turbo"
+	| "deepseek-ai/DeepSeek-V3.1-turbo"
 	| "unsloth/Llama-3.3-70B-Instruct"
 	| "chutesai/Llama-4-Scout-17B-16E-Instruct"
 	| "unsloth/Mistral-Nemo-Instruct-2407"
@@ -31,6 +31,7 @@ export type ChutesModelId =
 	| "tngtech/DeepSeek-R1T-Chimera"
 	| "zai-org/GLM-4.5-Air"
 	| "zai-org/GLM-4.5-FP8"
+	| "zai-org/GLM-4.6-FP8"
 	| "moonshotai/Kimi-K2-Instruct-75k"
 	| "moonshotai/Kimi-K2-Instruct-0905"
 	| "Qwen/Qwen3-235B-A22B-Thinking-2507"
@@ -72,6 +73,7 @@ export const chutesModels = {
 		contextWindow: 163840,
 		supportsImages: false,
 		supportsPromptCache: false,
+		supportsReasoningEffort: true,
 		inputPrice: 0,
 		outputPrice: 0,
 		description: "DeepSeek V3.1 model.",
@@ -81,15 +83,17 @@ export const chutesModels = {
 		contextWindow: 163840,
 		supportsImages: false,
 		supportsPromptCache: false,
+		supportsReasoningEffort: true,
 		inputPrice: 0,
 		outputPrice: 0,
 		description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.",
 	},
-	"deepseek-ai/DeepSeek-V3.1-Turbo": {
+	"deepseek-ai/DeepSeek-V3.1-turbo": {
 		maxTokens: 32768,
 		contextWindow: 163840,
 		supportsImages: false,
 		supportsPromptCache: false,
+		supportsReasoningEffort: true,
 		inputPrice: 0,
 		outputPrice: 0,
 		description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.",
@@ -279,6 +283,7 @@ export const chutesModels = {
 		contextWindow: 151329,
 		supportsImages: false,
 		supportsPromptCache: false,
+		supportsReasoningEffort: true,
 		inputPrice: 0,
 		outputPrice: 0,
 		description:
@@ -289,11 +294,23 @@ export const chutesModels = {
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
+		supportsReasoningEffort: true,
 		inputPrice: 0,
 		outputPrice: 0,
 		description:
 			"GLM-4.5-FP8 model with 128k token context window, optimized for agent-based applications with MoE architecture.",
 	},
+	"zai-org/GLM-4.6-FP8": {
+		maxTokens: 32768,
+		contextWindow: 204800,
+		supportsImages: false,
+		supportsPromptCache: false,
+		supportsReasoningEffort: true,
+		inputPrice: 0,
+		outputPrice: 0,
+		description:
+			"GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.",
+	},
 	"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
 		maxTokens: 32768,
 		contextWindow: 262144,
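A caller can consult the new supportsReasoningEffort flag straight off the exported table; a minimal sketch, assuming a relative import path (chutesModels, ChutesModelId, and the field names are from the file above):

// Import path is an assumption; resolve it to wherever chutes.ts lives in your build.
import { chutesModels, type ChutesModelId } from "./chutes"

const id: ChutesModelId = "zai-org/GLM-4.6-FP8"
const info = chutesModels[id]

// GLM-4.6-FP8 declares a 204800-token (200K) context window and the new flag.
if (info.supportsReasoningEffort) {
	console.log(`${id} supports reasoning mode (context window: ${info.contextWindow} tokens)`)
}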

src/api/providers/__tests__/chutes.spec.ts

Lines changed: 70 additions & 10 deletions
@@ -253,6 +253,30 @@ describe("ChutesHandler", () => {
 		)
 	})
 
+	it("should return zai-org/GLM-4.6-FP8 model with correct configuration", () => {
+		const testModelId: ChutesModelId = "zai-org/GLM-4.6-FP8"
+		const handlerWithModel = new ChutesHandler({
+			apiModelId: testModelId,
+			chutesApiKey: "test-chutes-api-key",
+		})
+		const model = handlerWithModel.getModel()
+		expect(model.id).toBe(testModelId)
+		expect(model.info).toEqual(
+			expect.objectContaining({
+				maxTokens: 32768,
+				contextWindow: 204800,
+				supportsImages: false,
+				supportsPromptCache: false,
+				supportsReasoningEffort: true,
+				inputPrice: 0,
+				outputPrice: 0,
+				description:
+					"GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.",
+				temperature: 0.5, // Default temperature for non-DeepSeek models
+			}),
+		)
+	})
+
 	it("should return Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 model with correct configuration", () => {
 		const testModelId: ChutesModelId = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8"
 		const handlerWithModel = new ChutesHandler({
@@ -311,6 +335,7 @@ describe("ChutesHandler", () => {
 			contextWindow: 163840,
 			supportsImages: false,
 			supportsPromptCache: false,
+			supportsReasoningEffort: true,
 			inputPrice: 0,
 			outputPrice: 0,
 			description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.",
@@ -319,8 +344,8 @@ describe("ChutesHandler", () => {
 		)
 	})
 
-	it("should return DeepSeek V3.1 Turbo model with correct configuration", () => {
-		const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-Turbo"
+	it("should return DeepSeek V3.1 turbo model with correct configuration", () => {
+		const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-turbo"
 		const handlerWithModel = new ChutesHandler({
 			apiModelId: testModelId,
 			chutesApiKey: "test-chutes-api-key",
@@ -333,6 +358,7 @@ describe("ChutesHandler", () => {
 			contextWindow: 163840,
 			supportsImages: false,
 			supportsPromptCache: false,
+			supportsReasoningEffort: true,
 			inputPrice: 0,
 			outputPrice: 0,
 			description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.",
@@ -515,7 +541,7 @@ describe("ChutesHandler", () => {
 		expect(model.info.temperature).toBe(0.5)
 	})
 
-	it.skip("should enable reasoning for DeepSeek V3.1 models when enableReasoningEffort is true", async () => {
+	it("should enable reasoning for DeepSeek V3.1 models when enableReasoningEffort is true", async () => {
 		const modelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1"
 		const handlerWithModel = new ChutesHandler({
 			apiModelId: modelId,
@@ -525,10 +551,17 @@ describe("ChutesHandler", () => {
 
 		mockCreate.mockImplementationOnce(async () => ({
 			[Symbol.asyncIterator]: async function* () {
+				// First yield reasoning content
 				yield {
-					choices: [{ delta: { content: "<think>Reasoning content</think>Regular content" } }],
+					choices: [{ delta: { reasoning_content: "Let me think about this..." } }],
 				}
+				// Then yield regular content
 				yield {
+					choices: [{ delta: { content: "Here's my response." } }],
+				}
+				// Finally yield usage
+				yield {
+					choices: [],
 					usage: { prompt_tokens: 100, completion_tokens: 50 },
 				}
 			},
@@ -543,12 +576,22 @@ describe("ChutesHandler", () => {
 			chunks.push(chunk)
 		}
 
-		// Should parse reasoning content separately
-		expect(chunks).toContainEqual({ type: "reasoning", text: "Reasoning content" })
-		expect(chunks).toContainEqual({ type: "text", text: "Regular content" })
+		// Should parse reasoning content and regular content separately
+		expect(chunks).toContainEqual({ type: "reasoning", text: "Let me think about this..." })
+		expect(chunks).toContainEqual({ type: "text", text: "Here's my response." })
+		expect(chunks).toContainEqual({ type: "usage", inputTokens: 100, outputTokens: 50 })
+
+		// Verify that the API was called with reasoning enabled
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				chat_template_kwargs: {
+					thinking: true,
+				},
+			}),
+		)
 	})
 
-	it.skip("should enable reasoning for GLM-4.5 models when enableReasoningEffort is true", async () => {
+	it("should enable reasoning for GLM-4.5 models when enableReasoningEffort is true", async () => {
 		const modelId: ChutesModelId = "zai-org/GLM-4.5-Air"
 		const handlerWithModel = new ChutesHandler({
 			apiModelId: modelId,
@@ -558,10 +601,17 @@ describe("ChutesHandler", () => {
 
 		mockCreate.mockImplementationOnce(async () => ({
 			[Symbol.asyncIterator]: async function* () {
+				// First yield reasoning content
+				yield {
+					choices: [{ delta: { reasoning_content: "GLM reasoning process..." } }],
+				}
+				// Then yield regular content
 				yield {
-					choices: [{ delta: { content: "<think>GLM reasoning</think>GLM response" } }],
+					choices: [{ delta: { content: "GLM response" } }],
 				}
+				// Finally yield usage
 				yield {
+					choices: [],
 					usage: { prompt_tokens: 100, completion_tokens: 50 },
 				}
 			},
@@ -577,8 +627,17 @@ describe("ChutesHandler", () => {
 		}
 
 		// Should parse reasoning content separately
-		expect(chunks).toContainEqual({ type: "reasoning", text: "GLM reasoning" })
+		expect(chunks).toContainEqual({ type: "reasoning", text: "GLM reasoning process..." })
 		expect(chunks).toContainEqual({ type: "text", text: "GLM response" })
+
+		// Verify that the API was called with reasoning enabled
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				chat_template_kwargs: {
+					thinking: true,
+				},
+			}),
+		)
 	})
 
 	it.skip("should disable reasoning for DeepSeek V3.1 models when enableReasoningEffort is false", async () => {
@@ -595,6 +654,7 @@ describe("ChutesHandler", () => {
 					choices: [{ delta: { content: "<think>Reasoning content</think>Regular content" } }],
 				}
 				yield {
+					choices: [],
 					usage: { prompt_tokens: 100, completion_tokens: 50 },
 				}
 			},
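One detail the updated mocks encode: when stream_options.include_usage is set, OpenAI-compatible streams report usage in a terminal chunk whose choices array is empty, hence the choices: [] added next to each usage payload. A sketch of the shape being mirrored:

// Terminal usage-only chunk (values illustrative, shape per the mocks above).
const finalChunk = {
	choices: [],
	usage: { prompt_tokens: 100, completion_tokens: 50 },
}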

src/api/providers/chutes.ts

Lines changed: 55 additions & 16 deletions
@@ -27,6 +27,7 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider<ChutesModelId> {
 	private getCompletionParams(
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
+		enableReasoning: boolean = false,
 	): OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming {
 		const {
 			id: model,
@@ -35,36 +36,33 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider<ChutesModelId> {
 
 		const temperature = this.options.modelTemperature ?? this.getModel().info.temperature
 
-		return {
+		const params: any = {
 			model,
 			max_tokens,
 			temperature,
 			messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
 			stream: true,
 			stream_options: { include_usage: true },
 		}
+
+		// Add reasoning support for DeepSeek V3.1, GLM-4.5, and GLM-4.6 models
+		if (enableReasoning) {
+			params.chat_template_kwargs = {
+				thinking: true,
+			}
+		}
+
+		return params
 	}
 
 	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const model = this.getModel()
 
-		// Check if this is a model that supports reasoning mode
-		const modelSupportsReasoning =
-			model.id.includes("DeepSeek-R1") || model.id.includes("DeepSeek-V3.1") || model.id.includes("GLM-4.5")
-
-		// Check if reasoning is enabled via user settings
-		const reasoningEnabled = this.options.enableReasoningEffort !== false
-
-		if (modelSupportsReasoning && reasoningEnabled) {
-			// For DeepSeek R1 models, use the R1 format conversion
-			const isR1Model = model.id.includes("DeepSeek-R1")
-			const messageParams = isR1Model
-				? { messages: convertToR1Format([{ role: "user", content: systemPrompt }, ...messages]) }
-				: {}
-
+		// Handle DeepSeek R1 models with XML tag parsing
+		if (model.id.includes("DeepSeek-R1")) {
 			const stream = await this.client.chat.completions.create({
 				...this.getCompletionParams(systemPrompt, messages),
-				...messageParams,
+				messages: convertToR1Format([{ role: "user", content: systemPrompt }, ...messages]),
 			})
@@ -98,7 +96,48 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider<ChutesModelId> {
 			for (const processedChunk of matcher.final()) {
 				yield processedChunk
 			}
+			return
+		}
+
+		// Handle DeepSeek V3.1, GLM-4.5, and GLM-4.6 models with reasoning_content parsing
+		const isHybridReasoningModel =
+			model.id.includes("DeepSeek-V3.1") || model.id.includes("GLM-4.5") || model.id.includes("GLM-4.6")
+		const reasoningEnabled = this.options.enableReasoningEffort === true
+
+		if (isHybridReasoningModel && reasoningEnabled) {
+			const stream = await this.client.chat.completions.create(
+				this.getCompletionParams(systemPrompt, messages, true),
+			)
+
+			for await (const chunk of stream) {
+				const delta = chunk.choices[0]?.delta
+
+				// Handle reasoning content from the response
+				if ((delta as any)?.reasoning_content) {
+					yield {
+						type: "reasoning",
+						text: (delta as any).reasoning_content,
+					}
+				}
+
+				// Handle regular text content
+				if (delta?.content) {
+					yield {
+						type: "text",
+						text: delta.content,
+					}
+				}
+
+				if (chunk.usage) {
+					yield {
+						type: "usage",
+						inputTokens: chunk.usage.prompt_tokens || 0,
+						outputTokens: chunk.usage.completion_tokens || 0,
+					}
+				}
+			}
 		} else {
+			// For non-reasoning models or when reasoning is disabled, use the base implementation
 			yield* super.createMessage(systemPrompt, messages)
 		}
 	}
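Putting the new path together, a hedged end-to-end sketch: ChutesHandler, the option names, and the chunk types appear in the code and tests above, while the prompt, key handling, and logging are illustrative.

// Illustrative driver, not part of the commit. Assumes a module context (top-level await).
const handler = new ChutesHandler({
	apiModelId: "zai-org/GLM-4.6-FP8",
	chutesApiKey: process.env.CHUTES_API_KEY ?? "",
	enableReasoningEffort: true, // must be strictly true; see the === check above
})

const stream = handler.createMessage("You are a concise assistant.", [
	{ role: "user", content: "Summarize this diff." },
])

for await (const chunk of stream) {
	if (chunk.type === "reasoning") {
		process.stdout.write(`[thinking] ${chunk.text}`)
	} else if (chunk.type === "text") {
		process.stdout.write(chunk.text)
	} else if (chunk.type === "usage") {
		console.log(`\n${chunk.inputTokens} in / ${chunk.outputTokens} out tokens`)
	}
}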
