diff --git a/packages/types/src/providers/chutes.ts b/packages/types/src/providers/chutes.ts
index 15dea58263..8d85bb59c6 100644
--- a/packages/types/src/providers/chutes.ts
+++ b/packages/types/src/providers/chutes.ts
@@ -6,6 +6,8 @@ export type ChutesModelId =
| "deepseek-ai/DeepSeek-R1"
| "deepseek-ai/DeepSeek-V3"
| "deepseek-ai/DeepSeek-V3.1"
+ | "deepseek-ai/DeepSeek-V3.1-Terminus"
+ | "deepseek-ai/DeepSeek-V3.1-turbo"
| "unsloth/Llama-3.3-70B-Instruct"
| "chutesai/Llama-4-Scout-17B-16E-Instruct"
| "unsloth/Mistral-Nemo-Instruct-2407"
@@ -29,6 +31,7 @@ export type ChutesModelId =
| "tngtech/DeepSeek-R1T-Chimera"
| "zai-org/GLM-4.5-Air"
| "zai-org/GLM-4.5-FP8"
+ | "zai-org/GLM-4.6-FP8"
| "moonshotai/Kimi-K2-Instruct-75k"
| "moonshotai/Kimi-K2-Instruct-0905"
| "Qwen/Qwen3-235B-A22B-Thinking-2507"
@@ -70,10 +73,31 @@ export const chutesModels = {
contextWindow: 163840,
supportsImages: false,
supportsPromptCache: false,
+ supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description: "DeepSeek V3.1 model.",
},
+ "deepseek-ai/DeepSeek-V3.1-Terminus": {
+ maxTokens: 32768,
+ contextWindow: 163840,
+ supportsImages: false,
+ supportsPromptCache: false,
+ supportsReasoningEffort: true,
+ inputPrice: 0,
+ outputPrice: 0,
+ description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.",
+ },
+ "deepseek-ai/DeepSeek-V3.1-turbo": {
+ maxTokens: 32768,
+ contextWindow: 163840,
+ supportsImages: false,
+ supportsPromptCache: false,
+ supportsReasoningEffort: true,
+ inputPrice: 0,
+ outputPrice: 0,
+ description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.",
+ },
"unsloth/Llama-3.3-70B-Instruct": {
maxTokens: 32768, // From Groq
contextWindow: 131072, // From Groq
@@ -259,6 +283,7 @@ export const chutesModels = {
contextWindow: 151329,
supportsImages: false,
supportsPromptCache: false,
+ supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description:
@@ -269,11 +294,23 @@ export const chutesModels = {
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
+ supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description:
"GLM-4.5-FP8 model with 128k token context window, optimized for agent-based applications with MoE architecture.",
},
+ "zai-org/GLM-4.6-FP8": {
+ maxTokens: 32768,
+ contextWindow: 204800,
+ supportsImages: false,
+ supportsPromptCache: false,
+ supportsReasoningEffort: true,
+ inputPrice: 0,
+ outputPrice: 0,
+ description:
+ "GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.",
+ },
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
maxTokens: 32768,
contextWindow: 262144,
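For context, a minimal sketch of how the metadata added above can be consulted. The chutesModels map, the ChutesModelId type, and the field names come from the file in this diff; the import path and the describeModel helper are illustrative assumptions, not part of the change.

// Illustrative import path; adjust to however the types package is consumed in the repo.
import { chutesModels, type ChutesModelId } from "@roo-code/types"

// Hypothetical helper: summarize an entry, e.g. for a model-picker tooltip.
function describeModel(id: ChutesModelId): string {
    const info = chutesModels[id]
    const reasoning =
        "supportsReasoningEffort" in info && info.supportsReasoningEffort
            ? "reasoning toggle available"
            : "no reasoning toggle"
    return `${id}: ${info.contextWindow.toLocaleString()}-token context, ${reasoning}`
}

describeModel("zai-org/GLM-4.6-FP8") // "zai-org/GLM-4.6-FP8: 204,800-token context, reasoning toggle available"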
diff --git a/src/api/providers/__tests__/chutes.spec.ts b/src/api/providers/__tests__/chutes.spec.ts
index 398f86ce60..dbf5c77712 100644
--- a/src/api/providers/__tests__/chutes.spec.ts
+++ b/src/api/providers/__tests__/chutes.spec.ts
@@ -253,6 +253,30 @@ describe("ChutesHandler", () => {
)
})
+ it("should return zai-org/GLM-4.6-FP8 model with correct configuration", () => {
+ const testModelId: ChutesModelId = "zai-org/GLM-4.6-FP8"
+ const handlerWithModel = new ChutesHandler({
+ apiModelId: testModelId,
+ chutesApiKey: "test-chutes-api-key",
+ })
+ const model = handlerWithModel.getModel()
+ expect(model.id).toBe(testModelId)
+ expect(model.info).toEqual(
+ expect.objectContaining({
+ maxTokens: 32768,
+ contextWindow: 204800,
+ supportsImages: false,
+ supportsPromptCache: false,
+ supportsReasoningEffort: true,
+ inputPrice: 0,
+ outputPrice: 0,
+ description:
+ "GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.",
+ temperature: 0.5, // Default temperature for non-DeepSeek models
+ }),
+ )
+ })
+
it("should return Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 model with correct configuration", () => {
const testModelId: ChutesModelId = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8"
const handlerWithModel = new ChutesHandler({
@@ -297,6 +321,52 @@ describe("ChutesHandler", () => {
)
})
+ it("should return DeepSeek V3.1 Terminus model with correct configuration", () => {
+ const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-Terminus"
+ const handlerWithModel = new ChutesHandler({
+ apiModelId: testModelId,
+ chutesApiKey: "test-chutes-api-key",
+ })
+ const model = handlerWithModel.getModel()
+ expect(model.id).toBe(testModelId)
+ expect(model.info).toEqual(
+ expect.objectContaining({
+ maxTokens: 32768,
+ contextWindow: 163840,
+ supportsImages: false,
+ supportsPromptCache: false,
+ supportsReasoningEffort: true,
+ inputPrice: 0,
+ outputPrice: 0,
+ description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.",
+ temperature: 0.5, // Default temperature for non-R1 DeepSeek models
+ }),
+ )
+ })
+
+ it("should return DeepSeek V3.1 turbo model with correct configuration", () => {
+ const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-turbo"
+ const handlerWithModel = new ChutesHandler({
+ apiModelId: testModelId,
+ chutesApiKey: "test-chutes-api-key",
+ })
+ const model = handlerWithModel.getModel()
+ expect(model.id).toBe(testModelId)
+ expect(model.info).toEqual(
+ expect.objectContaining({
+ maxTokens: 32768,
+ contextWindow: 163840,
+ supportsImages: false,
+ supportsPromptCache: false,
+ supportsReasoningEffort: true,
+ inputPrice: 0,
+ outputPrice: 0,
+ description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.",
+ temperature: 0.5, // Default temperature for non-R1 DeepSeek models
+ }),
+ )
+ })
+
it("should return moonshotai/Kimi-K2-Instruct-0905 model with correct configuration", () => {
const testModelId: ChutesModelId = "moonshotai/Kimi-K2-Instruct-0905"
const handlerWithModel = new ChutesHandler({
@@ -470,4 +540,137 @@ describe("ChutesHandler", () => {
const model = handlerWithModel.getModel()
expect(model.info.temperature).toBe(0.5)
})
+
+ it("should enable reasoning for DeepSeek V3.1 models when enableReasoningEffort is true", async () => {
+ const modelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1"
+ const handlerWithModel = new ChutesHandler({
+ apiModelId: modelId,
+ chutesApiKey: "test-chutes-api-key",
+ enableReasoningEffort: true,
+ })
+
+ mockCreate.mockImplementationOnce(async () => ({
+ [Symbol.asyncIterator]: async function* () {
+ // First yield reasoning content
+ yield {
+ choices: [{ delta: { reasoning_content: "Let me think about this..." } }],
+ }
+ // Then yield regular content
+ yield {
+ choices: [{ delta: { content: "Here's my response." } }],
+ }
+ // Finally yield usage
+ yield {
+ choices: [],
+ usage: { prompt_tokens: 100, completion_tokens: 50 },
+ }
+ },
+ }))
+
+ const systemPrompt = "You are a helpful assistant"
+ const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]
+
+ const stream = handlerWithModel.createMessage(systemPrompt, messages)
+ const chunks = []
+ for await (const chunk of stream) {
+ chunks.push(chunk)
+ }
+
+ // Should parse reasoning content and regular content separately
+ expect(chunks).toContainEqual({ type: "reasoning", text: "Let me think about this..." })
+ expect(chunks).toContainEqual({ type: "text", text: "Here's my response." })
+ expect(chunks).toContainEqual({ type: "usage", inputTokens: 100, outputTokens: 50 })
+
+ // Verify that the API was called with reasoning enabled
+ expect(mockCreate).toHaveBeenCalledWith(
+ expect.objectContaining({
+ chat_template_kwargs: {
+ thinking: true,
+ },
+ }),
+ )
+ })
+
+ it("should enable reasoning for GLM-4.5 models when enableReasoningEffort is true", async () => {
+ const modelId: ChutesModelId = "zai-org/GLM-4.5-Air"
+ const handlerWithModel = new ChutesHandler({
+ apiModelId: modelId,
+ chutesApiKey: "test-chutes-api-key",
+ enableReasoningEffort: true,
+ })
+
+ mockCreate.mockImplementationOnce(async () => ({
+ [Symbol.asyncIterator]: async function* () {
+ // First yield reasoning content
+ yield {
+ choices: [{ delta: { reasoning_content: "GLM reasoning process..." } }],
+ }
+ // Then yield regular content
+ yield {
+ choices: [{ delta: { content: "GLM response" } }],
+ }
+ // Finally yield usage
+ yield {
+ choices: [],
+ usage: { prompt_tokens: 100, completion_tokens: 50 },
+ }
+ },
+ }))
+
+ const systemPrompt = "You are a helpful assistant"
+ const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]
+
+ const stream = handlerWithModel.createMessage(systemPrompt, messages)
+ const chunks = []
+ for await (const chunk of stream) {
+ chunks.push(chunk)
+ }
+
+ // Should parse reasoning content separately
+ expect(chunks).toContainEqual({ type: "reasoning", text: "GLM reasoning process..." })
+ expect(chunks).toContainEqual({ type: "text", text: "GLM response" })
+
+ // Verify that the API was called with reasoning enabled
+ expect(mockCreate).toHaveBeenCalledWith(
+ expect.objectContaining({
+ chat_template_kwargs: {
+ thinking: true,
+ },
+ }),
+ )
+ })
+
+ it.skip("should disable reasoning for DeepSeek V3.1 models when enableReasoningEffort is false", async () => {
+ const modelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1"
+ const handlerWithModel = new ChutesHandler({
+ apiModelId: modelId,
+ chutesApiKey: "test-chutes-api-key",
+ enableReasoningEffort: false,
+ })
+
+ mockCreate.mockImplementationOnce(async () => ({
+ [Symbol.asyncIterator]: async function* () {
+ yield {
+ choices: [{ delta: { content: "Reasoning contentRegular content" } }],
+ }
+ yield {
+ choices: [],
+ usage: { prompt_tokens: 100, completion_tokens: 50 },
+ }
+ },
+ }))
+
+ const systemPrompt = "You are a helpful assistant"
+ const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]
+
+ const stream = handlerWithModel.createMessage(systemPrompt, messages)
+ const chunks = []
+ for await (const chunk of stream) {
+ chunks.push(chunk)
+ }
+
+ // Should NOT parse reasoning content when disabled
+ expect(chunks).toContainEqual({ type: "text", text: "Reasoning contentRegular content" })
+ expect(chunks).not.toContainEqual({ type: "reasoning", text: "Reasoning content" })
+ })
})
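The tests above all mock the OpenAI SDK's streaming response as an async iterable of chunks. A standalone sketch of that pattern, with the chunk shape reduced to the fields the handler reads; the Chunk type and makeStream are illustrative names, not part of the codebase.

type Chunk = {
    choices: Array<{ delta?: { content?: string; reasoning_content?: string } }>
    usage?: { prompt_tokens: number; completion_tokens: number }
}

// Build an async iterable that yields the given chunks in order, mimicking a streamed completion.
function makeStream(chunks: Chunk[]): AsyncIterable<Chunk> {
    return {
        [Symbol.asyncIterator]: async function* () {
            for (const chunk of chunks) {
                yield chunk
            }
        },
    }
}

// Usage mirroring the mocks above:
// mockCreate.mockImplementationOnce(async () =>
//     makeStream([
//         { choices: [{ delta: { reasoning_content: "thinking..." } }] },
//         { choices: [{ delta: { content: "answer" } }] },
//         { choices: [], usage: { prompt_tokens: 100, completion_tokens: 50 } },
//     ]),
// )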
diff --git a/src/api/providers/chutes.ts b/src/api/providers/chutes.ts
index 62121bd19d..37e23fc776 100644
--- a/src/api/providers/chutes.ts
+++ b/src/api/providers/chutes.ts
@@ -3,6 +3,7 @@ import { Anthropic } from "@anthropic-ai/sdk"
import OpenAI from "openai"
import type { ApiHandlerOptions } from "../../shared/api"
+import { shouldUseReasoningEffort } from "../../shared/api"
import { XmlMatcher } from "../../utils/xml-matcher"
import { convertToR1Format } from "../transform/r1-format"
import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -26,6 +27,7 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider {
private getCompletionParams(
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
+ enableReasoning: boolean = false,
): OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming {
const {
id: model,
@@ -34,7 +36,7 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider {
const temperature = this.options.modelTemperature ?? this.getModel().info.temperature
- return {
+ const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming & { chat_template_kwargs?: { thinking: boolean } } = {
model,
max_tokens,
temperature,
@@ -42,11 +44,21 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider {
stream: true,
stream_options: { include_usage: true },
}
+
+ // Add reasoning support for DeepSeek V3.1, GLM-4.5, and GLM-4.6 models
+ if (enableReasoning) {
+ params.chat_template_kwargs = {
+ thinking: true,
+ }
+ }
+
+ return params
}
override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
const model = this.getModel()
+ // Handle DeepSeek R1 models with XML tag parsing
if (model.id.includes("DeepSeek-R1")) {
const stream = await this.client.chat.completions.create({
...this.getCompletionParams(systemPrompt, messages),
@@ -84,7 +96,48 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider {
for (const processedChunk of matcher.final()) {
yield processedChunk
}
+ return
+ }
+
+ // Handle DeepSeek V3.1, GLM-4.5, and GLM-4.6 models with reasoning_content parsing
+ const isHybridReasoningModel =
+ model.id.includes("DeepSeek-V3.1") || model.id.includes("GLM-4.5") || model.id.includes("GLM-4.6")
+ const reasoningEnabled = this.options.enableReasoningEffort === true
+
+ if (isHybridReasoningModel && reasoningEnabled) {
+ const stream = await this.client.chat.completions.create(
+ this.getCompletionParams(systemPrompt, messages, true),
+ )
+
+ for await (const chunk of stream) {
+ const delta = chunk.choices[0]?.delta
+
+ // Handle reasoning content from the response
+ if ((delta as any)?.reasoning_content) {
+ yield {
+ type: "reasoning",
+ text: (delta as any).reasoning_content,
+ }
+ }
+
+ // Handle regular text content
+ if (delta?.content) {
+ yield {
+ type: "text",
+ text: delta.content,
+ }
+ }
+
+ if (chunk.usage) {
+ yield {
+ type: "usage",
+ inputTokens: chunk.usage.prompt_tokens || 0,
+ outputTokens: chunk.usage.completion_tokens || 0,
+ }
+ }
+ }
} else {
+ // For non-reasoning models or when reasoning is disabled, use the base implementation
yield* super.createMessage(systemPrompt, messages)
}
}
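End to end, the new flag is consumed as sketched below. The constructor option names and the chunk types mirror the diff and the tests; the import path and the surrounding setup are illustrative assumptions.

import { Anthropic } from "@anthropic-ai/sdk"
import { ChutesHandler } from "./chutes" // illustrative path

async function main() {
    const handler = new ChutesHandler({
        apiModelId: "zai-org/GLM-4.6-FP8",
        chutesApiKey: process.env.CHUTES_API_KEY ?? "",
        enableReasoningEffort: true, // opts hybrid reasoning models into chat_template_kwargs.thinking
    })

    const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

    for await (const chunk of handler.createMessage("You are a helpful assistant", messages)) {
        if (chunk.type === "reasoning") {
            console.log("[reasoning]", chunk.text)
        } else if (chunk.type === "text") {
            process.stdout.write(chunk.text)
        } else if (chunk.type === "usage") {
            console.log("\n[usage]", chunk.inputTokens, "in /", chunk.outputTokens, "out")
        }
    }
}

main()

Note that the handler only sends chat_template_kwargs when both conditions hold: the model id matches one of the hybrid reasoning families (DeepSeek-V3.1, GLM-4.5, GLM-4.6) and enableReasoningEffort is set, so callers that never pass the option keep the previous behavior.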