diff --git a/packages/types/src/providers/chutes.ts b/packages/types/src/providers/chutes.ts index 15dea58263..8d85bb59c6 100644 --- a/packages/types/src/providers/chutes.ts +++ b/packages/types/src/providers/chutes.ts @@ -6,6 +6,8 @@ export type ChutesModelId = | "deepseek-ai/DeepSeek-R1" | "deepseek-ai/DeepSeek-V3" | "deepseek-ai/DeepSeek-V3.1" + | "deepseek-ai/DeepSeek-V3.1-Terminus" + | "deepseek-ai/DeepSeek-V3.1-turbo" | "unsloth/Llama-3.3-70B-Instruct" | "chutesai/Llama-4-Scout-17B-16E-Instruct" | "unsloth/Mistral-Nemo-Instruct-2407" @@ -29,6 +31,7 @@ export type ChutesModelId = | "tngtech/DeepSeek-R1T-Chimera" | "zai-org/GLM-4.5-Air" | "zai-org/GLM-4.5-FP8" + | "zai-org/GLM-4.6-FP8" | "moonshotai/Kimi-K2-Instruct-75k" | "moonshotai/Kimi-K2-Instruct-0905" | "Qwen/Qwen3-235B-A22B-Thinking-2507" @@ -70,10 +73,31 @@ export const chutesModels = { contextWindow: 163840, supportsImages: false, supportsPromptCache: false, + supportsReasoningEffort: true, inputPrice: 0, outputPrice: 0, description: "DeepSeek V3.1 model.", }, + "deepseek-ai/DeepSeek-V3.1-Terminus": { + maxTokens: 32768, + contextWindow: 163840, + supportsImages: false, + supportsPromptCache: false, + supportsReasoningEffort: true, + inputPrice: 0, + outputPrice: 0, + description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.", + }, + "deepseek-ai/DeepSeek-V3.1-turbo": { + maxTokens: 32768, + contextWindow: 163840, + supportsImages: false, + supportsPromptCache: false, + supportsReasoningEffort: true, + inputPrice: 0, + outputPrice: 0, + description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.", + }, "unsloth/Llama-3.3-70B-Instruct": { maxTokens: 32768, // From Groq contextWindow: 131072, // From Groq @@ -259,6 +283,7 @@ export const chutesModels = { contextWindow: 151329, supportsImages: false, supportsPromptCache: false, + supportsReasoningEffort: true, inputPrice: 0, outputPrice: 0, description: @@ -269,11 +294,23 @@ export const chutesModels = { contextWindow: 131072, supportsImages: false, supportsPromptCache: false, + supportsReasoningEffort: true, inputPrice: 0, outputPrice: 0, description: "GLM-4.5-FP8 model with 128k token context window, optimized for agent-based applications with MoE architecture.", }, + "zai-org/GLM-4.6-FP8": { + maxTokens: 32768, + contextWindow: 204800, + supportsImages: false, + supportsPromptCache: false, + supportsReasoningEffort: true, + inputPrice: 0, + outputPrice: 0, + description: + "GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. 
Improved reasoning, coding, and agent capabilities.", + }, "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": { maxTokens: 32768, contextWindow: 262144, diff --git a/src/api/providers/__tests__/chutes.spec.ts b/src/api/providers/__tests__/chutes.spec.ts index 398f86ce60..dbf5c77712 100644 --- a/src/api/providers/__tests__/chutes.spec.ts +++ b/src/api/providers/__tests__/chutes.spec.ts @@ -253,6 +253,30 @@ describe("ChutesHandler", () => { ) }) + it("should return zai-org/GLM-4.6-FP8 model with correct configuration", () => { + const testModelId: ChutesModelId = "zai-org/GLM-4.6-FP8" + const handlerWithModel = new ChutesHandler({ + apiModelId: testModelId, + chutesApiKey: "test-chutes-api-key", + }) + const model = handlerWithModel.getModel() + expect(model.id).toBe(testModelId) + expect(model.info).toEqual( + expect.objectContaining({ + maxTokens: 32768, + contextWindow: 204800, + supportsImages: false, + supportsPromptCache: false, + supportsReasoningEffort: true, + inputPrice: 0, + outputPrice: 0, + description: + "GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.", + temperature: 0.5, // Default temperature for non-DeepSeek models + }), + ) + }) + it("should return Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 model with correct configuration", () => { const testModelId: ChutesModelId = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8" const handlerWithModel = new ChutesHandler({ @@ -297,6 +321,52 @@ describe("ChutesHandler", () => { ) }) + it("should return DeepSeek V3.1 Terminus model with correct configuration", () => { + const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-Terminus" + const handlerWithModel = new ChutesHandler({ + apiModelId: testModelId, + chutesApiKey: "test-chutes-api-key", + }) + const model = handlerWithModel.getModel() + expect(model.id).toBe(testModelId) + expect(model.info).toEqual( + expect.objectContaining({ + maxTokens: 32768, + contextWindow: 163840, + supportsImages: false, + supportsPromptCache: false, + supportsReasoningEffort: true, + inputPrice: 0, + outputPrice: 0, + description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.", + temperature: 0.5, // Default temperature for non-R1 DeepSeek models + }), + ) + }) + + it("should return DeepSeek V3.1 turbo model with correct configuration", () => { + const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-turbo" + const handlerWithModel = new ChutesHandler({ + apiModelId: testModelId, + chutesApiKey: "test-chutes-api-key", + }) + const model = handlerWithModel.getModel() + expect(model.id).toBe(testModelId) + expect(model.info).toEqual( + expect.objectContaining({ + maxTokens: 32768, + contextWindow: 163840, + supportsImages: false, + supportsPromptCache: false, + supportsReasoningEffort: true, + inputPrice: 0, + outputPrice: 0, + description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.", + temperature: 0.5, // Default temperature for non-R1 DeepSeek models + }), + ) + }) + it("should return moonshotai/Kimi-K2-Instruct-0905 model with correct configuration", () => { const testModelId: ChutesModelId = "moonshotai/Kimi-K2-Instruct-0905" const handlerWithModel = new ChutesHandler({ @@ -470,4 +540,137 @@ describe("ChutesHandler", () => { const model = handlerWithModel.getModel() expect(model.info.temperature).toBe(0.5) }) + + it("should enable reasoning for DeepSeek V3.1 models when enableReasoningEffort is true", async () => { + const modelId: ChutesModelId = 
"deepseek-ai/DeepSeek-V3.1" + const handlerWithModel = new ChutesHandler({ + apiModelId: modelId, + chutesApiKey: "test-chutes-api-key", + enableReasoningEffort: true, + }) + + mockCreate.mockImplementationOnce(async () => ({ + [Symbol.asyncIterator]: async function* () { + // First yield reasoning content + yield { + choices: [{ delta: { reasoning_content: "Let me think about this..." } }], + } + // Then yield regular content + yield { + choices: [{ delta: { content: "Here's my response." } }], + } + // Finally yield usage + yield { + choices: [], + usage: { prompt_tokens: 100, completion_tokens: 50 }, + } + }, + })) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }] + + const stream = handlerWithModel.createMessage(systemPrompt, messages) + const chunks = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // Should parse reasoning content and regular content separately + expect(chunks).toContainEqual({ type: "reasoning", text: "Let me think about this..." }) + expect(chunks).toContainEqual({ type: "text", text: "Here's my response." }) + expect(chunks).toContainEqual({ type: "usage", inputTokens: 100, outputTokens: 50 }) + + // Verify that the API was called with reasoning enabled + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + chat_template_kwargs: { + thinking: true, + }, + }), + ) + }) + + it("should enable reasoning for GLM-4.5 models when enableReasoningEffort is true", async () => { + const modelId: ChutesModelId = "zai-org/GLM-4.5-Air" + const handlerWithModel = new ChutesHandler({ + apiModelId: modelId, + chutesApiKey: "test-chutes-api-key", + enableReasoningEffort: true, + }) + + mockCreate.mockImplementationOnce(async () => ({ + [Symbol.asyncIterator]: async function* () { + // First yield reasoning content + yield { + choices: [{ delta: { reasoning_content: "GLM reasoning process..." } }], + } + // Then yield regular content + yield { + choices: [{ delta: { content: "GLM response" } }], + } + // Finally yield usage + yield { + choices: [], + usage: { prompt_tokens: 100, completion_tokens: 50 }, + } + }, + })) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }] + + const stream = handlerWithModel.createMessage(systemPrompt, messages) + const chunks = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // Should parse reasoning content separately + expect(chunks).toContainEqual({ type: "reasoning", text: "GLM reasoning process..." 
}) + expect(chunks).toContainEqual({ type: "text", text: "GLM response" }) + + // Verify that the API was called with reasoning enabled + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + chat_template_kwargs: { + thinking: true, + }, + }), + ) + }) + + it.skip("should disable reasoning for DeepSeek V3.1 models when enableReasoningEffort is false", async () => { + const modelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1" + const handlerWithModel = new ChutesHandler({ + apiModelId: modelId, + chutesApiKey: "test-chutes-api-key", + enableReasoningEffort: false, + }) + + mockCreate.mockImplementationOnce(async () => ({ + [Symbol.asyncIterator]: async function* () { + yield { + choices: [{ delta: { content: "Reasoning contentRegular content" } }], + } + yield { + choices: [], + usage: { prompt_tokens: 100, completion_tokens: 50 }, + } + }, + })) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }] + + const stream = handlerWithModel.createMessage(systemPrompt, messages) + const chunks = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // Should NOT parse reasoning content when disabled + expect(chunks).toContainEqual({ type: "text", text: "Reasoning contentRegular content" }) + expect(chunks).not.toContainEqual({ type: "reasoning", text: "Reasoning content" }) + }) }) diff --git a/src/api/providers/chutes.ts b/src/api/providers/chutes.ts index 62121bd19d..37e23fc776 100644 --- a/src/api/providers/chutes.ts +++ b/src/api/providers/chutes.ts @@ -3,6 +3,7 @@ import { Anthropic } from "@anthropic-ai/sdk" import OpenAI from "openai" import type { ApiHandlerOptions } from "../../shared/api" +import { shouldUseReasoningEffort } from "../../shared/api" import { XmlMatcher } from "../../utils/xml-matcher" import { convertToR1Format } from "../transform/r1-format" import { convertToOpenAiMessages } from "../transform/openai-format" @@ -26,6 +27,7 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider { private getCompletionParams( systemPrompt: string, messages: Anthropic.Messages.MessageParam[], + enableReasoning: boolean = false, ): OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming { const { id: model, @@ -34,7 +36,7 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider { const temperature = this.options.modelTemperature ?? 
this.getModel().info.temperature - return { + const params: any = { model, max_tokens, temperature, @@ -42,11 +44,21 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider { stream: true, stream_options: { include_usage: true }, } + + // Add reasoning support for DeepSeek V3.1, GLM-4.5, and GLM-4.6 models + if (enableReasoning) { + params.chat_template_kwargs = { + thinking: true, + } + } + + return params } override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream { const model = this.getModel() + // Handle DeepSeek R1 models with XML tag parsing if (model.id.includes("DeepSeek-R1")) { const stream = await this.client.chat.completions.create({ ...this.getCompletionParams(systemPrompt, messages), @@ -84,7 +96,48 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider { for (const processedChunk of matcher.final()) { yield processedChunk } + return + } + + // Handle DeepSeek V3.1, GLM-4.5, and GLM-4.6 models with reasoning_content parsing + const isHybridReasoningModel = + model.id.includes("DeepSeek-V3.1") || model.id.includes("GLM-4.5") || model.id.includes("GLM-4.6") + const reasoningEnabled = this.options.enableReasoningEffort === true + + if (isHybridReasoningModel && reasoningEnabled) { + const stream = await this.client.chat.completions.create( + this.getCompletionParams(systemPrompt, messages, true), + ) + + for await (const chunk of stream) { + const delta = chunk.choices[0]?.delta + + // Handle reasoning content from the response + if ((delta as any)?.reasoning_content) { + yield { + type: "reasoning", + text: (delta as any).reasoning_content, + } + } + + // Handle regular text content + if (delta?.content) { + yield { + type: "text", + text: delta.content, + } + } + + if (chunk.usage) { + yield { + type: "usage", + inputTokens: chunk.usage.prompt_tokens || 0, + outputTokens: chunk.usage.completion_tokens || 0, + } + } + } } else { + // For non-reasoning models or when reasoning is disabled, use the base implementation yield* super.createMessage(systemPrompt, messages) } }
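
For reference, a minimal caller-side sketch of the new reasoning path (not part of the diff itself): the option names, model ID, and stream chunk shapes are taken from the tests above, while the import paths and the CHUTES_API_KEY environment variable are illustrative assumptions.

import { Anthropic } from "@anthropic-ai/sdk"
import { ChutesHandler } from "../src/api/providers/chutes" // path is an assumption; adjust to where this snippet lives

async function demoGlm46Reasoning() {
	// enableReasoningEffort: true makes the handler send chat_template_kwargs: { thinking: true }
	// and surface reasoning_content deltas as separate "reasoning" chunks.
	const handler = new ChutesHandler({
		apiModelId: "zai-org/GLM-4.6-FP8",
		chutesApiKey: process.env.CHUTES_API_KEY ?? "", // hypothetical env var for illustration
		enableReasoningEffort: true,
	})

	const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

	for await (const chunk of handler.createMessage("You are a helpful assistant", messages)) {
		if (chunk.type === "reasoning") {
			console.log("[thinking]", chunk.text)
		} else if (chunk.type === "text") {
			console.log(chunk.text)
		} else if (chunk.type === "usage") {
			console.log(`tokens in/out: ${chunk.inputTokens}/${chunk.outputTokens}`)
		}
	}
}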