fix: prevent duplicate BOS tokens with DeepSeek V3.1 in llama.cpp #7501
New test file:

@@ -0,0 +1,231 @@

```typescript
import { describe, it, expect, vi, beforeEach } from "vitest"
import OpenAI from "openai"
import { Anthropic } from "@anthropic-ai/sdk"

import { OpenAiHandler } from "../openai"
import type { ApiHandlerOptions } from "../../../shared/api"

vi.mock("openai")

describe("OpenAI Handler - DeepSeek V3 BOS Token Handling", () => {
```
Contributor (Author): Great test coverage! Consider adding a few more edge cases.
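For instance, a test like the following could cover a first user message with array (multimodal) content, which the handler stringifies when merging in the system prompt. This is a sketch only: it assumes the `mockOpenAIClient`/`beforeEach` setup from this file, and the test name and exact expectations are suggestions rather than part of the PR.

```typescript
// Suggested edge case (sketch): relies on the mockOpenAIClient created in beforeEach above.
it("should merge the system prompt when the first user message has array content", async () => {
	const options: ApiHandlerOptions = {
		openAiApiKey: "test-key",
		openAiModelId: "deepseek-v3.1",
		openAiBaseUrl: "http://localhost:11434/v1",
		openAiStreamingEnabled: true,
		openAiSkipSystemMessage: true,
	}

	const handler = new OpenAiHandler(options)
	const messages: Anthropic.Messages.MessageParam[] = [
		{ role: "user", content: [{ type: "text", text: "Hello" }] },
	]

	const stream = handler.createMessage("You are a helpful assistant", messages)
	for await (const chunk of stream) {
		// Consume stream
	}

	const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
	// The merged first message should carry the system prompt; no separate system message should remain.
	expect(callArgs.messages[0].role).toBe("user")
	expect(callArgs.messages[0].content).toContain("You are a helpful assistant")
	expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
})
```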
```typescript
	let mockOpenAIClient: any
	let mockStream: any

	beforeEach(() => {
		vi.clearAllMocks()

		// Create a mock async generator for streaming
		mockStream = (async function* () {
			yield {
				choices: [{ delta: { content: "Test response" } }],
				usage: { prompt_tokens: 10, completion_tokens: 5 },
			}
		})()

		mockOpenAIClient = {
			chat: {
				completions: {
					create: vi.fn().mockResolvedValue(mockStream),
				},
			},
		}

		vi.mocked(OpenAI).mockImplementation(() => mockOpenAIClient as any)
	})

	describe("Streaming mode", () => {
		it("should skip system message when openAiSkipSystemMessage is true for DeepSeek V3", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
				expect.objectContaining({
					messages: expect.arrayContaining([
						expect.objectContaining({
							role: "user",
							content: expect.stringContaining("You are a helpful assistant"),
						}),
					]),
				}),
				expect.any(Object),
			)

			// Verify system message is not included separately
			const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
			expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
		})

		it("should include system message normally when openAiSkipSystemMessage is false", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: false,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
				expect.objectContaining({
					messages: expect.arrayContaining([
						expect.objectContaining({
							role: "system",
							content: "You are a helpful assistant",
						}),
					]),
				}),
				expect.any(Object),
			)
		})

		it("should handle case when no user message exists", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3.1",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "assistant", content: "Previous response" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			// Should create a user message with system prompt
			expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
				expect.objectContaining({
					messages: expect.arrayContaining([
						expect.objectContaining({
							role: "user",
							content: "You are a helpful assistant",
						}),
					]),
				}),
				expect.any(Object),
			)
		})
	})

	describe("Non-streaming mode", () => {
		beforeEach(() => {
			mockOpenAIClient.chat.completions.create = vi.fn().mockResolvedValue({
				choices: [{ message: { content: "Test response" } }],
				usage: { prompt_tokens: 10, completion_tokens: 5 },
			})
		})

		it("should skip system message in non-streaming mode when configured", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: false,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
			// First message should be user message with merged system prompt
			expect(callArgs.messages[0]).toMatchObject({
				role: "user",
				content: expect.stringContaining("You are a helpful assistant"),
			})
			// No separate system message
			expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
		})
	})

	describe("Model detection", () => {
		it.each(["deepseek-v3", "deepseek-v3.1", "DeepSeek-V3", "DEEPSEEK-V3.1", "deepseek-chat"])(
			"should detect %s as DeepSeek model when skipSystemMessage is enabled",
			async (modelId) => {
				const options: ApiHandlerOptions = {
					openAiApiKey: "test-key",
					openAiModelId: modelId,
					openAiBaseUrl: "http://localhost:11434/v1",
					openAiStreamingEnabled: true,
					openAiSkipSystemMessage: true,
				}

				const handler = new OpenAiHandler(options)
				const systemPrompt = "System prompt"
				const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "User message" }]

				const stream = handler.createMessage(systemPrompt, messages)
				for await (const chunk of stream) {
					// Consume stream
				}

				const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
				// Should merge system prompt into user message
				expect(callArgs.messages[0].content).toContain("System prompt")
				expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
			},
		)

		it("should not apply skip logic to non-DeepSeek models", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "gpt-4",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "System prompt"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "User message" }]

			const stream = handler.createMessage(systemPrompt, messages)
			for await (const chunk of stream) {
				// Consume stream
			}

			const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
			// Should still have system message for non-DeepSeek models
			expect(callArgs.messages[0]).toMatchObject({
				role: "system",
				content: "System prompt",
			})
		})
	})
})
```
Changes to the OpenAI handler:

@@ -105,8 +105,28 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl

```typescript
		let convertedMessages

		// Check if we should skip system message for DeepSeek V3 models with llama.cpp
		const skipSystemMessage =
```
Contributor (Author): Rather than hardcoding this for DeepSeek, could this feature be useful for other llama.cpp deployments? If so, consider renaming the option to something more generic.
```typescript
			this.options.openAiSkipSystemMessage &&
			(modelId.toLowerCase().includes("deepseek") || modelId.toLowerCase().includes("deepseek-v3"))
```
Contributor (Author): The model detection using `includes("deepseek")` makes the second `includes("deepseek-v3")` check redundant, since any ID containing "deepseek-v3" already contains "deepseek". It also matches every DeepSeek variant, not just V3/V3.1.
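If the goal is a narrower match, one possible shape (a sketch only, not what this PR implements; the regex and variable name are assumptions) would be an explicit V3-family check, leaving R1/reasoner IDs to the existing `deepseekReasoner` branch:

```typescript
// Sketch: match only the V3 family ("deepseek-chat" is included because the
// model-detection tests in this PR treat it as V3-family).
const isDeepSeekV3Family = /deepseek[-_]?(v3|chat)/i.test(modelId)
const skipSystemMessage = (this.options.openAiSkipSystemMessage ?? false) && isDeepSeekV3Family
```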
```typescript
		if (deepseekReasoner) {
			convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
		} else if (skipSystemMessage) {
```
Contributor (Author): I notice there's duplicate logic here between streaming (lines 108-129) and non-streaming (lines 248-268) modes. Could we extract this into a shared helper method?
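One possible shape for that (a sketch only; the method name and placement are hypothetical, not part of this PR):

```typescript
// Hypothetical private helper on OpenAiHandler; both the streaming and
// non-streaming paths could call it instead of duplicating the merge logic.
private mergeSystemPromptIntoMessages(
	systemPrompt: string,
	messages: Anthropic.Messages.MessageParam[],
): Anthropic.Messages.MessageParam[] {
	const firstUserIndex = messages.findIndex((msg) => msg.role === "user")
	if (firstUserIndex === -1) {
		// No user message: prepend one carrying the system prompt.
		return [{ role: "user", content: systemPrompt }, ...messages]
	}
	const firstUserMessage = messages[firstUserIndex]
	const merged = [...messages]
	merged[firstUserIndex] = {
		...firstUserMessage,
		content: `${systemPrompt}\n\n${
			typeof firstUserMessage.content === "string"
				? firstUserMessage.content
				: JSON.stringify(firstUserMessage.content)
		}`,
	}
	return merged
}
```

Each call site would then reduce to `convertToOpenAiMessages(this.mergeSystemPromptIntoMessages(systemPrompt, messages))`.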
```typescript
			// For DeepSeek V3 with llama.cpp, merge system prompt into first user message to avoid duplicate BOS
			const firstUserMessage = messages.find((msg) => msg.role === "user")
			if (firstUserMessage) {
				const modifiedMessages = [...messages]
				const firstUserIndex = modifiedMessages.findIndex((msg) => msg.role === "user")
				modifiedMessages[firstUserIndex] = {
					...firstUserMessage,
					content: `${systemPrompt}\n\n${typeof firstUserMessage.content === "string" ? firstUserMessage.content : JSON.stringify(firstUserMessage.content)}`,
				}
				convertedMessages = convertToOpenAiMessages(modifiedMessages)
			} else {
				// If no user message, create one with the system prompt
				convertedMessages = convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
			}
		} else if (ark || enabledLegacyFormat) {
			convertedMessages = [systemMessage, ...convertToSimpleMessages(messages)]
		} else {
```

@@ -224,13 +244,37 @@

```typescript
			content: systemPrompt,
		}

		// Check if we should skip system message for DeepSeek V3 models with llama.cpp
		const skipSystemMessage =
			this.options.openAiSkipSystemMessage &&
			(modelId.toLowerCase().includes("deepseek") || modelId.toLowerCase().includes("deepseek-v3"))

		let messagesForRequest
		if (deepseekReasoner) {
			messagesForRequest = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
		} else if (skipSystemMessage) {
			// For DeepSeek V3 with llama.cpp, merge system prompt into first user message
			const firstUserMessage = messages.find((msg) => msg.role === "user")
			if (firstUserMessage) {
				const modifiedMessages = [...messages]
				const firstUserIndex = modifiedMessages.findIndex((msg) => msg.role === "user")
				modifiedMessages[firstUserIndex] = {
					...firstUserMessage,
					content: `${systemPrompt}\n\n${typeof firstUserMessage.content === "string" ? firstUserMessage.content : JSON.stringify(firstUserMessage.content)}`,
				}
				messagesForRequest = convertToOpenAiMessages(modifiedMessages)
			} else {
				messagesForRequest = convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
			}
		} else if (enabledLegacyFormat) {
			messagesForRequest = [systemMessage, ...convertToSimpleMessages(messages)]
		} else {
			messagesForRequest = [systemMessage, ...convertToOpenAiMessages(messages)]
		}
```

The request construction then uses the precomputed messages in place of the previous inline ternary:

```diff
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 			model: modelId,
-			messages: deepseekReasoner
-				? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
-				: enabledLegacyFormat
-					? [systemMessage, ...convertToSimpleMessages(messages)]
-					: [systemMessage, ...convertToOpenAiMessages(messages)],
+			messages: messagesForRequest,
 		}

 		// Add max_tokens if needed
```
This comment is helpful, but could we expand it to explain when users should enable this option? For example: "Enable this if you see duplicate BOS token warnings with DeepSeek V3.1 and llama.cpp".
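One possible expanded wording (a suggestion only, assuming it sits wherever the `openAiSkipSystemMessage` option is declared or documented):

```typescript
// Merge the system prompt into the first user message instead of sending a separate
// system message. Enable this if you see duplicate BOS token warnings when running
// DeepSeek V3 / V3.1 behind llama.cpp's OpenAI-compatible server.
openAiSkipSystemMessage?: boolean
```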