Commit bd283b7

fix: add openAiSkipSystemMessage option to prevent duplicate BOS tokens with DeepSeek V3.1
- Added openAiSkipSystemMessage configuration option for OpenAI Compatible providers
- When enabled for DeepSeek models, merges system prompt into first user message
- Prevents duplicate BOS tokens when using llama.cpp with --jinja flag
- Added comprehensive tests for the new functionality

Fixes #7500
1 parent aee531a commit bd283b7
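
In effect (a sketch of the intended behavior rather than code from this commit; the message text is illustrative), enabling the option changes the request body from a separate system message to a single merged user message:

// Before: the system prompt travels as its own message. With llama.cpp --jinja and
// DeepSeek V3.1 chat templates this is what produces the duplicate BOS token.
const messagesBefore = [
	{ role: "system", content: "You are a helpful assistant" },
	{ role: "user", content: "Hello" },
]

// After: with openAiSkipSystemMessage enabled for a DeepSeek model, the system prompt
// is folded into the first user message, joined with a blank line (see the diff below).
const messagesAfter = [{ role: "user", content: "You are a helpful assistant\n\nHello" }]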

File tree

3 files changed (+281 −5 lines)

packages/types/src/provider-settings.ts

Lines changed: 1 addition & 0 deletions
@@ -190,6 +190,7 @@ const openAiSchema = baseProviderSettingsSchema.extend({
 	openAiStreamingEnabled: z.boolean().optional(),
 	openAiHostHeader: z.string().optional(), // Keep temporarily for backward compatibility during migration.
 	openAiHeaders: z.record(z.string(), z.string()).optional(),
+	openAiSkipSystemMessage: z.boolean().optional(), // Skip system message for models that auto-add BOS tokens (e.g., llama.cpp with --jinja)
 })

 const ollamaSchema = baseProviderSettingsSchema.extend({

Lines changed: 231 additions & 0 deletions
@@ -0,0 +1,231 @@
import { describe, it, expect, vi, beforeEach } from "vitest"
import OpenAI from "openai"
import { Anthropic } from "@anthropic-ai/sdk"

import { OpenAiHandler } from "../openai"
import type { ApiHandlerOptions } from "../../../shared/api"

vi.mock("openai")

describe("OpenAI Handler - DeepSeek V3 BOS Token Handling", () => {
	let mockOpenAIClient: any
	let mockStream: any

	beforeEach(() => {
		vi.clearAllMocks()

		// Create a mock async generator for streaming
		mockStream = (async function* () {
			yield {
				choices: [{ delta: { content: "Test response" } }],
				usage: { prompt_tokens: 10, completion_tokens: 5 },
			}
		})()

		mockOpenAIClient = {
			chat: {
				completions: {
					create: vi.fn().mockResolvedValue(mockStream),
				},
			},
		}

		vi.mocked(OpenAI).mockImplementation(() => mockOpenAIClient as any)
	})

	describe("Streaming mode", () => {
		it("should skip system message when openAiSkipSystemMessage is true for DeepSeek V3", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
				expect.objectContaining({
					messages: expect.arrayContaining([
						expect.objectContaining({
							role: "user",
							content: expect.stringContaining("You are a helpful assistant"),
						}),
					]),
				}),
				expect.any(Object),
			)

			// Verify system message is not included separately
			const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
			expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
		})

		it("should include system message normally when openAiSkipSystemMessage is false", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: false,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
				expect.objectContaining({
					messages: expect.arrayContaining([
						expect.objectContaining({
							role: "system",
							content: "You are a helpful assistant",
						}),
					]),
				}),
				expect.any(Object),
			)
		})

		it("should handle case when no user message exists", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3.1",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "assistant", content: "Previous response" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			// Should create a user message with system prompt
			expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
				expect.objectContaining({
					messages: expect.arrayContaining([
						expect.objectContaining({
							role: "user",
							content: "You are a helpful assistant",
						}),
					]),
				}),
				expect.any(Object),
			)
		})
	})

	describe("Non-streaming mode", () => {
		beforeEach(() => {
			mockOpenAIClient.chat.completions.create = vi.fn().mockResolvedValue({
				choices: [{ message: { content: "Test response" } }],
				usage: { prompt_tokens: 10, completion_tokens: 5 },
			})
		})

		it("should skip system message in non-streaming mode when configured", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "deepseek-v3",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: false,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "You are a helpful assistant"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

			const stream = handler.createMessage(systemPrompt, messages)
			const results = []
			for await (const chunk of stream) {
				results.push(chunk)
			}

			const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
			// First message should be user message with merged system prompt
			expect(callArgs.messages[0]).toMatchObject({
				role: "user",
				content: expect.stringContaining("You are a helpful assistant"),
			})
			// No separate system message
			expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
		})
	})

	describe("Model detection", () => {
		it.each(["deepseek-v3", "deepseek-v3.1", "DeepSeek-V3", "DEEPSEEK-V3.1", "deepseek-chat"])(
			"should detect %s as DeepSeek model when skipSystemMessage is enabled",
			async (modelId) => {
				const options: ApiHandlerOptions = {
					openAiApiKey: "test-key",
					openAiModelId: modelId,
					openAiBaseUrl: "http://localhost:11434/v1",
					openAiStreamingEnabled: true,
					openAiSkipSystemMessage: true,
				}

				const handler = new OpenAiHandler(options)
				const systemPrompt = "System prompt"
				const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "User message" }]

				const stream = handler.createMessage(systemPrompt, messages)
				for await (const chunk of stream) {
					// Consume stream
				}

				const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
				// Should merge system prompt into user message
				expect(callArgs.messages[0].content).toContain("System prompt")
				expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
			},
		)

		it("should not apply skip logic to non-DeepSeek models", async () => {
			const options: ApiHandlerOptions = {
				openAiApiKey: "test-key",
				openAiModelId: "gpt-4",
				openAiBaseUrl: "http://localhost:11434/v1",
				openAiStreamingEnabled: true,
				openAiSkipSystemMessage: true,
			}

			const handler = new OpenAiHandler(options)
			const systemPrompt = "System prompt"
			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "User message" }]

			const stream = handler.createMessage(systemPrompt, messages)
			for await (const chunk of stream) {
				// Consume stream
			}

			const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
			// Should still have system message for non-DeepSeek models
			expect(callArgs.messages[0]).toMatchObject({
				role: "system",
				content: "System prompt",
			})
		})
	})
})
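
As a usage sketch (not part of this commit), the handler exercised by these tests would be driven the same way outside the harness; the base URL, API key, and model id below are placeholders, and the import paths simply mirror the test file above:

import { Anthropic } from "@anthropic-ai/sdk"

import { OpenAiHandler } from "../openai"
import type { ApiHandlerOptions } from "../../../shared/api"

// Placeholder settings for a llama.cpp server on localhost serving a DeepSeek V3.1 model.
const options: ApiHandlerOptions = {
	openAiApiKey: "not-needed-for-local",
	openAiModelId: "deepseek-v3.1",
	openAiBaseUrl: "http://localhost:8080/v1",
	openAiStreamingEnabled: true,
	openAiSkipSystemMessage: true,
}

async function main() {
	const handler = new OpenAiHandler(options)
	const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

	// createMessage yields stream chunks, exactly as the tests above consume them.
	for await (const chunk of handler.createMessage("You are a helpful assistant", messages)) {
		console.log(chunk)
	}
}

main()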

src/api/providers/openai.ts

Lines changed: 49 additions & 5 deletions
@@ -105,8 +105,28 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 		let convertedMessages

+		// Check if we should skip system message for DeepSeek V3 models with llama.cpp
+		const skipSystemMessage =
+			this.options.openAiSkipSystemMessage &&
+			(modelId.toLowerCase().includes("deepseek") || modelId.toLowerCase().includes("deepseek-v3"))
+
 		if (deepseekReasoner) {
 			convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
+		} else if (skipSystemMessage) {
+			// For DeepSeek V3 with llama.cpp, merge system prompt into first user message to avoid duplicate BOS
+			const firstUserMessage = messages.find((msg) => msg.role === "user")
+			if (firstUserMessage) {
+				const modifiedMessages = [...messages]
+				const firstUserIndex = modifiedMessages.findIndex((msg) => msg.role === "user")
+				modifiedMessages[firstUserIndex] = {
+					...firstUserMessage,
+					content: `${systemPrompt}\n\n${typeof firstUserMessage.content === "string" ? firstUserMessage.content : JSON.stringify(firstUserMessage.content)}`,
+				}
+				convertedMessages = convertToOpenAiMessages(modifiedMessages)
+			} else {
+				// If no user message, create one with the system prompt
+				convertedMessages = convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
+			}
 		} else if (ark || enabledLegacyFormat) {
 			convertedMessages = [systemMessage, ...convertToSimpleMessages(messages)]
 		} else {

@@ -224,13 +244,37 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 			content: systemPrompt,
 		}

+		// Check if we should skip system message for DeepSeek V3 models with llama.cpp
+		const skipSystemMessage =
+			this.options.openAiSkipSystemMessage &&
+			(modelId.toLowerCase().includes("deepseek") || modelId.toLowerCase().includes("deepseek-v3"))
+
+		let messagesForRequest
+		if (deepseekReasoner) {
+			messagesForRequest = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
+		} else if (skipSystemMessage) {
+			// For DeepSeek V3 with llama.cpp, merge system prompt into first user message
+			const firstUserMessage = messages.find((msg) => msg.role === "user")
+			if (firstUserMessage) {
+				const modifiedMessages = [...messages]
+				const firstUserIndex = modifiedMessages.findIndex((msg) => msg.role === "user")
+				modifiedMessages[firstUserIndex] = {
+					...firstUserMessage,
+					content: `${systemPrompt}\n\n${typeof firstUserMessage.content === "string" ? firstUserMessage.content : JSON.stringify(firstUserMessage.content)}`,
+				}
+				messagesForRequest = convertToOpenAiMessages(modifiedMessages)
+			} else {
+				messagesForRequest = convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
+			}
+		} else if (enabledLegacyFormat) {
+			messagesForRequest = [systemMessage, ...convertToSimpleMessages(messages)]
+		} else {
+			messagesForRequest = [systemMessage, ...convertToOpenAiMessages(messages)]
+		}
+
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 			model: modelId,
-			messages: deepseekReasoner
-				? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
-				: enabledLegacyFormat
-					? [systemMessage, ...convertToSimpleMessages(messages)]
-					: [systemMessage, ...convertToOpenAiMessages(messages)],
+			messages: messagesForRequest,
 		}

 		// Add max_tokens if needed
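
One detail worth calling out in the hunks above: when the first user message carries structured (non-string) content, the merge falls back to JSON.stringify. A minimal sketch of what that produces (the content block is illustrative):

const systemPrompt = "You are a helpful assistant"
const firstUserMessage = {
	role: "user" as const,
	content: [{ type: "text" as const, text: "Hello" }],
}

// Mirrors the merge in the diff: string content is concatenated, anything else is stringified.
const merged = `${systemPrompt}\n\n${
	typeof firstUserMessage.content === "string"
		? firstUserMessage.content
		: JSON.stringify(firstUserMessage.content)
}`
// merged === 'You are a helpful assistant\n\n[{"type":"text","text":"Hello"}]'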
