1 change: 1 addition & 0 deletions packages/types/src/provider-settings.ts
@@ -190,6 +190,7 @@ const openAiSchema = baseProviderSettingsSchema.extend({
openAiStreamingEnabled: z.boolean().optional(),
openAiHostHeader: z.string().optional(), // Keep temporarily for backward compatibility during migration.
openAiHeaders: z.record(z.string(), z.string()).optional(),
openAiSkipSystemMessage: z.boolean().optional(), // Skip system message for models that auto-add BOS tokens (e.g., llama.cpp with --jinja)
Contributor Author:
This comment is helpful, but could we expand it to explain when users should enable this option? For example: 'Enable this if you see duplicate BOS token warnings with DeepSeek V3.1 and llama.cpp'
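
For instance, the expanded comment could read as follows (suggested wording only, combining the existing comment with the reviewer's example; not the author's text):

openAiSkipSystemMessage: z.boolean().optional(), // Skip the system message for models whose chat template auto-adds a BOS token (e.g., llama.cpp with --jinja). Enable this if you see duplicate BOS token warnings with DeepSeek V3.1.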

})

const ollamaSchema = baseProviderSettingsSchema.extend({
231 changes: 231 additions & 0 deletions src/api/providers/__tests__/openai-deepseek-bos.spec.ts
@@ -0,0 +1,231 @@
import { describe, it, expect, vi, beforeEach } from "vitest"
import OpenAI from "openai"
import { Anthropic } from "@anthropic-ai/sdk"

import { OpenAiHandler } from "../openai"
import type { ApiHandlerOptions } from "../../../shared/api"

vi.mock("openai")

describe("OpenAI Handler - DeepSeek V3 BOS Token Handling", () => {
Contributor Author:
Great test coverage! Consider adding a few edge cases:

  • What happens when the system prompt is empty? (a rough sketch follows this list)
  • Behavior with complex message content (arrays with multiple text/image parts)?
  • Interaction with R1 format when both openAiR1FormatEnabled and openAiSkipSystemMessage are true?
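
A rough sketch of the first suggested case, reusing the mocks and option shapes from the tests in this file (whether an empty prompt should still trigger the merge is an open question, so the assertions below are a guess, not verified behavior):

it("should not emit a separate system message when the system prompt is empty", async () => {
	const options: ApiHandlerOptions = {
		openAiApiKey: "test-key",
		openAiModelId: "deepseek-v3",
		openAiBaseUrl: "http://localhost:11434/v1",
		openAiStreamingEnabled: true,
		openAiSkipSystemMessage: true,
	}

	const handler = new OpenAiHandler(options)
	const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

	// Empty system prompt: the request should still contain no system-role message.
	const stream = handler.createMessage("", messages)
	for await (const chunk of stream) {
		// Consume stream
	}

	const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
	expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
	expect(callArgs.messages[0].role).toBe("user")
})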

let mockOpenAIClient: any
let mockStream: any

beforeEach(() => {
vi.clearAllMocks()

// Create a mock async generator for streaming
mockStream = (async function* () {
yield {
choices: [{ delta: { content: "Test response" } }],
usage: { prompt_tokens: 10, completion_tokens: 5 },
}
})()

mockOpenAIClient = {
chat: {
completions: {
create: vi.fn().mockResolvedValue(mockStream),
},
},
}

vi.mocked(OpenAI).mockImplementation(() => mockOpenAIClient as any)
})

describe("Streaming mode", () => {
it("should skip system message when openAiSkipSystemMessage is true for DeepSeek V3", async () => {
const options: ApiHandlerOptions = {
openAiApiKey: "test-key",
openAiModelId: "deepseek-v3",
openAiBaseUrl: "http://localhost:11434/v1",
openAiStreamingEnabled: true,
openAiSkipSystemMessage: true,
}

const handler = new OpenAiHandler(options)
const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

const stream = handler.createMessage(systemPrompt, messages)
const results = []
for await (const chunk of stream) {
results.push(chunk)
}

expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
expect.objectContaining({
messages: expect.arrayContaining([
expect.objectContaining({
role: "user",
content: expect.stringContaining("You are a helpful assistant"),
}),
]),
}),
expect.any(Object),
)

// Verify system message is not included separately
const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
})

it("should include system message normally when openAiSkipSystemMessage is false", async () => {
const options: ApiHandlerOptions = {
openAiApiKey: "test-key",
openAiModelId: "deepseek-v3",
openAiBaseUrl: "http://localhost:11434/v1",
openAiStreamingEnabled: true,
openAiSkipSystemMessage: false,
}

const handler = new OpenAiHandler(options)
const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

const stream = handler.createMessage(systemPrompt, messages)
const results = []
for await (const chunk of stream) {
results.push(chunk)
}

expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
expect.objectContaining({
messages: expect.arrayContaining([
expect.objectContaining({
role: "system",
content: "You are a helpful assistant",
}),
]),
}),
expect.any(Object),
)
})

it("should handle case when no user message exists", async () => {
const options: ApiHandlerOptions = {
openAiApiKey: "test-key",
openAiModelId: "deepseek-v3.1",
openAiBaseUrl: "http://localhost:11434/v1",
openAiStreamingEnabled: true,
openAiSkipSystemMessage: true,
}

const handler = new OpenAiHandler(options)
const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "assistant", content: "Previous response" }]

const stream = handler.createMessage(systemPrompt, messages)
const results = []
for await (const chunk of stream) {
results.push(chunk)
}

// Should create a user message with system prompt
expect(mockOpenAIClient.chat.completions.create).toHaveBeenCalledWith(
expect.objectContaining({
messages: expect.arrayContaining([
expect.objectContaining({
role: "user",
content: "You are a helpful assistant",
}),
]),
}),
expect.any(Object),
)
})
})

describe("Non-streaming mode", () => {
beforeEach(() => {
mockOpenAIClient.chat.completions.create = vi.fn().mockResolvedValue({
choices: [{ message: { content: "Test response" } }],
usage: { prompt_tokens: 10, completion_tokens: 5 },
})
})

it("should skip system message in non-streaming mode when configured", async () => {
const options: ApiHandlerOptions = {
openAiApiKey: "test-key",
openAiModelId: "deepseek-v3",
openAiBaseUrl: "http://localhost:11434/v1",
openAiStreamingEnabled: false,
openAiSkipSystemMessage: true,
}

const handler = new OpenAiHandler(options)
const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

const stream = handler.createMessage(systemPrompt, messages)
const results = []
for await (const chunk of stream) {
results.push(chunk)
}

const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
// First message should be user message with merged system prompt
expect(callArgs.messages[0]).toMatchObject({
role: "user",
content: expect.stringContaining("You are a helpful assistant"),
})
// No separate system message
expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
})
})

describe("Model detection", () => {
it.each(["deepseek-v3", "deepseek-v3.1", "DeepSeek-V3", "DEEPSEEK-V3.1", "deepseek-chat"])(
"should detect %s as DeepSeek model when skipSystemMessage is enabled",
async (modelId) => {
const options: ApiHandlerOptions = {
openAiApiKey: "test-key",
openAiModelId: modelId,
openAiBaseUrl: "http://localhost:11434/v1",
openAiStreamingEnabled: true,
openAiSkipSystemMessage: true,
}

const handler = new OpenAiHandler(options)
const systemPrompt = "System prompt"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "User message" }]

const stream = handler.createMessage(systemPrompt, messages)
for await (const chunk of stream) {
// Consume stream
}

const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
// Should merge system prompt into user message
expect(callArgs.messages[0].content).toContain("System prompt")
expect(callArgs.messages.find((m: any) => m.role === "system")).toBeUndefined()
},
)

it("should not apply skip logic to non-DeepSeek models", async () => {
const options: ApiHandlerOptions = {
openAiApiKey: "test-key",
openAiModelId: "gpt-4",
openAiBaseUrl: "http://localhost:11434/v1",
openAiStreamingEnabled: true,
openAiSkipSystemMessage: true,
}

const handler = new OpenAiHandler(options)
const systemPrompt = "System prompt"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "User message" }]

const stream = handler.createMessage(systemPrompt, messages)
for await (const chunk of stream) {
// Consume stream
}

const callArgs = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
// Should still have system message for non-DeepSeek models
expect(callArgs.messages[0]).toMatchObject({
role: "system",
content: "System prompt",
})
})
})
})
54 changes: 49 additions & 5 deletions src/api/providers/openai.ts
@@ -105,8 +105,28 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl

let convertedMessages

// Check if we should skip system message for DeepSeek V3 models with llama.cpp
const skipSystemMessage =
Contributor Author:
Instead of hardcoding this for DeepSeek, could this feature be useful for other llama.cpp deployments? Consider renaming the option to something more generic like mergeSystemIntoFirstUser to indicate the behavior rather than the specific use case.

this.options.openAiSkipSystemMessage &&
(modelId.toLowerCase().includes("deepseek") || modelId.toLowerCase().includes("deepseek-v3"))
Contributor Author:
The model detection using includes("deepseek") might be too broad and could match unintended models. Would it be more robust to use a specific list of model IDs or a regex pattern?
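
One way to tighten it, as a sketch only (the pattern and constant name below are a suggestion, not part of this PR):

// Hypothetical stricter detection: match known DeepSeek V3-family model IDs instead of any substring hit.
const DEEPSEEK_V3_PATTERN = /deepseek[-_]?(v3(\.\d+)?|chat)\b/i
const skipSystemMessage = this.options.openAiSkipSystemMessage && DEEPSEEK_V3_PATTERN.test(modelId)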


if (deepseekReasoner) {
convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
} else if (skipSystemMessage) {
Contributor Author:
I notice there's duplicate logic here between streaming (lines 108-129) and non-streaming (lines 248-268) modes. Could we extract this into a helper method like prepareMessagesWithSkipSystemMessage() to reduce duplication and improve maintainability?
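
A sketch of that refactor, reusing the merge logic from this diff so both code paths could call one method (the method name, placement, and return type are hypothetical):

// Hypothetical private helper on OpenAiHandler; the streaming and non-streaming paths would both call it.
private mergeSystemPromptIntoUserMessages(
	systemPrompt: string,
	messages: Anthropic.Messages.MessageParam[],
): OpenAI.Chat.Completions.ChatCompletionMessageParam[] {
	const firstUserIndex = messages.findIndex((msg) => msg.role === "user")
	if (firstUserIndex === -1) {
		// No user message exists, so create one that carries the system prompt.
		return convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
	}
	const modified = [...messages]
	const firstUserMessage = modified[firstUserIndex]
	modified[firstUserIndex] = {
		...firstUserMessage,
		content: `${systemPrompt}\n\n${
			typeof firstUserMessage.content === "string"
				? firstUserMessage.content
				: JSON.stringify(firstUserMessage.content)
		}`,
	}
	return convertToOpenAiMessages(modified)
}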

// For DeepSeek V3 with llama.cpp, merge system prompt into first user message to avoid duplicate BOS
const firstUserMessage = messages.find((msg) => msg.role === "user")
if (firstUserMessage) {
const modifiedMessages = [...messages]
const firstUserIndex = modifiedMessages.findIndex((msg) => msg.role === "user")
modifiedMessages[firstUserIndex] = {
...firstUserMessage,
content: `${systemPrompt}\n\n${typeof firstUserMessage.content === "string" ? firstUserMessage.content : JSON.stringify(firstUserMessage.content)}`,
}
convertedMessages = convertToOpenAiMessages(modifiedMessages)
} else {
// If no user message, create one with the system prompt
convertedMessages = convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
}
} else if (ark || enabledLegacyFormat) {
convertedMessages = [systemMessage, ...convertToSimpleMessages(messages)]
} else {
@@ -224,13 +244,37 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
content: systemPrompt,
}

// Check if we should skip system message for DeepSeek V3 models with llama.cpp
const skipSystemMessage =
this.options.openAiSkipSystemMessage &&
(modelId.toLowerCase().includes("deepseek") || modelId.toLowerCase().includes("deepseek-v3"))

let messagesForRequest
if (deepseekReasoner) {
messagesForRequest = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
} else if (skipSystemMessage) {
// For DeepSeek V3 with llama.cpp, merge system prompt into first user message
const firstUserMessage = messages.find((msg) => msg.role === "user")
if (firstUserMessage) {
const modifiedMessages = [...messages]
const firstUserIndex = modifiedMessages.findIndex((msg) => msg.role === "user")
modifiedMessages[firstUserIndex] = {
...firstUserMessage,
content: `${systemPrompt}\n\n${typeof firstUserMessage.content === "string" ? firstUserMessage.content : JSON.stringify(firstUserMessage.content)}`,
}
messagesForRequest = convertToOpenAiMessages(modifiedMessages)
} else {
messagesForRequest = convertToOpenAiMessages([{ role: "user", content: systemPrompt }, ...messages])
}
} else if (enabledLegacyFormat) {
messagesForRequest = [systemMessage, ...convertToSimpleMessages(messages)]
} else {
messagesForRequest = [systemMessage, ...convertToOpenAiMessages(messages)]
}

const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
model: modelId,
messages: deepseekReasoner
? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
: enabledLegacyFormat
? [systemMessage, ...convertToSimpleMessages(messages)]
: [systemMessage, ...convertToOpenAiMessages(messages)],
messages: messagesForRequest,
}

// Add max_tokens if needed