Skip to content

Commit 5df2ce0

Browse files
committed
fix: sanitize unwanted "极速模式" characters from DeepSeek V3.1 responses
- Add sanitization logic to remove "极速模式" and its variations from DeepSeek responses
- These unwanted characters were being injected into file paths and content
- Add comprehensive unit tests to verify the sanitization works correctly
- Preserve legitimate Chinese text while removing artifacts

Fixes #7382
1 parent 0c481a3 commit 5df2ce0

File tree

2 files changed

+232
-1
lines changed

2 files changed

+232
-1
lines changed

src/api/providers/__tests__/deepseek.spec.ts

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,180 @@ describe("DeepSeekHandler", () => {
260260
expect(usageChunks[0].cacheWriteTokens).toBe(8)
261261
expect(usageChunks[0].cacheReadTokens).toBe(2)
262262
})
263+
264+
it("should sanitize unwanted '极速模式' characters from response", async () => {
265+
// Mock a response with unwanted characters
266+
mockCreate.mockImplementationOnce(async (options) => {
267+
if (!options.stream) {
268+
return {
269+
id: "test-completion",
270+
choices: [
271+
{
272+
message: {
273+
role: "assistant",
274+
content: "Test response with 极速模式 unwanted characters",
275+
refusal: null,
276+
},
277+
finish_reason: "stop",
278+
index: 0,
279+
},
280+
],
281+
usage: {
282+
prompt_tokens: 10,
283+
completion_tokens: 5,
284+
total_tokens: 15,
285+
},
286+
}
287+
}
288+
289+
// Return async iterator for streaming with unwanted characters
290+
return {
291+
[Symbol.asyncIterator]: async function* () {
292+
yield {
293+
choices: [
294+
{
295+
delta: {
296+
content: "Here is 极速模式 some text with 极 unwanted 速 characters 模式",
297+
},
298+
index: 0,
299+
},
300+
],
301+
usage: null,
302+
}
303+
yield {
304+
choices: [
305+
{
306+
delta: {},
307+
index: 0,
308+
},
309+
],
310+
usage: {
311+
prompt_tokens: 10,
312+
completion_tokens: 5,
313+
total_tokens: 15,
314+
},
315+
}
316+
},
317+
}
318+
})
319+
320+
const stream = handler.createMessage(systemPrompt, messages)
321+
const chunks: any[] = []
322+
for await (const chunk of stream) {
323+
chunks.push(chunk)
324+
}
325+
326+
const textChunks = chunks.filter((chunk) => chunk.type === "text")
327+
expect(textChunks).toHaveLength(1)
328+
// The unwanted characters should be removed
329+
expect(textChunks[0].text).toBe("Here is some text with unwanted characters")
330+
expect(textChunks[0].text).not.toContain("极速模式")
331+
expect(textChunks[0].text).not.toContain("极")
332+
expect(textChunks[0].text).not.toContain("速")
333+
expect(textChunks[0].text).not.toContain("模")
334+
expect(textChunks[0].text).not.toContain("式")
335+
})
336+
337+
it("should preserve legitimate Chinese text while removing artifacts", async () => {
338+
// Mock a response with both legitimate Chinese text and unwanted artifacts
339+
mockCreate.mockImplementationOnce(async (options) => {
340+
// Return async iterator for streaming
341+
return {
342+
[Symbol.asyncIterator]: async function* () {
343+
yield {
344+
choices: [
345+
{
346+
delta: {
347+
content: "这是正常的中文文本极速模式,不应该被删除。File path: 极 test.txt",
348+
},
349+
index: 0,
350+
},
351+
],
352+
usage: null,
353+
}
354+
yield {
355+
choices: [
356+
{
357+
delta: {},
358+
index: 0,
359+
},
360+
],
361+
usage: {
362+
prompt_tokens: 10,
363+
completion_tokens: 5,
364+
total_tokens: 15,
365+
},
366+
}
367+
},
368+
}
369+
})
370+
371+
const stream = handler.createMessage(systemPrompt, messages)
372+
const chunks: any[] = []
373+
for await (const chunk of stream) {
374+
chunks.push(chunk)
375+
}
376+
377+
const textChunks = chunks.filter((chunk) => chunk.type === "text")
378+
expect(textChunks).toHaveLength(1)
379+
// Should remove "极速模式" phrase and isolated "极" between spaces
380+
expect(textChunks[0].text).toBe("这是正常的中文文本,不应该被删除。File path: test.txt")
381+
expect(textChunks[0].text).toContain("这是正常的中文文本")
382+
expect(textChunks[0].text).not.toContain("极速模式")
383+
// The isolated "极" between spaces should be removed
384+
expect(textChunks[0].text).not.toContain(" 极 ")
385+
})
386+
387+
it("should handle reasoning content with unwanted characters", async () => {
388+
// Mock a response with reasoning content containing unwanted characters
389+
mockCreate.mockImplementationOnce(async (options) => {
390+
return {
391+
[Symbol.asyncIterator]: async function* () {
392+
yield {
393+
choices: [
394+
{
395+
delta: {
396+
content: "<think>Reasoning with 极速模式 artifacts</think>Regular text",
397+
},
398+
index: 0,
399+
},
400+
],
401+
usage: null,
402+
}
403+
yield {
404+
choices: [
405+
{
406+
delta: {},
407+
index: 0,
408+
},
409+
],
410+
usage: {
411+
prompt_tokens: 10,
412+
completion_tokens: 5,
413+
total_tokens: 15,
414+
},
415+
}
416+
},
417+
}
418+
})
419+
420+
const stream = handler.createMessage(systemPrompt, messages)
421+
const chunks: any[] = []
422+
for await (const chunk of stream) {
423+
chunks.push(chunk)
424+
}
425+
426+
// Check both reasoning and text chunks
427+
const reasoningChunks = chunks.filter((chunk) => chunk.type === "reasoning")
428+
const textChunks = chunks.filter((chunk) => chunk.type === "text")
429+
430+
if (reasoningChunks.length > 0) {
431+
expect(reasoningChunks[0].text).not.toContain("极速模式")
432+
}
433+
if (textChunks.length > 0) {
434+
expect(textChunks[0].text).not.toContain("极速模式")
435+
}
436+
})
263437
})
264438

265439
describe("processUsageMetrics", () => {

src/api/providers/deepseek.ts

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
import { deepSeekModels, deepSeekDefaultModelId } from "@roo-code/types"
2+
import { Anthropic } from "@anthropic-ai/sdk"
23

34
import type { ApiHandlerOptions } from "../../shared/api"
45

5-
import type { ApiStreamUsageChunk } from "../transform/stream"
6+
import type { ApiStreamUsageChunk, ApiStream } from "../transform/stream"
67
import { getModelParams } from "../transform/model-params"
8+
import type { ApiHandlerCreateMessageMetadata } from "../index"
79

810
import { OpenAiHandler } from "./openai"
911

1012
export class DeepSeekHandler extends OpenAiHandler {
13+
// Pattern to match unwanted "极速模式" characters and its variations
14+
private readonly UNWANTED_PATTERN = /[][][][]|[]|[]?[]?[]?/g
15+
1116
constructor(options: ApiHandlerOptions) {
1217
super({
1318
...options,
@@ -26,6 +31,58 @@ export class DeepSeekHandler extends OpenAiHandler {
2631
return { id, info, ...params }
2732
}
2833

34+
override async *createMessage(
35+
systemPrompt: string,
36+
messages: Anthropic.Messages.MessageParam[],
37+
metadata?: ApiHandlerCreateMessageMetadata,
38+
): ApiStream {
39+
// Get the stream from the parent class
40+
const stream = super.createMessage(systemPrompt, messages, metadata)
41+
42+
// Process each chunk to remove unwanted characters
43+
for await (const chunk of stream) {
44+
if (chunk.type === "text" && chunk.text) {
45+
// Sanitize the text content
46+
chunk.text = this.sanitizeContent(chunk.text)
47+
} else if (chunk.type === "reasoning" && chunk.text) {
48+
// Also sanitize reasoning content
49+
chunk.text = this.sanitizeContent(chunk.text)
50+
}
51+
yield chunk
52+
}
53+
}
54+
55+
/**
56+
* Removes unwanted "极速模式" characters from the content.
57+
* These characters appear to be injected by some DeepSeek V3.1 configurations.
58+
*/
59+
private sanitizeContent(content: string): string {
60+
// First, try to remove the complete phrase "极速模式"
61+
let sanitized = content.replace(/极速模式/g, "")
62+
63+
// Remove partial sequences like "模式" that might remain
64+
sanitized = sanitized.replace(/模式(?![一-龿])/g, "")
65+
66+
// Remove isolated occurrences of these characters when they appear
67+
// between non-Chinese characters or at boundaries
68+
// Using more specific patterns to avoid removing legitimate Chinese text
69+
sanitized = sanitized.replace(/(?<![一-龿])极(?![一-龿])/g, "")
70+
sanitized = sanitized.replace(/(?<![一-龿])速(?![一-龿])/g, "")
71+
sanitized = sanitized.replace(/(?<![一-龿])模(?![一-龿])/g, "")
72+
sanitized = sanitized.replace(/(?<![一-龿])式(?![一-龿])/g, "")
73+
74+
// Handle cases where these characters appear with spaces
75+
sanitized = sanitized.replace(/\s+极\s*/g, " ")
76+
sanitized = sanitized.replace(/\s+速\s*/g, " ")
77+
sanitized = sanitized.replace(/\s+模\s*/g, " ")
78+
sanitized = sanitized.replace(/\s+式\s*/g, " ")
79+
80+
// Clean up any resulting multiple spaces
81+
sanitized = sanitized.replace(/\s+/g, " ").trim()
82+
83+
return sanitized
84+
}
85+
2986
// Override to handle DeepSeek's usage metrics, including caching.
3087
protected override processUsageMetrics(usage: any): ApiStreamUsageChunk {
3188
return {

0 commit comments

Comments
 (0)