fix: sanitize unwanted "极速模式" characters from DeepSeek V3.1 responses

roomote · roomote · commit 5df2ce088296 · 2025-08-25T07:35:28.000Z
- Add sanitization logic to remove "极速模式" and its variations from DeepSeek responses
- These unwanted characters were being injected into file paths and content
- Add comprehensive unit tests to verify the sanitization works correctly
- Preserve legitimate Chinese text while removing artifacts

Fixes #7382
diff --git a/src/api/providers/__tests__/deepseek.spec.ts b/src/api/providers/__tests__/deepseek.spec.ts
@@ -260,6 +260,180 @@ describe("DeepSeekHandler", () => {
 			expect(usageChunks[0].cacheWriteTokens).toBe(8)
 			expect(usageChunks[0].cacheReadTokens).toBe(2)
 		})
+
+		it("should sanitize unwanted '极速模式' characters from response", async () => {
+			// Payload handed back when the handler asks for a non-streaming completion.
+			const completionResponse = {
+				id: "test-completion",
+				choices: [
+					{
+						message: {
+							role: "assistant",
+							content: "Test response with 极速模式 unwanted characters",
+							refusal: null,
+						},
+						finish_reason: "stop",
+						index: 0,
+					},
+				],
+				usage: {
+					prompt_tokens: 10,
+					completion_tokens: 5,
+					total_tokens: 15,
+				},
+			}
+
+			// Payload handed back for a streaming request: one polluted text delta,
+			// followed by a delta that only reports usage.
+			const streamingResponse = {
+				[Symbol.asyncIterator]: async function* () {
+					yield {
+						choices: [
+							{
+								delta: {
+									content: "Here is 极速模式 some text with 极 unwanted 速 characters 模式",
+								},
+								index: 0,
+							},
+						],
+						usage: null,
+					}
+					yield {
+						choices: [
+							{
+								delta: {},
+								index: 0,
+							},
+						],
+						usage: {
+							prompt_tokens: 10,
+							completion_tokens: 5,
+							total_tokens: 15,
+						},
+					}
+				},
+			}
+
+			mockCreate.mockImplementationOnce(async (options) =>
+				options.stream ? streamingResponse : completionResponse,
+			)
+
+			// Drain the handler's stream so every emitted chunk can be inspected.
+			const received: any[] = []
+			for await (const part of handler.createMessage(systemPrompt, messages)) {
+				received.push(part)
+			}
+
+			const textParts = received.filter((part) => part.type === "text")
+			expect(textParts).toHaveLength(1)
+
+			// The full phrase and each stray character must be gone from the single text chunk.
+			const cleaned = textParts[0].text
+			expect(cleaned).toBe("Here is some text with unwanted characters")
+			for (const artifact of ["极速模式", "极", "速", "模", "式"]) {
+				expect(cleaned).not.toContain(artifact)
+			}
+		})
+
+		it("should preserve legitimate Chinese text while removing artifacts", async () => {
+			// A genuine Chinese sentence with the artifact embedded in it, plus a lone "极"
+			// in front of a file name; the second delta only reports usage.
+			const deltas = [
+				{
+					choices: [
+						{
+							delta: {
+								content: "这是正常的中文文本极速模式，不应该被删除。File path: 极 test.txt",
+							},
+							index: 0,
+						},
+					],
+					usage: null,
+				},
+				{
+					choices: [
+						{
+							delta: {},
+							index: 0,
+						},
+					],
+					usage: {
+						prompt_tokens: 10,
+						completion_tokens: 5,
+						total_tokens: 15,
+					},
+				},
+			]
+
+			mockCreate.mockImplementationOnce(async () => ({
+				[Symbol.asyncIterator]: async function* () {
+					yield* deltas
+				},
+			}))
+
+			// Drain the handler's stream so every emitted chunk can be inspected.
+			const received: any[] = []
+			for await (const part of handler.createMessage(systemPrompt, messages)) {
+				received.push(part)
+			}
+
+			const textParts = received.filter((part) => part.type === "text")
+			expect(textParts).toHaveLength(1)
+
+			const cleaned = textParts[0].text
+			// The phrase "极速模式" and the space-delimited "极" are dropped; the sentence itself stays.
+			expect(cleaned).toBe("这是正常的中文文本，不应该被删除。File path: test.txt")
+			expect(cleaned).toContain("这是正常的中文文本")
+			expect(cleaned).not.toContain("极速模式")
+			expect(cleaned).not.toContain(" 极 ")
+		})
+
+		it("should handle reasoning content with unwanted characters", async () => {
+			// Stream one delta whose <think> section contains the artifact, followed by a
+			// delta that only reports usage.
+			mockCreate.mockImplementationOnce(async () => {
+				return {
+					[Symbol.asyncIterator]: async function* () {
+						yield {
+							choices: [
+								{
+									delta: {
+										content: "<think>Reasoning with 极速模式 artifacts</think>Regular text",
+									},
+									index: 0,
+								},
+							],
+							usage: null,
+						}
+						yield {
+							choices: [
+								{
+									delta: {},
+									index: 0,
+								},
+							],
+							usage: {
+								prompt_tokens: 10,
+								completion_tokens: 5,
+								total_tokens: 15,
+							},
+						}
+					},
+				}
+			})
+
+			const stream = handler.createMessage(systemPrompt, messages)
+			const chunks: any[] = []
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// How the <think> section is split between "reasoning" and "text" chunks is decided
+			// by the parent handler, so both kinds are collected and checked alike.
+			const contentChunks = chunks.filter((chunk) => chunk.type === "reasoning" || chunk.type === "text")
+
+			// Without this the test would pass vacuously if the stream produced no content at all.
+			expect(contentChunks.length).toBeGreaterThan(0)
+
+			for (const chunk of contentChunks) {
+				expect(chunk.text).not.toContain("极速模式")
+			}
+		})
 	})
 
 	describe("processUsageMetrics", () => {
diff --git a/src/api/providers/deepseek.ts b/src/api/providers/deepseek.ts
@@ -1,13 +1,18 @@
 import { deepSeekModels, deepSeekDefaultModelId } from "@roo-code/types"
+import { Anthropic } from "@anthropic-ai/sdk"
 
 import type { ApiHandlerOptions } from "../../shared/api"
 
-import type { ApiStreamUsageChunk } from "../transform/stream"
+import type { ApiStreamUsageChunk, ApiStream } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
+import type { ApiHandlerCreateMessageMetadata } from "../index"
 
 import { OpenAiHandler } from "./openai"
 
 export class DeepSeekHandler extends OpenAiHandler {
+	// Matches the stray "极速模式" marker (also with the traditional "極"), plus contiguous
+	// pieces of it that stand alone, i.e. are not touched by another CJK ideograph
+	// (U+4E00–U+9FFF). Every alternative consumes at least one character, so the pattern
+	// never matches the empty string, and characters inside ordinary Chinese words are left
+	// alone. The pattern carries the global flag, so `test`/`exec` advance `lastIndex`.
+	// NOTE(review): nothing in the visible part of this change references this field.
+	private readonly UNWANTED_PATTERN =
+		/[极極]速模式|(?<![\u4e00-\u9fff])(?:极速模|速模式|极速|速模|模式|[极極速模式])(?![\u4e00-\u9fff])/g
+
 	constructor(options: ApiHandlerOptions) {
 		super({
 			...options,
@@ -26,6 +31,58 @@ export class DeepSeekHandler extends OpenAiHandler {
 		return { id, info, ...params }
 	}
 
+	/**
+	 * Streams the parent handler's response, cleaning the text of every "text" and
+	 * "reasoning" chunk with `sanitizeContent` before passing it on.
+	 *
+	 * Chunks of any other type, and chunks whose text is empty, are forwarded unchanged.
+	 * The cleaned text is written back onto the chunk object received from the parent.
+	 */
+	override async *createMessage(
+		systemPrompt: string,
+		messages: Anthropic.Messages.MessageParam[],
+		metadata?: ApiHandlerCreateMessageMetadata,
+	): ApiStream {
+		for await (const chunk of super.createMessage(systemPrompt, messages, metadata)) {
+			// Both textual chunk kinds get the same treatment.
+			if ((chunk.type === "text" || chunk.type === "reasoning") && chunk.text) {
+				chunk.text = this.sanitizeContent(chunk.text)
+			}
+			yield chunk
+		}
+	}
+
+	/**
+	 * Strips the stray "极速模式" marker, and isolated leftovers of it, from one piece of
+	 * model output.
+	 *
+	 * Two passes run in order:
+	 * 1. The complete phrase "极速模式" is removed wherever it occurs, including inside
+	 *    Chinese text.
+	 * 2. Contiguous pieces of the phrase ("模式", "极", ...) are removed only when no CJK
+	 *    ideograph (U+4E00–U+9FFF) touches them, so words such as "积极" or "调试模式"
+	 *    are kept.
+	 *
+	 * Only spaces and tabs directly next to a removed marker are adjusted. All other
+	 * whitespace (newlines, indentation, leading and trailing blanks) is returned
+	 * untouched: this method is applied to individual stream chunks, where collapsing or
+	 * trimming whitespace would glue words of neighbouring chunks together and flatten
+	 * multi-line content.
+	 *
+	 * The method only sees the string it is given; a marker split across two calls is not
+	 * recognised as the complete phrase.
+	 *
+	 * @param content Text to clean.
+	 * @returns The text with the marker and its isolated pieces removed.
+	 */
+	private sanitizeContent(content: string): string {
+		const cjk = "[\\u4e00-\\u9fff]"
+		// A piece of the marker that stands alone, i.e. has no ideograph on either side.
+		const fragment = `(?<!${cjk})(?:极速模|速模式|极速|速模|模式|[极速模式])(?!${cjk})`
+
+		// Each pattern captures the blanks before (group 1) and after (group 2) a run of
+		// markers, so the gap they leave behind can be closed in one step.
+		const fullMarkers = /([ \t]*)极速模式(?:[ \t]*极速模式)*([ \t]*)/g
+		const strayFragments = new RegExp(`([ \\t]*)${fragment}(?:[ \\t]+${fragment})*([ \\t]*)`, "g")
+
+		// Decides what replaces a removed run together with the blanks captured around it.
+		const closeGap = (match: string, before: string, after: string, offset: number, whole: string): string => {
+			if (after !== "") {
+				// Blanks follow the run: keep the blanks of one side only, so the
+				// neighbouring words stay separated without a doubled gap.
+				return before !== "" ? before : after
+			}
+			// A run that ends the input takes its separating blanks with it; anywhere
+			// else the blanks stay so the neighbouring words are not merged.
+			return offset + match.length === whole.length ? "" : before
+		}
+
+		return content.replace(fullMarkers, closeGap).replace(strayFragments, closeGap)
+	}
+
 	// Override to handle DeepSeek's usage metrics, including caching.
 	protected override processUsageMetrics(usage: any): ApiStreamUsageChunk {
 		return {