Fixes #4950: Preserve Unicode characters when applying diffs

roomote · roomote · commit a1821c75c479 · 2025-06-20T18:19:31.000Z
- Added preserveUnicodeCharacters function to both multi-search-replace and multi-file-search-replace diff strategies
- The function maps Unicode characters from original content to replacement content to prevent conversion to ASCII
- Fixes issue where Unicode apostrophes (‘ ’) and quotes (“ ”) were being converted to ASCII equivalents (' and ") during diff operations
- Added comprehensive tests to verify Unicode character preservation
diff --git a/roo-code-messages.log b/roo-code-messages.log
diff --git a/src/core/diff/strategies/__tests__/unicode-preservation.test.ts b/src/core/diff/strategies/__tests__/unicode-preservation.test.ts
@@ -0,0 +1,105 @@
+import { MultiSearchReplaceDiffStrategy } from "../multi-search-replace"
+
+describe("Unicode Character Preservation", () => { // Regression tests for #4950: ASCII quotes in a diff must not overwrite the file's Unicode quotes
+	it("should preserve Unicode apostrophes when applying diffs", async () => {
+		const strategy = new MultiSearchReplaceDiffStrategy(1.0) // Exact matching
+
+		const originalContent = `This file contains Unicode apostrophes: \u2018hello\u2019 and \u201Cworld\u201D
+Another line with Unicode: \u2018test\u2019 and \u201Cexample\u201D
+Regular ASCII: 'normal' and "standard"`
+
+		const diffContent = `<<<<<<< SEARCH
+:start_line:1
+-------
+This file contains Unicode apostrophes: 'hello' and "world"
+=======
+This file contains Unicode apostrophes: 'goodbye' and "universe"
+\>>>>>>> REPLACE`
+
+		const result = await strategy.applyDiff(originalContent, diffContent)
+
+		expect(result.success).toBe(true)
+		if (result.success && result.content) { // The assertions below only run when the result carries non-empty content
+			// The file's Unicode quotes must wrap the new words taken from the REPLACE block
+			expect(result.content).toContain("\u2018goodbye\u2019") // single quotation marks U+2018/U+2019
+			expect(result.content).toContain("\u201Cuniverse\u201D") // double quotation marks U+201C/U+201D
+			// The ASCII quotes written in the REPLACE block must not reach the output
+			expect(result.content).not.toContain("'goodbye'") // no ASCII apostrophe (U+0027)
+			expect(result.content).not.toContain('"universe"') // no ASCII double quote (U+0022)
+		}
+	})
+
+	it("should preserve Unicode quotes in multi-line replacements", async () => {
+		const strategy = new MultiSearchReplaceDiffStrategy(1.0)
+
+		const originalContent = `Line 1: \u2018unicode\u2019
+Line 2: \u201Cquotes\u201D
+Line 3: normal`
+
+		const diffContent = `<<<<<<< SEARCH
+:start_line:1
+-------
+Line 1: 'unicode'
+Line 2: "quotes"
+=======
+Line 1: 'modified'
+Line 2: "changed"
+\>>>>>>> REPLACE`
+
+		const result = await strategy.applyDiff(originalContent, diffContent)
+
+		expect(result.success).toBe(true)
+		if (result.success && result.content) {
+			expect(result.content).toContain("\u2018modified\u2019") // line 1 keeps its single quotation marks
+			expect(result.content).toContain("\u201Cchanged\u201D") // line 2 keeps its double quotation marks
+		}
+	})
+
+	it("should handle mixed Unicode and ASCII quotes correctly", async () => {
+		const strategy = new MultiSearchReplaceDiffStrategy(1.0)
+
+		const originalContent = `Unicode: \u2018test\u2019 and \u201Cexample\u201D
+ASCII: 'normal' and "standard"`
+
+		const diffContent = `<<<<<<< SEARCH
+:start_line:1
+-------
+Unicode: 'test' and "example"
+=======
+Unicode: 'replaced' and "modified"
+\>>>>>>> REPLACE`
+
+		const result = await strategy.applyDiff(originalContent, diffContent)
+
+		expect(result.success).toBe(true)
+		if (result.success && result.content) {
+			// The replaced first line must keep the Unicode quotes it had in the file
+			expect(result.content).toContain("\u2018replaced\u2019")
+			expect(result.content).toContain("\u201Cmodified\u201D")
+			// The second line is outside the SEARCH block, so its ASCII quotes must be untouched
+			expect(result.content).toContain("'normal'")
+			expect(result.content).toContain('"standard"')
+		}
+	})
+
+	it("should not affect content when no Unicode characters are present", async () => {
+		const strategy = new MultiSearchReplaceDiffStrategy(1.0)
+
+		const originalContent = `Regular ASCII: 'test' and "example"`
+
+		const diffContent = `<<<<<<< SEARCH
+:start_line:1
+-------
+Regular ASCII: 'test' and "example"
+=======
+Regular ASCII: 'modified' and "changed"
+\>>>>>>> REPLACE`
+
+		const result = await strategy.applyDiff(originalContent, diffContent)
+
+		expect(result.success).toBe(true)
+		if (result.success && result.content) {
+			expect(result.content).toBe(`Regular ASCII: 'modified' and "changed"`) // ASCII-only file: the REPLACE text is expected verbatim
+		}
+	})
+})
diff --git a/src/core/diff/strategies/multi-file-search-replace.ts b/src/core/diff/strategies/multi-file-search-replace.ts
@@ -29,6 +29,39 @@ function getSimilarity(original: string, search: string): number {
 	return 1 - dist / maxLength
 }
 
+/**
+ * Restores typographic quotes in the replacement when the diff author typed ASCII quotes for text
+ * that uses Unicode quotes in the file.
+ *
+ * A quote family (double `"` or single `'`) is converted only when `originalContent` contains its
+ * Unicode form and `searchContent` does not. If the SEARCH text already carries the Unicode marks,
+ * nothing was normalized, so ASCII quotes in the replacement are intentional and are left alone
+ * (otherwise a diff could never turn smart quotes into ASCII ones).
+ *
+ * Each converted ASCII quote becomes an opening mark when it is at the start of the text, follows
+ * whitespace, an opening bracket, `=`, `:`, `/` or a dash, or directly follows an ASCII quote of the
+ * other family that is itself in opening position; in every other position it becomes a closing
+ * mark. When the original holds only one mark of a pair (for example just U+2019 apostrophes),
+ * that mark is used in both positions.
+ *
+ * @param originalContent - Matched text as it currently exists in the file.
+ * @param searchContent - SEARCH text of the diff; used to detect whether quotes were normalized.
+ * @param replaceContent - REPLACE text of the diff.
+ * @returns `replaceContent` with ASCII quotes mapped to the original's Unicode quotes where applicable.
+ */
+function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
+	// [ASCII quote, Unicode opening mark, Unicode closing mark]
+	const families: Array<[string, string, string]> = [
+		['"', "\u201C", "\u201D"],
+		["'", "\u2018", "\u2019"],
+	]
+
+	// ASCII quote -> [mark for opening position, mark for closing position]
+	const active = new Map<string, [string, string]>()
+	for (const [ascii, open, close] of families) {
+		const hasOpen = originalContent.includes(open)
+		const hasClose = originalContent.includes(close)
+		const searchKeptUnicode = searchContent.includes(open) || searchContent.includes(close)
+		if ((hasOpen || hasClose) && !searchKeptUnicode) {
+			// Fall back to the one mark that exists when the original uses only half of the pair.
+			active.set(ascii, [hasOpen ? open : close, hasClose ? close : open])
+		}
+	}
+	if (active.size === 0) {
+		return replaceContent
+	}
+
+	// Characters after which a quote is read as opening: whitespace, opening brackets, = : / and dashes.
+	const opensQuote = /[\s(\[{<=:\/\u2013\u2014-]/
+
+	let result = ""
+	let opensNext = true // the start of the text counts as an opening position
+	let lastOpened = "" // ASCII quote opened by the previous character, so that `""` closes itself
+	for (const ch of replaceContent) {
+		if (ch === '"' || ch === "'") {
+			const opening = opensNext && lastOpened !== ch
+			const marks = active.get(ch)
+			result += marks ? marks[opening ? 0 : 1] : ch
+			// A quote of the other family right after an opening quote is nested, hence also opening.
+			opensNext = opening
+			lastOpened = opening ? ch : ""
+		} else {
+			result += ch
+			opensNext = opensQuote.test(ch)
+			lastOpened = ""
+		}
+	}
+	return result
+}
+
 /**
  * Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
  * the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
@@ -650,6 +683,11 @@ Each file requires its own path, start_line, and diff elements.
 
 			// Get the matched lines from the original content
 			const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length)
+			const originalMatchedContent = matchedLines.join("\n")
+
+			// Preserve Unicode characters from the original content in the replacement
+			replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent)
+			replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)
 
 			// Get the exact indentation (preserving tabs/spaces) of each line
 			const originalIndents = matchedLines.map((line) => {
diff --git a/src/core/diff/strategies/multi-search-replace.ts b/src/core/diff/strategies/multi-search-replace.ts
@@ -16,7 +16,8 @@ function getSimilarity(original: string, search: string): number {
 		return 0
 	}
 
-	// Use the normalizeString utility to handle smart quotes and other special characters
+	// Use the normalizeString utility for comparison only, but preserve original characters
+	// This allows matching content with different quote styles without changing the actual content
 	const normalizedOriginal = normalizeString(original)
 	const normalizedSearch = normalizeString(search)
 
@@ -32,6 +33,39 @@ function getSimilarity(original: string, search: string): number {
 	return 1 - dist / maxLength
 }
 
+/**
+ * Restores typographic quotes in the replacement when the diff author typed ASCII quotes for text
+ * that uses Unicode quotes in the file.
+ *
+ * A quote family (double `"` or single `'`) is converted only when `originalContent` contains its
+ * Unicode form and `searchContent` does not. If the SEARCH text already carries the Unicode marks,
+ * nothing was normalized, so ASCII quotes in the replacement are intentional and are left alone
+ * (otherwise a diff could never turn smart quotes into ASCII ones).
+ *
+ * Each converted ASCII quote becomes an opening mark when it is at the start of the text, follows
+ * whitespace, an opening bracket, `=`, `:`, `/` or a dash, or directly follows an ASCII quote of the
+ * other family that is itself in opening position; in every other position it becomes a closing
+ * mark. When the original holds only one mark of a pair (for example just U+2019 apostrophes),
+ * that mark is used in both positions.
+ *
+ * @param originalContent - Matched text as it currently exists in the file.
+ * @param searchContent - SEARCH text of the diff; used to detect whether quotes were normalized.
+ * @param replaceContent - REPLACE text of the diff.
+ * @returns `replaceContent` with ASCII quotes mapped to the original's Unicode quotes where applicable.
+ */
+function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
+	// [ASCII quote, Unicode opening mark, Unicode closing mark]
+	const families: Array<[string, string, string]> = [
+		['"', "\u201C", "\u201D"],
+		["'", "\u2018", "\u2019"],
+	]
+
+	// ASCII quote -> [mark for opening position, mark for closing position]
+	const active = new Map<string, [string, string]>()
+	for (const [ascii, open, close] of families) {
+		const hasOpen = originalContent.includes(open)
+		const hasClose = originalContent.includes(close)
+		const searchKeptUnicode = searchContent.includes(open) || searchContent.includes(close)
+		if ((hasOpen || hasClose) && !searchKeptUnicode) {
+			// Fall back to the one mark that exists when the original uses only half of the pair.
+			active.set(ascii, [hasOpen ? open : close, hasClose ? close : open])
+		}
+	}
+	if (active.size === 0) {
+		return replaceContent
+	}
+
+	// Characters after which a quote is read as opening: whitespace, opening brackets, = : / and dashes.
+	const opensQuote = /[\s(\[{<=:\/\u2013\u2014-]/
+
+	let result = ""
+	let opensNext = true // the start of the text counts as an opening position
+	let lastOpened = "" // ASCII quote opened by the previous character, so that `""` closes itself
+	for (const ch of replaceContent) {
+		if (ch === '"' || ch === "'") {
+			const opening = opensNext && lastOpened !== ch
+			const marks = active.get(ch)
+			result += marks ? marks[opening ? 0 : 1] : ch
+			// A quote of the other family right after an opening quote is nested, hence also opening.
+			opensNext = opening
+			lastOpened = opening ? ch : ""
+		} else {
+			result += ch
+			opensNext = opensQuote.test(ch)
+			lastOpened = ""
+		}
+	}
+	return result
+}
+
 /**
  * Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
  * the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
@@ -550,6 +584,11 @@ Only use a single line of '=======' between search and replacement content, beca
 
 			// Get the matched lines from the original content
 			const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length)
+			const originalMatchedContent = matchedLines.join("\n")
+
+			// Preserve Unicode characters from the original content in the replacement
+			replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent)
+			replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)
 
 			// Get the exact indentation (preserving tabs/spaces) of each line
 			const originalIndents = matchedLines.map((line) => {