Skip to content

Commit a1821c7

Browse files
committed
Fixes #4950: Preserve Unicode characters when applying diffs
- Added preserveUnicodeCharacters function to both multi-search-replace and multi-file-search-replace diff strategies
- The function maps Unicode characters from original content to replacement content to prevent conversion to ASCII
- Fixes issue where Unicode apostrophes (‘ ’) and quotes (“ ”) were being converted to ASCII equivalents (' and ") during diff operations
- Added comprehensive tests to verify Unicode character preservation
1 parent 2e2f83b commit a1821c7

File tree

4 files changed

+1381
-1
lines changed

4 files changed

+1381
-1
lines changed

roo-code-messages.log

Lines changed: 1198 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import { MultiSearchReplaceDiffStrategy } from "../multi-search-replace"
2+
3+
describe("Unicode Character Preservation", () => {
	/**
	 * Applies `diffContent` to `originalContent` using an exact-match strategy and
	 * returns the patched text.
	 *
	 * The test fails immediately when the diff is not applied or yields no content,
	 * so the assertions that follow in each test can never be skipped silently.
	 */
	async function applyExactDiff(originalContent: string, diffContent: string): Promise<string> {
		const strategy = new MultiSearchReplaceDiffStrategy(1.0) // Exact matching
		const result = await strategy.applyDiff(originalContent, diffContent)

		expect(result.success).toBe(true)
		if (!result.success || result.content == null) {
			throw new Error("applyDiff reported success without returning content")
		}
		return result.content
	}

	it("should preserve Unicode apostrophes when applying diffs", async () => {
		const originalContent = `This file contains Unicode apostrophes: \u2018hello\u2019 and \u201Cworld\u201D
Another line with Unicode: \u2018test\u2019 and \u201Cexample\u201D
Regular ASCII: 'normal' and "standard"`

		// The SEARCH/REPLACE sections deliberately use ASCII quotes where the file has typographic ones.
		const diffContent = `<<<<<<< SEARCH
:start_line:1
-------
This file contains Unicode apostrophes: 'hello' and "world"
=======
This file contains Unicode apostrophes: 'goodbye' and "universe"
\>>>>>>> REPLACE`

		const content = await applyExactDiff(originalContent, diffContent)

		// The replacement must carry the typographic quotes of the line it replaced.
		expect(content).toContain("\u2018goodbye\u2019") // U+2018 / U+2019
		expect(content).toContain("\u201Cuniverse\u201D") // U+201C / U+201D
		// The ASCII forms from the diff must not leak into the result.
		expect(content).not.toContain("'goodbye'")
		expect(content).not.toContain('"universe"')
	})

	it("should preserve Unicode quotes in multi-line replacements", async () => {
		const originalContent = `Line 1: \u2018unicode\u2019
Line 2: \u201Cquotes\u201D
Line 3: normal`

		const diffContent = `<<<<<<< SEARCH
:start_line:1
-------
Line 1: 'unicode'
Line 2: "quotes"
=======
Line 1: 'modified'
Line 2: "changed"
\>>>>>>> REPLACE`

		const content = await applyExactDiff(originalContent, diffContent)

		expect(content).toContain("\u2018modified\u2019")
		expect(content).toContain("\u201Cchanged\u201D")
	})

	it("should handle mixed Unicode and ASCII quotes correctly", async () => {
		const originalContent = `Unicode: \u2018test\u2019 and \u201Cexample\u201D
ASCII: 'normal' and "standard"`

		const diffContent = `<<<<<<< SEARCH
:start_line:1
-------
Unicode: 'test' and "example"
=======
Unicode: 'replaced' and "modified"
\>>>>>>> REPLACE`

		const content = await applyExactDiff(originalContent, diffContent)

		// Should preserve Unicode in the replaced line
		expect(content).toContain("\u2018replaced\u2019")
		expect(content).toContain("\u201Cmodified\u201D")
		// Should keep ASCII in the unchanged line
		expect(content).toContain("'normal'")
		expect(content).toContain('"standard"')
	})

	it("should not affect content when no Unicode characters are present", async () => {
		const originalContent = `Regular ASCII: 'test' and "example"`

		const diffContent = `<<<<<<< SEARCH
:start_line:1
-------
Regular ASCII: 'test' and "example"
=======
Regular ASCII: 'modified' and "changed"
\>>>>>>> REPLACE`

		const content = await applyExactDiff(originalContent, diffContent)

		expect(content).toBe(`Regular ASCII: 'modified' and "changed"`)
	})
})

src/core/diff/strategies/multi-file-search-replace.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,39 @@ function getSimilarity(original: string, search: string): number {
2929
return 1 - dist / maxLength
3030
}
3131

32+
/**
 * Restores typographic ("smart") quotes in replacement text when the original
 * content being replaced used them.
 *
 * For each ASCII quote character (`"` and `'`), the original content is checked
 * for the matching opening (U+201C / U+2018) and closing (U+201D / U+2019) forms:
 * - neither form present: the ASCII quote is left untouched;
 * - only one form present: every ASCII quote becomes that form (this covers
 *   U+2019 used purely as an apostrophe);
 * - both forms present: each ASCII quote becomes the opening form when it starts
 *   the text or follows whitespace, an opening bracket, a dash or another opening
 *   quote, and the closing form otherwise.
 *
 * @param originalContent - Text from the file that is about to be replaced.
 * @param searchContent - Search text from the diff. Not used by this function;
 *   kept so existing call sites remain valid.
 * @param replaceContent - Replacement text from the diff.
 * @returns `replaceContent` with ASCII quotes converted as described above.
 */
function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
	// One entry per ASCII quote, together with its two typographic counterparts.
	const quoteFamilies = [
		{ pattern: /"/g, opening: "\u201C", closing: "\u201D" }, // “ ”
		{ pattern: /'/g, opening: "\u2018", closing: "\u2019" }, // ‘ ’
	]
	// Characters after which a quote is treated as an opening quote.
	const precedesOpeningQuote = /[\s(\[{<\u2013\u2014\u2018\u201C]/

	let result = replaceContent
	for (const { pattern, opening, closing } of quoteFamilies) {
		const hasOpening = originalContent.includes(opening)
		const hasClosing = originalContent.includes(closing)
		if (!hasOpening && !hasClosing) {
			continue
		}

		result = result.replace(pattern, (_match: string, offset: number, whole: string) => {
			// With a single form in the original there is nothing to choose between.
			if (!hasClosing) {
				return opening
			}
			if (!hasOpening) {
				return closing
			}
			// charAt yields "" for offset 0, which marks the start of the text.
			const previous = whole.charAt(offset - 1)
			return previous === "" || precedesOpeningQuote.test(previous) ? opening : closing
		})
	}

	return result
}
64+
3265
/**
3366
* Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
3467
* the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
@@ -650,6 +683,11 @@ Each file requires its own path, start_line, and diff elements.
650683

651684
// Get the matched lines from the original content
652685
const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length)
686+
const originalMatchedContent = matchedLines.join("\n")
687+
688+
// Preserve Unicode characters from the original content in the replacement
689+
replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent)
690+
replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)
653691

654692
// Get the exact indentation (preserving tabs/spaces) of each line
655693
const originalIndents = matchedLines.map((line) => {

src/core/diff/strategies/multi-search-replace.ts

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ function getSimilarity(original: string, search: string): number {
1616
return 0
1717
}
1818

19-
// Use the normalizeString utility to handle smart quotes and other special characters
19+
// Use the normalizeString utility for comparison only, but preserve original characters
20+
// This allows matching content with different quote styles without changing the actual content
2021
const normalizedOriginal = normalizeString(original)
2122
const normalizedSearch = normalizeString(search)
2223

@@ -32,6 +33,39 @@ function getSimilarity(original: string, search: string): number {
3233
return 1 - dist / maxLength
3334
}
3435

36+
/**
 * Restores typographic ("smart") quotes in replacement text when the original
 * content being replaced used them.
 *
 * For each ASCII quote character (`"` and `'`), the original content is checked
 * for the matching opening (U+201C / U+2018) and closing (U+201D / U+2019) forms:
 * - neither form present: the ASCII quote is left untouched;
 * - only one form present: every ASCII quote becomes that form (this covers
 *   U+2019 used purely as an apostrophe);
 * - both forms present: each ASCII quote becomes the opening form when it starts
 *   the text or follows whitespace, an opening bracket, a dash or another opening
 *   quote, and the closing form otherwise.
 *
 * @param originalContent - Text from the file that is about to be replaced.
 * @param searchContent - Search text from the diff. Not used by this function;
 *   kept so existing call sites remain valid.
 * @param replaceContent - Replacement text from the diff.
 * @returns `replaceContent` with ASCII quotes converted as described above.
 */
function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
	// One entry per ASCII quote, together with its two typographic counterparts.
	const quoteFamilies = [
		{ pattern: /"/g, opening: "\u201C", closing: "\u201D" }, // “ ”
		{ pattern: /'/g, opening: "\u2018", closing: "\u2019" }, // ‘ ’
	]
	// Characters after which a quote is treated as an opening quote.
	const precedesOpeningQuote = /[\s(\[{<\u2013\u2014\u2018\u201C]/

	let result = replaceContent
	for (const { pattern, opening, closing } of quoteFamilies) {
		const hasOpening = originalContent.includes(opening)
		const hasClosing = originalContent.includes(closing)
		if (!hasOpening && !hasClosing) {
			continue
		}

		result = result.replace(pattern, (_match: string, offset: number, whole: string) => {
			// With a single form in the original there is nothing to choose between.
			if (!hasClosing) {
				return opening
			}
			if (!hasOpening) {
				return closing
			}
			// charAt yields "" for offset 0, which marks the start of the text.
			const previous = whole.charAt(offset - 1)
			return previous === "" || precedesOpeningQuote.test(previous) ? opening : closing
		})
	}

	return result
}
68+
3569
/**
3670
* Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
3771
* the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
@@ -550,6 +584,11 @@ Only use a single line of '=======' between search and replacement content, beca
550584

551585
// Get the matched lines from the original content
552586
const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length)
587+
const originalMatchedContent = matchedLines.join("\n")
588+
589+
// Preserve Unicode characters from the original content in the replacement
590+
replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent)
591+
replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)
553592

554593
// Get the exact indentation (preserving tabs/spaces) of each line
555594
const originalIndents = matchedLines.map((line) => {

0 commit comments

Comments
 (0)