import { MultiSearchReplaceDiffStrategy } from "../multi-search-replace"

/**
 * Runs `diff` against `source` using a strategy constructed with a threshold of 1.0
 * (exact matching), asserts that the application reported success, and hands back the
 * resulting content.
 *
 * @returns The patched content, or `undefined` when the result carries no truthy content.
 */
async function applyExactDiff(source: string, diff: string): Promise<string | undefined> {
	const exactStrategy = new MultiSearchReplaceDiffStrategy(1.0)
	const outcome = await exactStrategy.applyDiff(source, diff)

	expect(outcome.success).toBe(true)
	return outcome.success && outcome.content ? outcome.content : undefined
}

describe("Unicode Character Preservation", () => {
	it("should preserve Unicode apostrophes when applying diffs", async () => {
		const source = `This file contains Unicode apostrophes: \u2018hello\u2019 and \u201Cworld\u201D
Another line with Unicode: \u2018test\u2019 and \u201Cexample\u201D
Regular ASCII: 'normal' and "standard"`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
This file contains Unicode apostrophes: 'hello' and "world"
=======
This file contains Unicode apostrophes: 'goodbye' and "universe"
\>>>>>>> REPLACE`

		const updated = await applyExactDiff(source, patch)
		if (updated === undefined) {
			return
		}

		// The typographic quotes of the original must survive the replacement
		expect(updated).toContain("\u2018goodbye\u2019") // single quotes U+2018/U+2019
		expect(updated).toContain("\u201Cuniverse\u201D") // double quotes U+201C/U+201D
		// ...and the ASCII spellings from the diff must not leak into the output
		expect(updated).not.toContain("'goodbye'")
		expect(updated).not.toContain('"universe"')
	})

	it("should preserve Unicode quotes in multi-line replacements", async () => {
		const source = `Line 1: \u2018unicode\u2019
Line 2: \u201Cquotes\u201D
Line 3: normal`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
Line 1: 'unicode'
Line 2: "quotes"
=======
Line 1: 'modified'
Line 2: "changed"
\>>>>>>> REPLACE`

		const updated = await applyExactDiff(source, patch)
		if (updated === undefined) {
			return
		}

		expect(updated).toContain("\u2018modified\u2019")
		expect(updated).toContain("\u201Cchanged\u201D")
	})

	it("should handle mixed Unicode and ASCII quotes correctly", async () => {
		const source = `Unicode: \u2018test\u2019 and \u201Cexample\u201D
ASCII: 'normal' and "standard"`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
Unicode: 'test' and "example"
=======
Unicode: 'replaced' and "modified"
\>>>>>>> REPLACE`

		const updated = await applyExactDiff(source, patch)
		if (updated === undefined) {
			return
		}

		// The replaced line keeps the typographic quotes
		expect(updated).toContain("\u2018replaced\u2019")
		expect(updated).toContain("\u201Cmodified\u201D")
		// The line outside the search block keeps its ASCII quotes
		expect(updated).toContain("'normal'")
		expect(updated).toContain('"standard"')
	})

	it("should not affect content when no Unicode characters are present", async () => {
		const source = `Regular ASCII: 'test' and "example"`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
Regular ASCII: 'test' and "example"
=======
Regular ASCII: 'modified' and "changed"
\>>>>>>> REPLACE`

		const updated = await applyExactDiff(source, patch)
		if (updated === undefined) {
			return
		}

		expect(updated).toBe(`Regular ASCII: 'modified' and "changed"`)
	})
})
/**
 * Re-applies the typographic quotes used by the original content to a replacement
 * that was written with plain ASCII quotes.
 *
 * For each quote family (double: U+201C/U+201D, single: U+2018/U+2019) that occurs in
 * `originalContent`, every ASCII `"` / `'` in `replaceContent` is swapped for a
 * typographic glyph. The opening glyph is used at the start of the text and after
 * whitespace, an opening bracket, or a just-emitted opening quote; the closing glyph is
 * used everywhere else (which also covers apostrophes such as "it's"). When the original
 * only contains one glyph of a family, that glyph is used for every occurrence.
 *
 * Keying the mapping by the ASCII character alone is not enough: the opening and the
 * closing glyph share one ASCII equivalent, so a plain ASCII -> Unicode map can hold only
 * one of them and turns `"x"` into `”x”` instead of `“x”`.
 *
 * Limitation: the decision is made per family, not per position, so a replacement for a
 * region that mixes ASCII and typographic quotes of the same family is fully converted.
 *
 * @param originalContent - The matched text as it currently exists in the file.
 * @param searchContent - The search text of the diff block. Currently not consulted; kept
 *   so existing call sites remain valid.
 * @param replaceContent - The replacement text supplied by the diff block.
 * @returns `replaceContent` with ASCII quotes converted as described, or unchanged when
 *   the original contains none of the four typographic quotes.
 */
function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
	const quoteFamilies = [
		{ ascii: '"', opening: "\u201C", closing: "\u201D" },
		{ ascii: "'", opening: "\u2018", closing: "\u2019" },
	]

	// Per ASCII quote, record which glyphs to emit; fall back to the single glyph that
	// is present when the original does not contain both halves of a pair.
	const glyphsByAscii = new Map<string, { opening: string; closing: string }>()
	for (const { ascii, opening, closing } of quoteFamilies) {
		const hasOpening = originalContent.includes(opening)
		const hasClosing = originalContent.includes(closing)
		if (!hasOpening && !hasClosing) {
			continue
		}
		glyphsByAscii.set(ascii, {
			opening: hasOpening ? opening : closing,
			closing: hasClosing ? closing : opening,
		})
	}

	if (glyphsByAscii.size === 0) {
		return replaceContent
	}

	// Characters after which a quote is treated as an opening quote.
	const opensAfter = /[\s(\[{\u201C\u2018]/

	let result = ""
	let previous = "" // last character written to `result`; "" means start of text
	for (const char of replaceContent) {
		const glyphs = glyphsByAscii.get(char)
		let emitted = char
		if (glyphs !== undefined) {
			const isOpening = previous === "" || opensAfter.test(previous)
			emitted = isOpening ? glyphs.opening : glyphs.closing
		}
		result += emitted
		previous = emitted
	}

	return result
}
// Get the matched lines from the original content const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length) + const originalMatchedContent = matchedLines.join("\n") + + // Preserve Unicode characters from the original content in the replacement + replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent) + replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/) // Get the exact indentation (preserving tabs/spaces) of each line const originalIndents = matchedLines.map((line) => { diff --git a/src/core/diff/strategies/multi-search-replace.ts b/src/core/diff/strategies/multi-search-replace.ts index 9e740a6571..df6dbb60ac 100644 --- a/src/core/diff/strategies/multi-search-replace.ts +++ b/src/core/diff/strategies/multi-search-replace.ts @@ -16,7 +16,8 @@ function getSimilarity(original: string, search: string): number { return 0 } - // Use the normalizeString utility to handle smart quotes and other special characters + // Use the normalizeString utility for comparison only, but preserve original characters + // This allows matching content with different quote styles without changing the actual content const normalizedOriginal = normalizeString(original) const normalizedSearch = normalizeString(search) @@ -32,6 +33,39 @@ function getSimilarity(original: string, search: string): number { return 1 - dist / maxLength } +/** + * Preserves Unicode characters from the original content when applying replacements. + * This function maps Unicode characters from the original to the replacement content + * when they have been normalized to ASCII equivalents. 
/**
 * Re-applies the typographic quotes used by the original content to a replacement
 * that was written with plain ASCII quotes.
 *
 * For each quote family (double: U+201C/U+201D, single: U+2018/U+2019) that occurs in
 * `originalContent`, every ASCII `"` / `'` in `replaceContent` is swapped for a
 * typographic glyph. The opening glyph is used at the start of the text and after
 * whitespace, an opening bracket, or a just-emitted opening quote; the closing glyph is
 * used everywhere else (which also covers apostrophes such as "it's"). When the original
 * only contains one glyph of a family, that glyph is used for every occurrence.
 *
 * Keying the mapping by the ASCII character alone is not enough: the opening and the
 * closing glyph share one ASCII equivalent, so a plain ASCII -> Unicode map can hold only
 * one of them and turns `"x"` into `”x”` instead of `“x”`.
 *
 * Limitation: the decision is made per family, not per position, so a replacement for a
 * region that mixes ASCII and typographic quotes of the same family is fully converted.
 *
 * @param originalContent - The matched text as it currently exists in the file.
 * @param searchContent - The search text of the diff block. Currently not consulted; kept
 *   so existing call sites remain valid.
 * @param replaceContent - The replacement text supplied by the diff block.
 * @returns `replaceContent` with ASCII quotes converted as described, or unchanged when
 *   the original contains none of the four typographic quotes.
 */
function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
	const quoteFamilies = [
		{ ascii: '"', opening: "\u201C", closing: "\u201D" },
		{ ascii: "'", opening: "\u2018", closing: "\u2019" },
	]

	// Per ASCII quote, record which glyphs to emit; fall back to the single glyph that
	// is present when the original does not contain both halves of a pair.
	const glyphsByAscii = new Map<string, { opening: string; closing: string }>()
	for (const { ascii, opening, closing } of quoteFamilies) {
		const hasOpening = originalContent.includes(opening)
		const hasClosing = originalContent.includes(closing)
		if (!hasOpening && !hasClosing) {
			continue
		}
		glyphsByAscii.set(ascii, {
			opening: hasOpening ? opening : closing,
			closing: hasClosing ? closing : opening,
		})
	}

	if (glyphsByAscii.size === 0) {
		return replaceContent
	}

	// Characters after which a quote is treated as an opening quote.
	const opensAfter = /[\s(\[{\u201C\u2018]/

	let result = ""
	let previous = "" // last character written to `result`; "" means start of text
	for (const char of replaceContent) {
		const glyphs = glyphsByAscii.get(char)
		let emitted = char
		if (glyphs !== undefined) {
			const isOpening = previous === "" || opensAfter.test(previous)
			emitted = isOpening ? glyphs.opening : glyphs.closing
		}
		result += emitted
		previous = emitted
	}

	return result
}