Skip to content

Commit 51bcade

Browse files
authored
Better string normalization for diffs (#2659)
1 parent 648c6e7 commit 51bcade

File tree

4 files changed

+135
-5
lines changed

4 files changed

+135
-5
lines changed

src/core/diff/strategies/__tests__/multi-search-replace.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1711,6 +1711,27 @@ function sum(a, b) {
17111711
}
17121712
})
17131713

1714+
// Regression test: a SEARCH block whose text contains a typographic apostrophe (U+2019)
// must be located in the original content, and the REPLACE text (straight apostrophe plus
// an appended paragraph) must be written out verbatim.
// NOTE(review): SEARCH and the original carry the same U+2019 character here, so an exact
// match may already succeed without normalization — confirm whether SEARCH was meant to
// use a straight quote.
it("should match content with smart quotes", async () => {
	const source =
		"**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can’t wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!"
	const expected =
		"**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can't wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!\n\nYou're still here?"
	// The template literal body must stay flush-left: its leading whitespace is part of the diff.
	const patch = `test.ts
<<<<<<< SEARCH
**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can’t wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!
=======
**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can't wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!

You're still here?
>>>>>>> REPLACE`

	const outcome = await strategy.applyDiff(source, patch)

	expect(outcome.success).toBe(true)
	if (!outcome.success) {
		// Unreachable after the assertion above; kept so the compiler narrows `outcome`.
		return
	}
	expect(outcome.content).toBe(expected)
})
1734+
17141735
it("should not exact match empty lines", async () => {
17151736
const originalContent = "function sum(a, b) {\n\n return a + b;\n}"
17161737
const diffContent = `test.ts

src/core/diff/strategies/multi-search-replace.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { addLineNumbers, everyLineHasLineNumbers, stripLineNumbers } from "../..
33
import { distance } from "fastest-levenshtein"
44
import { ToolProgressStatus } from "../../../shared/ExtensionMessage"
55
import { ToolUse } from "../../assistant-message"
6+
import { normalizeString } from "../../../utils/text-normalization"
67

78
const BUFFER_LINES = 40 // Number of extra context lines to show before and after matches
89

@@ -12,11 +13,9 @@ function getSimilarity(original: string, search: string): number {
1213
return 0
1314
}
1415

15-
// Normalize strings by removing extra whitespace but preserve case
16-
const normalizeStr = (str: string) => str.replace(/\s+/g, " ").trim()
17-
18-
const normalizedOriginal = normalizeStr(original)
19-
const normalizedSearch = normalizeStr(search)
16+
// Use the normalizeString utility to handle smart quotes and other special characters
17+
const normalizedOriginal = normalizeString(original)
18+
const normalizedSearch = normalizeString(search)
2019

2120
if (normalizedOriginal === normalizedSearch) {
2221
return 1
src/utils/__tests__/text-normalization.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { normalizeString } from "../text-normalization"
2+
3+
// Unit tests for normalizeString: each case feeds text containing typographic characters
// and/or irregular whitespace and checks the exact normalized output.
describe("Text normalization utilities", () => {
	describe("normalizeString", () => {
		test("normalizes smart quotes by default", () => {
			expect(normalizeString("These are \u201Csmart quotes\u201D and \u2018single quotes\u2019")).toBe(
				"These are \"smart quotes\" and 'single quotes'",
			)
		})

		test("normalizes typographic characters by default", () => {
			expect(normalizeString("This has an em dash \u2014 and ellipsis\u2026")).toBe(
				"This has an em dash - and ellipsis...",
			)
		})

		test("normalizes whitespace by default", () => {
			// The input needs a real run of spaces (not only tabs) so that collapsing of
			// repeated spaces is actually asserted.
			expect(normalizeString("Multiple   spaces and\t\ttabs")).toBe("Multiple spaces and tabs")
		})

		test("can be configured to skip certain normalizations", () => {
			// Input differs from the expected output only in whitespace, so the test fails
			// if whitespace normalization is skipped along with the smart quotes.
			const input = "Keep \u201Csmart quotes\u201D but   normalize whitespace"
			expect(normalizeString(input, { smartQuotes: false })).toBe(
				"Keep \u201Csmart quotes\u201D but normalize whitespace",
			)
		})

		test("real-world example with mixed characters", () => {
			const input = "Let\u2019s test this\u2014with some \u201Cfancy\u201D punctuation\u2026 and   spaces"
			expect(normalizeString(input)).toBe('Let\'s test this-with some "fancy" punctuation... and spaces')
		})
	})
})

src/utils/text-normalization.ts

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/**
 * Common character mappings for normalization.
 *
 * Each key is a single typographic character and each value is the plain-ASCII
 * text it is replaced with. Keys are matched literally (never as regex patterns).
 */
export const NORMALIZATION_MAPS = {
	// Smart quotes to regular quotes
	SMART_QUOTES: {
		"\u201C": '"', // Left double quote (U+201C)
		"\u201D": '"', // Right double quote (U+201D)
		"\u2018": "'", // Left single quote (U+2018)
		"\u2019": "'", // Right single quote (U+2019)
	},
	// Other typographic characters
	TYPOGRAPHIC: {
		"\u2026": "...", // Ellipsis
		"\u2014": "-", // Em dash
		"\u2013": "-", // En dash
		"\u00A0": " ", // Non-breaking space
	},
}

/**
 * Options for string normalization.
 *
 * Every flag is optional; a flag that is omitted or explicitly `undefined`
 * falls back to its default (all normalizations enabled).
 */
export interface NormalizeOptions {
	smartQuotes?: boolean // Replace smart quotes with straight quotes
	typographicChars?: boolean // Replace typographic characters
	extraWhitespace?: boolean // Collapse multiple whitespace to single space
	trim?: boolean // Trim whitespace from start and end
}

/**
 * Default options for normalization: every step is enabled.
 */
const DEFAULT_OPTIONS: Required<NormalizeOptions> = {
	smartQuotes: true,
	typographicChars: true,
	extraWhitespace: true,
	trim: true,
}

/**
 * Fills in defaults for every flag the caller left unset.
 *
 * Uses `??` per key rather than an object spread so that an explicit `undefined`
 * (e.g. `{ trim: config.trim }` with an unset config value) means "use the default"
 * instead of silently disabling the step.
 */
function resolveOptions(options: NormalizeOptions): Required<NormalizeOptions> {
	return {
		smartQuotes: options.smartQuotes ?? DEFAULT_OPTIONS.smartQuotes,
		typographicChars: options.typographicChars ?? DEFAULT_OPTIONS.typographicChars,
		extraWhitespace: options.extraWhitespace ?? DEFAULT_OPTIONS.extraWhitespace,
		trim: options.trim ?? DEFAULT_OPTIONS.trim,
	}
}

/**
 * Replaces every occurrence of each key of `mapping` in `text` with its value.
 *
 * Keys are treated as literal substrings (split/join), so a key containing regex
 * metacharacters cannot change the match semantics, and no RegExp is built per call.
 */
function replaceMapped(text: string, mapping: Record<string, string>): string {
	let result = text
	for (const [from, to] of Object.entries(mapping)) {
		result = result.split(from).join(to)
	}
	return result
}

/**
 * Normalizes a string based on the specified options.
 *
 * Steps run in a fixed order: smart quotes, other typographic characters,
 * whitespace collapsing (any run of whitespace becomes one space), then trimming.
 *
 * @param str The string to normalize
 * @param options Normalization options; unset or `undefined` flags default to `true`
 * @returns The normalized string
 */
export function normalizeString(str: string, options: NormalizeOptions = DEFAULT_OPTIONS): string {
	const opts = resolveOptions(options)
	let normalized = str

	// Replace smart quotes
	if (opts.smartQuotes) {
		normalized = replaceMapped(normalized, NORMALIZATION_MAPS.SMART_QUOTES)
	}

	// Replace typographic characters
	if (opts.typographicChars) {
		normalized = replaceMapped(normalized, NORMALIZATION_MAPS.TYPOGRAPHIC)
	}

	// Normalize whitespace
	if (opts.extraWhitespace) {
		normalized = normalized.replace(/\s+/g, " ")
	}

	// Trim whitespace
	if (opts.trim) {
		normalized = normalized.trim()
	}

	return normalized
}

0 commit comments

Comments
 (0)