Skip to content

Commit 75202eb

Browse files
committed
refactor: enhance similarity calculation by improving string normalization
Updated the string normalization process in both the search-replace and multi-search-replace strategies to handle invisible whitespace more effectively. The changes include:
- Standardizing line endings to \n
- Converting tabs to spaces
- Removing zero-width spaces and other invisible characters
- Trimming trailing spaces from each line
- Collapsing multiple spaces into a single space

These improvements aim to reduce false negatives in similarity matching, allowing for better handling of semantically similar content with minor formatting differences.
1 parent aecf968 commit 75202eb

File tree

2 files changed

+86
-24
lines changed

2 files changed

+86
-24
lines changed

src/core/diff/strategies/multi-search-replace.ts

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,62 @@ import { ToolUse } from "../../assistant-message"
77
const BUFFER_LINES = 40 // Number of extra context lines to show before and after matches
88

99
function getSimilarity(original: string, search: string): number {
10-
if (search === "") {
10+
// If the search text is empty or whitespace-only, treat it as a perfect match
11+
if (search.trim() === "") {
1112
return 1
1213
}
1314

14-
// Normalize strings by removing extra whitespace but preserve case
15-
const normalizeStr = (str: string) =>
16-
str
17-
.replace(/\r\n/g, "\n") // Standardize line endings
18-
.replace(/\t/g, " ") // Convert tabs to spaces
19-
.replace(/\s+/g, " ") // Replace all whitespace sequences with a single space
20-
.replace(/\u200B/g, "") // Remove zero-width spaces
21-
.trim() // Remove leading/trailing whitespace
15+
const normalizeStr = (input: string) => {
16+
let str = input
2217

18+
// 1) Unicode normalization for consistent codepoints
19+
// (folds compatibility characters into one form, e.g. full-width letters or ligatures)
20+
str = str.normalize("NFKC")
21+
22+
// 2) Standardize line endings: convert \r\n -> \n
23+
str = str.replace(/\r\n/g, "\n")
24+
25+
// 3) Remove zero-width spaces or other invisible chars
26+
// (Add more if you suspect other hidden chars)
27+
str = str.replace(/\u200B/g, "")
28+
str = str.replace(/\u00A0/g, " ") // Non-breaking space -> normal space
29+
// str = str.replace(/\u00AD/g, ""); // Soft hyphen (optional)
30+
31+
// 4) Trim trailing spaces from each line
32+
// (Removes leftover spaces at line ends)
33+
str = str.replace(/[ \t]+$/gm, "")
34+
35+
// 5) Convert tabs to single spaces (adjust if you prefer 2 or 4)
36+
str = str.replace(/\t/g, " ")
37+
38+
// 6) Collapse every run of whitespace (spaces and newlines) into a single space
39+
// (You can do this per line or across the whole string)
40+
str = str.replace(/\s+/g, " ")
41+
42+
// 7) Optional: remove lines containing only triple backticks
43+
// If you don't want to treat them as differences:
44+
// str = str.replace(/^```$/gm, "");
45+
46+
// 8) Final trim to remove any leading/trailing whitespace
47+
str = str.trim()
48+
49+
return str
50+
}
51+
52+
// Normalize both original and search
2353
const normalizedOriginal = normalizeStr(original)
2454
const normalizedSearch = normalizeStr(search)
2555

56+
// If they're now identical, perfect match
2657
if (normalizedOriginal === normalizedSearch) {
2758
return 1
2859
}
2960

30-
// Calculate Levenshtein distance using fastest-levenshtein's distance function
61+
// Otherwise compute Levenshtein distance
3162
const dist = distance(normalizedOriginal, normalizedSearch)
32-
33-
// Calculate similarity ratio (0 to 1, where 1 is an exact match)
3463
const maxLength = Math.max(normalizedOriginal.length, normalizedSearch.length)
64+
65+
// Similarity from 0 to 1 (1 = exact match)
3566
return 1 - dist / maxLength
3667
}
3768

src/core/diff/strategies/search-replace.ts

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,62 @@ import { distance } from "fastest-levenshtein"
55
const BUFFER_LINES = 20 // Number of extra context lines to show before and after matches
66

77
function getSimilarity(original: string, search: string): number {
8-
if (search === "") {
8+
// If the search text is empty or whitespace-only, treat it as a perfect match
9+
if (search.trim() === "") {
910
return 1
1011
}
1112

12-
// Normalize strings by removing extra whitespace but preserve case
13-
const normalizeStr = (str: string) =>
14-
str
15-
.replace(/\r\n/g, "\n") // Standardize line endings
16-
.replace(/\t/g, " ") // Convert tabs to spaces
17-
.replace(/\s+/g, " ") // Replace all whitespace sequences with a single space
18-
.replace(/\u200B/g, "") // Remove zero-width spaces
19-
.trim() // Remove leading/trailing whitespace
13+
const normalizeStr = (input: string) => {
14+
let str = input
2015

16+
// 1) Unicode normalization for consistent codepoints
17+
// (folds compatibility characters into one form, e.g. full-width letters or ligatures)
18+
str = str.normalize("NFKC")
19+
20+
// 2) Standardize line endings: convert \r\n -> \n
21+
str = str.replace(/\r\n/g, "\n")
22+
23+
// 3) Remove zero-width spaces or other invisible chars
24+
// (Add more if you suspect other hidden chars)
25+
str = str.replace(/\u200B/g, "")
26+
str = str.replace(/\u00A0/g, " ") // Non-breaking space -> normal space
27+
// str = str.replace(/\u00AD/g, ""); // Soft hyphen (optional)
28+
29+
// 4) Trim trailing spaces from each line
30+
// (Removes leftover spaces at line ends)
31+
str = str.replace(/[ \t]+$/gm, "")
32+
33+
// 5) Convert tabs to single spaces (adjust if you prefer 2 or 4)
34+
str = str.replace(/\t/g, " ")
35+
36+
// 6) Collapse every run of whitespace (spaces and newlines) into a single space
37+
// (You can do this per line or across the whole string)
38+
str = str.replace(/\s+/g, " ")
39+
40+
// 7) Optional: remove lines containing only triple backticks
41+
// If you don't want to treat them as differences:
42+
// str = str.replace(/^```$/gm, "");
43+
44+
// 8) Final trim to remove any leading/trailing whitespace
45+
str = str.trim()
46+
47+
return str
48+
}
49+
50+
// Normalize both original and search
2151
const normalizedOriginal = normalizeStr(original)
2252
const normalizedSearch = normalizeStr(search)
2353

54+
// If they're now identical, perfect match
2455
if (normalizedOriginal === normalizedSearch) {
2556
return 1
2657
}
2758

28-
// Calculate Levenshtein distance using fastest-levenshtein's distance function
59+
// Otherwise compute Levenshtein distance
2960
const dist = distance(normalizedOriginal, normalizedSearch)
30-
31-
// Calculate similarity ratio (0 to 1, where 1 is an exact match)
3261
const maxLength = Math.max(normalizedOriginal.length, normalizedSearch.length)
62+
63+
// Similarity from 0 to 1 (1 = exact match)
3364
return 1 - dist / maxLength
3465
}
3566

0 commit comments

Comments
 (0)