Skip to content

Commit 2df29f8

Browse files
committed
Handle surrogate pairs in jest-diff's diffStrings
String slicing and indexing treat strings as sequences of UTF-16 code units, which lets diffStrings() split a surrogate pair in half. Convert the strings being diffed into arrays of code points to avoid that. Also add tests for some other Unicode text segmentation edge cases (grapheme clusters and normalization).
1 parent da9b532 commit 2df29f8

File tree

2 files changed

+60
-9
lines changed

2 files changed

+60
-9
lines changed

packages/jest-diff/src/__tests__/diffStringsRaw.test.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,40 @@ describe('diffStringsRaw', () => {
3131

3232
expect(received).toEqual(expected);
3333
});
34+
35+
describe('unicode', () => {
36+
test('surrogate pairs', () => {
37+
const expected: Array<Diff> = [
38+
new Diff(DIFF_DELETE, '😞'),
39+
new Diff(DIFF_INSERT, '😄'),
40+
];
41+
const received = diffStringsRaw('😞', '😄', false);
42+
43+
expect(received).toEqual(expected);
44+
});
45+
test('grapheme clusters', () => {
46+
const expected: Array<Diff> = [
47+
new Diff(DIFF_DELETE, '👩‍👩‍'),
48+
new Diff(DIFF_EQUAL, '👧'),
49+
new Diff(DIFF_DELETE, '‍👦'),
50+
new Diff(DIFF_EQUAL, ' 🇺'),
51+
new Diff(DIFF_DELETE, '🇸'),
52+
new Diff(DIFF_INSERT, '🇦'),
53+
];
54+
const received = diffStringsRaw('👩‍👩‍👧‍👦 🇺🇸', '👧 🇺🇦', false);
55+
56+
expect(received).toEqual(expected);
57+
});
58+
test('normalization', () => {
59+
const expected: Array<Diff> = [
60+
new Diff(DIFF_EQUAL, 'ma'),
61+
new Diff(DIFF_DELETE, 'n\u0303'),
62+
new Diff(DIFF_INSERT, 'ñ'),
63+
new Diff(DIFF_EQUAL, 'ana'),
64+
];
65+
const received = diffStringsRaw('man\u0303ana', 'mañana', false);
66+
67+
expect(received).toEqual(expected);
68+
});
69+
});
3470
});

packages/jest-diff/src/diffStrings.ts

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@ import diffSequences from '@jest/diff-sequences';
99
import {DIFF_DELETE, DIFF_EQUAL, DIFF_INSERT, Diff} from './cleanupSemantic';
1010

1111
const diffStrings = (a: string, b: string): Array<Diff> => {
12-
const isCommon = (aIndex: number, bIndex: number) => a[aIndex] === b[bIndex];
12+
// Split strings into code points to handle surrogate pairs.
13+
const aCodepoints = [...a];
14+
const bCodepoints = [...b];
15+
const isCommon = (aIndex: number, bIndex: number) =>
16+
aCodepoints[aIndex] === bCodepoints[bIndex];
1317

1418
let aIndex = 0;
1519
let bIndex = 0;
@@ -21,25 +25,36 @@ const diffStrings = (a: string, b: string): Array<Diff> => {
2125
bCommon: number,
2226
) => {
2327
if (aIndex !== aCommon) {
24-
diffs.push(new Diff(DIFF_DELETE, a.slice(aIndex, aCommon)));
28+
diffs.push(
29+
new Diff(DIFF_DELETE, aCodepoints.slice(aIndex, aCommon).join('')),
30+
);
2531
}
2632
if (bIndex !== bCommon) {
27-
diffs.push(new Diff(DIFF_INSERT, b.slice(bIndex, bCommon)));
33+
diffs.push(
34+
new Diff(DIFF_INSERT, bCodepoints.slice(bIndex, bCommon).join('')),
35+
);
2836
}
2937

3038
aIndex = aCommon + nCommon; // number of characters compared in a
3139
bIndex = bCommon + nCommon; // number of characters compared in b
32-
diffs.push(new Diff(DIFF_EQUAL, b.slice(bCommon, bIndex)));
40+
diffs.push(
41+
new Diff(DIFF_EQUAL, bCodepoints.slice(bCommon, bIndex).join('')),
42+
);
3343
};
3444

35-
diffSequences(a.length, b.length, isCommon, foundSubsequence);
45+
diffSequences(
46+
aCodepoints.length,
47+
bCodepoints.length,
48+
isCommon,
49+
foundSubsequence,
50+
);
3651

3752
// After the last common subsequence, push remaining change items.
38-
if (aIndex !== a.length) {
39-
diffs.push(new Diff(DIFF_DELETE, a.slice(aIndex)));
53+
if (aIndex !== aCodepoints.length) {
54+
diffs.push(new Diff(DIFF_DELETE, aCodepoints.slice(aIndex).join('')));
4055
}
41-
if (bIndex !== b.length) {
42-
diffs.push(new Diff(DIFF_INSERT, b.slice(bIndex)));
56+
if (bIndex !== bCodepoints.length) {
57+
diffs.push(new Diff(DIFF_INSERT, bCodepoints.slice(bIndex).join('')));
4358
}
4459

4560
return diffs;

0 commit comments

Comments
 (0)