From 2932e578c3a128fe4a2338193a1a9c761c2e83cf Mon Sep 17 00:00:00 2001 From: gofr <32750931+gofr@users.noreply.github.com> Date: Thu, 4 Sep 2025 12:12:40 +0200 Subject: [PATCH] Handle surrogate pairs in jest-diff's diffStrings String slicing and indexing treats strings as sequences of UTF-16 code units. That makes diffStrings() break apart surrogate pairs. Turn the strings to diff into arrays of code points to avoid that. Add tests also for some other Unicode text segmentation edge cases. --- CHANGELOG.md | 4 +++ .../src/__tests__/diffStringsRaw.test.ts | 36 +++++++++++++++++++ packages/jest-diff/src/diffStrings.ts | 33 ++++++++++++----- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca86cce4f524..e5aa82c80f8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## main +### Fixes + +- `[jest-diff]` Fix `diffStrings()` to not break apart surrogate pairs ([#15812](https://github.com/jestjs/jest/pull/15812)) + ## 30.1.3 ### Fixes diff --git a/packages/jest-diff/src/__tests__/diffStringsRaw.test.ts b/packages/jest-diff/src/__tests__/diffStringsRaw.test.ts index 47620b7b2115..4b4d437f661c 100644 --- a/packages/jest-diff/src/__tests__/diffStringsRaw.test.ts +++ b/packages/jest-diff/src/__tests__/diffStringsRaw.test.ts @@ -31,4 +31,40 @@ describe('diffStringsRaw', () => { expect(received).toEqual(expected); }); + + describe('unicode', () => { + test('surrogate pairs', () => { + const expected: Array = [ + new Diff(DIFF_DELETE, '😞'), + new Diff(DIFF_INSERT, '😄'), + ]; + const received = diffStringsRaw('😞', '😄', false); + + expect(received).toEqual(expected); + }); + test('grapheme clusters', () => { + const expected: Array = [ + new Diff(DIFF_DELETE, '👩‍👩‍'), + new Diff(DIFF_EQUAL, '👧'), + new Diff(DIFF_DELETE, '‍👦'), + new Diff(DIFF_EQUAL, ' 🇺'), + new Diff(DIFF_DELETE, '🇸'), + new Diff(DIFF_INSERT, '🇦'), + ]; + const received = diffStringsRaw('👩‍👩‍👧‍👦 🇺🇸', 
'👧 🇺🇦', false); + + expect(received).toEqual(expected); + }); + test('normalization', () => { + const expected: Array = [ + new Diff(DIFF_EQUAL, 'ma'), + new Diff(DIFF_DELETE, 'n\u0303'), + new Diff(DIFF_INSERT, 'ñ'), + new Diff(DIFF_EQUAL, 'ana'), + ]; + const received = diffStringsRaw('man\u0303ana', 'mañana', false); + + expect(received).toEqual(expected); + }); + }); }); diff --git a/packages/jest-diff/src/diffStrings.ts b/packages/jest-diff/src/diffStrings.ts index 0b733da62aa6..c1ff9463570f 100644 --- a/packages/jest-diff/src/diffStrings.ts +++ b/packages/jest-diff/src/diffStrings.ts @@ -9,7 +9,11 @@ import diffSequences from '@jest/diff-sequences'; import {DIFF_DELETE, DIFF_EQUAL, DIFF_INSERT, Diff} from './cleanupSemantic'; const diffStrings = (a: string, b: string): Array => { - const isCommon = (aIndex: number, bIndex: number) => a[aIndex] === b[bIndex]; + // Split strings into code points to handle surrogate pairs. + const aCodepoints = [...a]; + const bCodepoints = [...b]; + const isCommon = (aIndex: number, bIndex: number) => + aCodepoints[aIndex] === bCodepoints[bIndex]; let aIndex = 0; let bIndex = 0; @@ -21,25 +25,36 @@ const diffStrings = (a: string, b: string): Array => { bCommon: number, ) => { if (aIndex !== aCommon) { - diffs.push(new Diff(DIFF_DELETE, a.slice(aIndex, aCommon))); + diffs.push( + new Diff(DIFF_DELETE, aCodepoints.slice(aIndex, aCommon).join('')), + ); } if (bIndex !== bCommon) { - diffs.push(new Diff(DIFF_INSERT, b.slice(bIndex, bCommon))); + diffs.push( + new Diff(DIFF_INSERT, bCodepoints.slice(bIndex, bCommon).join('')), + ); } aIndex = aCommon + nCommon; // number of characters compared in a bIndex = bCommon + nCommon; // number of characters compared in b - diffs.push(new Diff(DIFF_EQUAL, b.slice(bCommon, bIndex))); + diffs.push( + new Diff(DIFF_EQUAL, bCodepoints.slice(bCommon, bIndex).join('')), + ); }; - diffSequences(a.length, b.length, isCommon, foundSubsequence); + diffSequences( + aCodepoints.length, 
bCodepoints.length, + isCommon, + foundSubsequence, + ); // After the last common subsequence, push remaining change items. - if (aIndex !== a.length) { - diffs.push(new Diff(DIFF_DELETE, a.slice(aIndex))); + if (aIndex !== aCodepoints.length) { + diffs.push(new Diff(DIFF_DELETE, aCodepoints.slice(aIndex).join(''))); } - if (bIndex !== b.length) { - diffs.push(new Diff(DIFF_INSERT, b.slice(bIndex))); + if (bIndex !== bCodepoints.length) { + diffs.push(new Diff(DIFF_INSERT, bCodepoints.slice(bIndex).join(''))); } return diffs;