Handle surrogate pairs in jest-diff's diffStrings

gofr · gofr · commit 2df29f8d1ff1 · 2025-09-04T12:12:40.000+02:00
String slicing and indexing treat strings as sequences of UTF-16 code
units. That makes diffStrings() break apart surrogate pairs.

Convert the strings being diffed into arrays of code points to avoid that.

Also add tests for some other Unicode text segmentation edge cases.
diff --git a/packages/jest-diff/src/__tests__/diffStringsRaw.test.ts b/packages/jest-diff/src/__tests__/diffStringsRaw.test.ts
@@ -31,4 +31,40 @@ describe('diffStringsRaw', () => {
 
     expect(received).toEqual(expected);
   });
+
+  describe('unicode', () => {
+    test('surrogate pairs', () => {
+      const expected: Array<Diff> = [
+        new Diff(DIFF_DELETE, '😞'),
+        new Diff(DIFF_INSERT, '😄'),
+      ];
+      const received = diffStringsRaw('😞', '😄', false);
+
+      expect(received).toEqual(expected);
+    });
+    test('grapheme clusters', () => {
+      const expected: Array<Diff> = [
+        new Diff(DIFF_DELETE, '👩‍👩‍'),
+        new Diff(DIFF_EQUAL, '👧'),
+        new Diff(DIFF_DELETE, '‍👦'),
+        new Diff(DIFF_EQUAL, ' 🇺'),
+        new Diff(DIFF_DELETE, '🇸'),
+        new Diff(DIFF_INSERT, '🇦'),
+      ];
+      const received = diffStringsRaw('👩‍👩‍👧‍👦 🇺🇸', '👧 🇺🇦', false);
+
+      expect(received).toEqual(expected);
+    });
+    test('normalization', () => {
+      const expected: Array<Diff> = [
+        new Diff(DIFF_EQUAL, 'ma'),
+        new Diff(DIFF_DELETE, 'n\u0303'),
+        new Diff(DIFF_INSERT, 'ñ'),
+        new Diff(DIFF_EQUAL, 'ana'),
+      ];
+      const received = diffStringsRaw('man\u0303ana', 'mañana', false);
+
+      expect(received).toEqual(expected);
+    });
+  });
 });
diff --git a/packages/jest-diff/src/diffStrings.ts b/packages/jest-diff/src/diffStrings.ts
@@ -9,7 +9,11 @@ import diffSequences from '@jest/diff-sequences';
 import {DIFF_DELETE, DIFF_EQUAL, DIFF_INSERT, Diff} from './cleanupSemantic';
 
 const diffStrings = (a: string, b: string): Array<Diff> => {
-  const isCommon = (aIndex: number, bIndex: number) => a[aIndex] === b[bIndex];
+  // Split strings into code points to handle surrogate pairs.
+  const aCodepoints = [...a];
+  const bCodepoints = [...b];
+  const isCommon = (aIndex: number, bIndex: number) =>
+    aCodepoints[aIndex] === bCodepoints[bIndex];
 
   let aIndex = 0;
   let bIndex = 0;
@@ -21,25 +25,36 @@ const diffStrings = (a: string, b: string): Array<Diff> => {
     bCommon: number,
   ) => {
     if (aIndex !== aCommon) {
-      diffs.push(new Diff(DIFF_DELETE, a.slice(aIndex, aCommon)));
+      diffs.push(
+        new Diff(DIFF_DELETE, aCodepoints.slice(aIndex, aCommon).join('')),
+      );
     }
     if (bIndex !== bCommon) {
-      diffs.push(new Diff(DIFF_INSERT, b.slice(bIndex, bCommon)));
+      diffs.push(
+        new Diff(DIFF_INSERT, bCodepoints.slice(bIndex, bCommon).join('')),
+      );
     }
 
     aIndex = aCommon + nCommon; // number of characters compared in a
     bIndex = bCommon + nCommon; // number of characters compared in b
-    diffs.push(new Diff(DIFF_EQUAL, b.slice(bCommon, bIndex)));
+    diffs.push(
+      new Diff(DIFF_EQUAL, bCodepoints.slice(bCommon, bIndex).join('')),
+    );
   };
 
-  diffSequences(a.length, b.length, isCommon, foundSubsequence);
+  diffSequences(
+    aCodepoints.length,
+    bCodepoints.length,
+    isCommon,
+    foundSubsequence,
+  );
 
   // After the last common subsequence, push remaining change items.
-  if (aIndex !== a.length) {
-    diffs.push(new Diff(DIFF_DELETE, a.slice(aIndex)));
+  if (aIndex !== aCodepoints.length) {
+    diffs.push(new Diff(DIFF_DELETE, aCodepoints.slice(aIndex).join('')));
   }
-  if (bIndex !== b.length) {
-    diffs.push(new Diff(DIFF_INSERT, b.slice(bIndex)));
+  if (bIndex !== bCodepoints.length) {
+    diffs.push(new Diff(DIFF_INSERT, bCodepoints.slice(bIndex).join('')));
   }
 
   return diffs;