Move InCB pattern lookup to bottom (#106)

cometkim · web-flow · commit 65c38ce9756d · 2025-12-15T06:04:30.000+09:00
Same logic, but update the InCB state only when it's not the beginning of a segment.
diff --git a/.changeset/shiny-buttons-bathe.md b/.changeset/shiny-buttons-bathe.md
@@ -0,0 +1,8 @@
+---
+"unicode-segmenter": patch
+---
+
+Move GB9c rule checking to be _after_ the main boundary checking,
+to avoid unnecessary work as much as possible.
+
+No noticeable behavior changes, but performance appears to improve by ~2% in most cases.
diff --git a/README.md b/README.md
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
 
 | Name                         | Unicode® | ESM? |   Size    | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
 |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
-| `unicode-segmenter/grapheme` |   16.0.0 |   ✔️ |    16,704 |     12,554 |           5,308 |         3,958 |           5,010 |
+| `unicode-segmenter/grapheme` |   16.0.0 |   ✔️ |    16,685 |     12,549 |           5,314 |         3,952 |           5,012 |
 | `graphemer`                  |   15.0.0 |   ✖️ ️|   410,435 |     95,104 |          15,752 |        10,660 |          15,911 |
 | `grapheme-splitter`          |   10.0.0 |   ✖️ |   122,254 |     23,682 |           7,852 |         4,802 |           6,753 |
 | `@formatjs/intl-segmenter`*  |   15.0.0 |   ✖️ |   603,510 |    369,673 |          72,273 |        49,530 |          68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
 
 | Name                         | Bytecode size | Bytecode size (gzip)* |
 |------------------------------|--------------:|----------------------:|
-| `unicode-segmenter/grapheme` |        26,309 |                13,811 |
+| `unicode-segmenter/grapheme` |        26,278 |                13,797 |
 | `graphemer`                  |       134,089 |                31,766 |
 | `grapheme-splitter`          |        63,946 |                19,162 |
 
diff --git a/src/grapheme.js b/src/grapheme.js
@@ -82,21 +82,10 @@ export function* graphemeSegments(input) {
   /** Beginning category of a segment */
   let _catBegin = catBefore;
 
-  /** Memoize the beginnig code point a the segment. */
+  /** Memoize the beginning code point of the segment. */
   let _hd = cp;
 
   while (cursor < len) {
-    // Note: Lazily update `consonant` and `linker` state
-    // which is a extra overhead only for Hindi text.
-    if (cp >= 2325) {
-      if (!consonant && catBefore === 0) {
-        consonant = isIndicConjunctConsonant(cp);
-      } else if (catBefore === 3 /* Extend */) {
-        // Note: \p{InCB=Linker} is a subset of \p{Extend}
-        linker = isIndicConjunctLinker(cp);
-      }
-    }
-
     cp = /** @type {number} */ (input.codePointAt(cursor));
     catAfter = cat(cp);
 
@@ -110,11 +99,8 @@ export function* graphemeSegments(input) {
       ) {
         emoji = true;
 
-      } else if (catAfter === 0 /* Any */ && cp >= 2325) {
-        // Note: Put GB9c rule checking here to reduce.
-        incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp));
-        // It cannot be both a linker and a consonant.
-        linker = linker && !consonant;
+      } else if (catAfter === 0) {
+        incb = consonant && linker && isIndicConjunctConsonant(cp);
       }
     }
 
@@ -134,6 +120,17 @@ export function* graphemeSegments(input) {
       index = cursor;
       _catBegin = catAfter;
       _hd = cp;
+
+    } else if (cp >= 2325) {
+      // Note: Avoid InCB state checking as much as possible.
+      // Update InCB state only when continuing within a segment.
+      if (!consonant && catBefore === 0)
+        consonant = isIndicConjunctConsonant(_hd);
+
+      if (consonant && catAfter === 3)
+        linker = isIndicConjunctLinker(cp);
+      else if (catAfter === 0)
+        linker = false;
     }
 
     cursor += cp <= BMP_MAX ? 1 : 2;