infer: Use grapheme clusters rather than down-weighting combining marks

Tavian Barnes · Tavian Barnes · commit dc65c0ecb8f2 · 2019-07-18T16:00:02.000-04:00
This makes bistr.infer() perform better on inputs such as Zalgo text
that contain a great many combining marks.
diff --git a/python/bistring/_infer.py b/python/bistring/_infer.py
@@ -9,37 +9,43 @@
 
 from ._alignment import Alignment
 from ._bistr import bistr
+from ._token import CharacterTokenizer
 
 
 @dataclass(frozen=True)
 class AugmentedChar:
     """
-    A single character (code point) augmented with extra information.
+    A single character (grapheme cluster) augmented with extra information.
     """
 
-    folded: str
+    top_category: str
     """
-    The case-folded form of the char.
+    The top-level Unicode category of the char (L, P, Z, etc.).
     """
 
-    normalized: str
+    category: str
     """
-    The Unicode compatibility normalized form of the char.
+    The specific Unicode category of the char (Lu, Po, Zs, etc.).
     """
 
-    original: str
+    root: str
     """
-    The original form of the char.
+    The root code point of the grapheme cluster.
     """
 
-    top_category: str
+    folded: str
     """
-    The top-level Unicode category of the char (L, P, Z, etc.).
+    The case-folded form of the char.
     """
 
-    category: str
+    normalized: str
     """
-    The specific Unicode category of the char (Lu, Po, Zs, etc.).
+    The Unicode compatibility normalized form of the char.
+    """
+
+    original: str
+    """
+    The original form of the char.
     """
 
     @classmethod
@@ -49,29 +55,22 @@ def cost_fn(cls, a: Optional[AugmentedChar], b: Optional[AugmentedChar]) -> int:
         """
 
         if a is None or b is None:
-            if a:
-                top_category = a.top_category
-            elif b:
-                top_category = b.top_category
-            else:
-                assert False, 'Unreachable'
-
-            if top_category == 'M':
-                # Less penalty for combining marks
-                return 1
-            else:
-                # cost(insert) + cost(delete) (3 + 3) should be more than cost(substitute) (5)
-                return 3
+            # cost(insert) + cost(delete) (4 + 4) should be more than cost(substitute) (6)
+            return 4
 
         result = 0
+        result += int(a.top_category != b.top_category)
+        result += int(a.category != b.category)
+        result += int(a.root != b.root)
         result += int(a.folded != b.folded)
         result += int(a.normalized != b.normalized)
         result += int(a.original != b.original)
-        result += int(a.top_category != b.top_category)
-        result += int(a.category != b.category)
         return result
 
 
+TOKENIZER = CharacterTokenizer('root')
+
+
 @dataclass(frozen=True)
 class AugmentedString:
     """
@@ -97,21 +96,27 @@ class AugmentedString:
     def augment(cls, original: str) -> AugmentedString:
         normalized = bistr(original).normalize('NFKD')
         folded = bistr(normalized.modified).casefold()
+        glyphs = TOKENIZER.tokenize(folded)
 
         chars = []
-        for i, fold_c in enumerate(folded):
-            norm_slice = folded.alignment.original_slice(i, i + 1)
+        for glyph in glyphs:
+            fold_c = glyph.text.modified
+            root = fold_c[0]
+
+            norm_slice = folded.alignment.original_slice(glyph.start, glyph.end)
             norm_c = folded.original[norm_slice]
 
             orig_slice = normalized.alignment.original_slice(norm_slice)
             orig_c = normalized.original[orig_slice]
 
-            cat = unicodedata.category(fold_c)
+            cat = unicodedata.category(root)
             top_cat = cat[0]
 
-            chars.append(AugmentedChar(fold_c, norm_c, orig_c, top_cat, cat))
+            chars.append(AugmentedChar(top_cat, cat, root, fold_c, norm_c, orig_c))
 
-        alignment = normalized.alignment.compose(folded.alignment)
+        alignment = normalized.alignment
+        alignment = alignment.compose(folded.alignment)
+        alignment = alignment.compose(glyphs.alignment)
         return cls(original, chars, alignment)
 
 
diff --git a/python/tests/test_bistr.py b/python/tests/test_bistr.py
@@ -65,6 +65,10 @@ def test_infer():
     assert bs[40:43].original == '🐶'
     assert bs[40:43].modified == 'dog'
 
+    bs = bistr.infer('Z̴̡̪̫̖̥̔̿̃̈̏̎͠͝á̸̪̠̖̻̬̖̪̞͙͇̮̠͎̆͋́̐͌̒͆̓l̶͉̭̳̤̬̮̩͎̟̯̜͇̥̠̘͑͐̌͂̄́̀̂̌̈͛̊̄̚͜ģ̸̬̼̞̙͇͕͎̌̾̒̐̿̎̆̿̌̃̏̌́̾̈͘͜o̶̢̭͕͔̩͐ ̴̡̡̜̥̗͔̘̦͉̣̲͚͙̐̈́t̵͈̰̉̀͒̎̈̿̔̄̽͑͝͠ẹ̵̫̲̫̄͜͜x̵͕̳͈̝̤̭̼̼̻͓̿̌̽̂̆̀̀̍̒͐́̈̀̚͝t̸̡̨̥̺̣̟͎̝̬̘̪͔͆́̄̅̚', 'Zalgo text')
+    for i, c in enumerate(bs):
+        assert bs[i:i+1].original.startswith(c)
+
 
 def test_concat():
     bs = bistr('  ', '')