nondeterministic char_substitutes (PR port) (#4)

bt2901 · web-flow · commit 21bd8bbd94c6 · 2023-09-20T09:44:33.000+03:00
* try to port to a fork in a less fancy way

* fix merge artifacts

* fix some more merge artifacts

* fix similar_items_values

* add test data

* remove commented code

* add sources for binary files
diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py
@@ -38,15 +38,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars):
             b_step = key[word_pos].encode('utf8')
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = replace_chars[b_step]
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
 
-                next_index = self.dct.follow_bytes(b_replace_char, next_index)
+                    next_index = self.dct.follow_bytes(b_replace_char, next_index)
 
-                if next_index is not None:
-                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-                    extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
-                    res += extra_keys
+                    if next_index:
+                        prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
+                        extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
+                        res += extra_keys
 
             index = self.dct.follow_bytes(b_step, index)
             if index is None:
@@ -67,7 +67,7 @@ def similar_keys(self, key, replaces):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
 
         This may be useful e.g. for handling single-character umlauts.
@@ -77,14 +77,18 @@ def similar_keys(self, key, replaces):
     @classmethod
     def compile_replaces(cls, replaces):
 
-        for k, v in replaces.items():
-            if len(k) != 1 or len(v) != 1:
-                raise ValueError("Keys and values must be single-char unicode strings.")
+        for k, v in replaces.items():
+            if len(k) != 1:
+                raise ValueError("Keys must be single-char unicode strings.")
+            # A bare string is one substitute; lists and tuples carry several.
+            v_entries = [v] if isinstance(v, str) else v
+            if isinstance(v_entries, (list, tuple)) and (not v_entries or any(len(e) != 1 for e in v_entries)):
+                raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
 
         return dict(
             (
                 k.encode('utf8'),
-                (v.encode('utf8'), v),
+                [(v_entry.encode('utf8'), v_entry) for v_entry in v]
             )
             for k, v in replaces.items()
         )
@@ -326,14 +330,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars):
             b_step = key[word_pos].encode('utf8')
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = replace_chars[b_step]
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
 
-                next_index = self.dct.follow_bytes(b_replace_char, next_index)
-                if next_index:
-                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-                    extra_items = self._similar_items(prefix, key, next_index, replace_chars)
-                    res += extra_items
+                    next_index = self.dct.follow_bytes(b_replace_char, next_index)
+
+                    if next_index:
+                        prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
+                        extra_items = self._similar_items(prefix, key, next_index, replace_chars)
+                        res += extra_items
 
             index = self.dct.follow_bytes(b_step, index)
             if not index:
@@ -356,7 +361,7 @@ def similar_items(self, key, replaces):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
         """
         return self._similar_items("", key, self.dct.ROOT, replaces)
@@ -370,13 +375,14 @@ def _similar_item_values(self, start_pos, key, index, replace_chars):
             b_step = key[word_pos].encode('utf8')
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = replace_chars[b_step]
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
+
+                    next_index = self.dct.follow_bytes(b_replace_char, next_index)
 
-                next_index = self.dct.follow_bytes(b_replace_char, next_index)
-                if next_index:
-                    extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
-                    res += extra_items
+                    if next_index:
+                        extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
+                        res += extra_items
 
             index = self.dct.follow_bytes(b_step, index)
             if not index:
@@ -398,7 +404,7 @@ def similar_item_values(self, key, replaces):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
         """
         return self._similar_item_values(0, key, self.dct.ROOT, replaces)
diff --git a/dev_data/small/prediction1917-record.csv b/dev_data/small/prediction1917-record.csv
@@ -0,0 +1,14 @@
+,0,1,2,3
+хлѣб,98,51,54,49
+ёлка,54,99,99,98
+ель,51,53,101,101
+лѣс,101,56,102,48
+лѣсное,57,99,53,56
+всё,50,99,55,53
+всѣ,49,99,54,48
+бѣлёная,97,49,56,97
+изобрѣтён,51,99,99,99
+лев,98,50,52,56
+лёв,50,101,51,99
+лѣв,99,100,102,50
+вѣнскій,100,57,101,57
diff --git a/dev_data/small/prediction1917-record.dawg b/dev_data/small/prediction1917-record.dawg
diff --git a/dev_data/small/prediction1917.dawg b/dev_data/small/prediction1917.dawg
diff --git a/dev_data/small/prediction1917.txt b/dev_data/small/prediction1917.txt
@@ -0,0 +1 @@
+хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
@@ -2,6 +2,12 @@
 
 import dawg_python
 from .utils import data_path
+from hashlib import md5
+
+
+def encode(w):
+    """Return the code points of the first four hex digits of md5(w) as a tuple."""
+    return tuple(ord(digit) for digit in md5(w.encode('utf8')).hexdigest()[:4])
 
 
 class TestPrediction:
@@ -62,3 +68,74 @@ def test_record_dawg_items(self, word, prediction):
     def test_record_dawg_items_values(self, word, prediction):
         d = self.record_dawg()
         assert d.similar_item_values(word, self.REPLACES) == prediction
+
+
+class TestMultiValuedPrediction(object):
+    """Similar-key lookups where one character may have several substitutes."""
+
+    # 'е' has two candidate substitutes (given as a list); 'и' has one (bare string).
+    REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
+
+    # Vocabulary; the same words are listed in dev_data/small/prediction1917.txt.
+    # Not referenced by the tests in this class.
+    DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")
+
+    # (query word, expected similar keys) pairs; [] means no key is expected to match.
+    SUITE = [
+        ('осел', []),
+        ('ель', ['ель']),
+        ('ёль', []),
+        ('хлеб', ['хлѣб']),
+        ('елка', ['ёлка']),
+        ('лесное', ['лѣсное']),
+        ('лесноё', []),
+        ('лёсное', []),
+        ('изобретен', ['изобрѣтён']),
+        ('беленая', ['бѣлёная']),
+        ('белёная', ['бѣлёная']),
+        ('бѣленая', ['бѣлёная']),
+        ('бѣлёная', ['бѣлёная']),
+        ('белѣная', []),
+        ('бѣлѣная', []),
+        ('все', ['всё', 'всѣ']),
+        ('лев', ['лев', 'лёв', 'лѣв']),
+        ('венский', ['вѣнскій']),
+    ]
+
+    # (query word, expected [(key, [value tuple])]) pairs for similar_items().
+    SUITE_ITEMS = [
+        (query, [(match, [encode(match)]) for match in matches])
+        for query, matches in SUITE
+    ]
+
+    # (query word, expected [[value tuple]]) pairs for similar_item_values().
+    SUITE_VALUES = [
+        (query, [[encode(match)] for match in matches])
+        for query, matches in SUITE
+    ]
+
+    def record_dawg(self):
+        """Load the prediction1917 RecordDAWG fixture, built with the "=HHHH" format string."""
+        fixture = data_path("small", "prediction1917-record.dawg")
+        return dawg_python.RecordDAWG(str("=HHHH")).load(fixture)
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE)
+    def test_dawg_prediction(self, word, prediction):
+        """similar_keys() on the plain DAWG fixture."""
+        plain = dawg_python.DAWG().load(data_path("small", "prediction1917.dawg"))
+        assert plain.similar_keys(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE)
+    def test_record_dawg_prediction(self, word, prediction):
+        """similar_keys() on the RecordDAWG fixture."""
+        assert self.record_dawg().similar_keys(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
+    def test_record_dawg_items(self, word, prediction):
+        """similar_items() on the RecordDAWG fixture."""
+        assert self.record_dawg().similar_items(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
+    def test_record_dawg_items_values(self, word, prediction):
+        """similar_item_values() on the RecordDAWG fixture."""
+        assert self.record_dawg().similar_item_values(word, self.REPLACES) == prediction

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій`