Skip to content

Commit 21bd8bb

Browse files
authored
nondeterministic char_substitutes (PR port) (#4)
* try to port to a fork in a less fancy way
* fix merge artifacts
* fix some more merge artifacts
* fix similar_item_values
* add test data
* remove commented code
* add sources for binary files
1 parent 98d9aa4 commit 21bd8bb

File tree

6 files changed

+125
-27
lines changed

6 files changed

+125
-27
lines changed

dawg_python/dawgs.py

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars):
3838
b_step = key[word_pos].encode('utf8')
3939

4040
if b_step in replace_chars:
41-
next_index = index
42-
b_replace_char, u_replace_char = replace_chars[b_step]
41+
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
42+
next_index = index
4343

44-
next_index = self.dct.follow_bytes(b_replace_char, next_index)
44+
next_index = self.dct.follow_bytes(b_replace_char, next_index)
4545

46-
if next_index is not None:
47-
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
48-
extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
49-
res += extra_keys
46+
if next_index:
47+
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
48+
extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
49+
res += extra_keys
5050

5151
index = self.dct.follow_bytes(b_step, index)
5252
if index is None:
@@ -67,7 +67,7 @@ def similar_keys(self, key, replaces):
6767
6868
``replaces`` is an object obtained from
6969
``DAWG.compile_replaces(mapping)`` where mapping is a dict
70-
that maps single-char unicode sitrings to another single-char
70+
that maps single-char unicode strings to (one or more) single-char
7171
unicode strings.
7272
7373
This may be useful e.g. for handling single-character umlauts.
@@ -77,14 +77,18 @@ def similar_keys(self, key, replaces):
7777
@classmethod
7878
def compile_replaces(cls, replaces):
7979

80-
for k, v in replaces.items():
81-
if len(k) != 1 or len(v) != 1:
82-
raise ValueError("Keys and values must be single-char unicode strings.")
80+
for k,v in replaces.items():
81+
if len(k) != 1:
82+
raise ValueError("Keys must be single-char unicode strings.")
83+
if (isinstance(v, str) and len(v) != 1):
84+
raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
85+
if isinstance(v, list) and (any(len(v_entry) != 1 for v_entry in v) or len(v) < 1):
86+
raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
8387

8488
return dict(
8589
(
8690
k.encode('utf8'),
87-
(v.encode('utf8'), v),
91+
[(v_entry.encode('utf8'), v_entry) for v_entry in v]
8892
)
8993
for k, v in replaces.items()
9094
)
@@ -326,14 +330,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars):
326330
b_step = key[word_pos].encode('utf8')
327331

328332
if b_step in replace_chars:
329-
next_index = index
330-
b_replace_char, u_replace_char = replace_chars[b_step]
333+
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
334+
next_index = index
331335

332-
next_index = self.dct.follow_bytes(b_replace_char, next_index)
333-
if next_index:
334-
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
335-
extra_items = self._similar_items(prefix, key, next_index, replace_chars)
336-
res += extra_items
336+
next_index = self.dct.follow_bytes(b_replace_char, next_index)
337+
338+
if next_index:
339+
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
340+
extra_items = self._similar_items(prefix, key, next_index, replace_chars)
341+
res += extra_items
337342

338343
index = self.dct.follow_bytes(b_step, index)
339344
if not index:
@@ -356,7 +361,7 @@ def similar_items(self, key, replaces):
356361
357362
``replaces`` is an object obtained from
358363
``DAWG.compile_replaces(mapping)`` where mapping is a dict
359-
that maps single-char unicode sitrings to another single-char
364+
that maps single-char unicode strings to (one or more) single-char
360365
unicode strings.
361366
"""
362367
return self._similar_items("", key, self.dct.ROOT, replaces)
@@ -370,13 +375,14 @@ def _similar_item_values(self, start_pos, key, index, replace_chars):
370375
b_step = key[word_pos].encode('utf8')
371376

372377
if b_step in replace_chars:
373-
next_index = index
374-
b_replace_char, u_replace_char = replace_chars[b_step]
378+
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
379+
next_index = index
380+
381+
next_index = self.dct.follow_bytes(b_replace_char, next_index)
375382

376-
next_index = self.dct.follow_bytes(b_replace_char, next_index)
377-
if next_index:
378-
extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
379-
res += extra_items
383+
if next_index:
384+
extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
385+
res += extra_items
380386

381387
index = self.dct.follow_bytes(b_step, index)
382388
if not index:
@@ -398,7 +404,7 @@ def similar_item_values(self, key, replaces):
398404
399405
``replaces`` is an object obtained from
400406
``DAWG.compile_replaces(mapping)`` where mapping is a dict
401-
that maps single-char unicode sitrings to another single-char
407+
that maps single-char unicode strings to (one or more) single-char
402408
unicode strings.
403409
"""
404410
return self._similar_item_values(0, key, self.dct.ROOT, replaces)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
,0,1,2,3
2+
хлѣб,98,51,54,49
3+
ёлка,54,99,99,98
4+
ель,51,53,101,101
5+
лѣс,101,56,102,48
6+
лѣсное,57,99,53,56
7+
всё,50,99,55,53
8+
всѣ,49,99,54,48
9+
бѣлёная,97,49,56,97
10+
изобрѣтён,51,99,99,99
11+
лев,98,50,52,56
12+
лёв,50,101,51,99
13+
лѣв,99,100,102,50
14+
вѣнскій,100,57,101,57
3.01 KB
Binary file not shown.

dev_data/small/prediction1917.dawg

1 KB
Binary file not shown.

dev_data/small/prediction1917.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій

tests/test_prediction.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
import dawg_python
44
from .utils import data_path
5+
from hashlib import md5
6+
7+
8+
def encode(w):
9+
code = md5(w.encode('utf8'))
10+
return tuple([ord(c) for c in code.hexdigest()])[:4]
511

612

713
class TestPrediction:
@@ -62,3 +68,74 @@ def test_record_dawg_items(self, word, prediction):
6268
def test_record_dawg_items_values(self, word, prediction):
6369
d = self.record_dawg()
6470
assert d.similar_item_values(word, self.REPLACES) == prediction
71+
72+
73+
class TestMultiValuedPrediction(object):
74+
75+
REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
76+
77+
DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")
78+
79+
SUITE = [
80+
('осел', []),
81+
('ель', ['ель']),
82+
('ёль', []),
83+
('хлеб', ['хлѣб']),
84+
('елка', ['ёлка']),
85+
('лесное', ['лѣсное']),
86+
('лесноё', []),
87+
('лёсное', []),
88+
('изобретен', ['изобрѣтён']),
89+
('беленая', ['бѣлёная']),
90+
('белёная', ['бѣлёная']),
91+
('бѣленая', ['бѣлёная']),
92+
('бѣлёная', ['бѣлёная']),
93+
('белѣная', []),
94+
('бѣлѣная', []),
95+
('все', ['всё', 'всѣ']),
96+
('лев', ['лев', 'лёв', 'лѣв']),
97+
('венский', ['вѣнскій']),
98+
]
99+
100+
SUITE_ITEMS = [
101+
(
102+
it[0], # key
103+
[
104+
(w, [encode(w)]) # item, value pair
105+
for w in it[1]
106+
]
107+
)
108+
for it in SUITE
109+
]
110+
111+
SUITE_VALUES = [
112+
(
113+
it[0], # key
114+
[[encode(w)] for w in it[1]]
115+
)
116+
for it in SUITE
117+
]
118+
119+
def record_dawg(self):
120+
path = data_path("small", "prediction1917-record.dawg")
121+
return dawg_python.RecordDAWG(str("=HHHH")).load(path)
122+
123+
@pytest.mark.parametrize(("word", "prediction"), SUITE)
124+
def test_dawg_prediction(self, word, prediction):
125+
d = dawg_python.DAWG().load(data_path("small", "prediction1917.dawg"))
126+
assert d.similar_keys(word, self.REPLACES) == prediction
127+
128+
@pytest.mark.parametrize(("word", "prediction"), SUITE)
129+
def test_record_dawg_prediction(self, word, prediction):
130+
d = self.record_dawg()
131+
assert d.similar_keys(word, self.REPLACES) == prediction
132+
133+
@pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
134+
def test_record_dawg_items(self, word, prediction):
135+
d = self.record_dawg()
136+
assert d.similar_items(word, self.REPLACES) == prediction
137+
138+
@pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
139+
def test_record_dawg_items_values(self, word, prediction):
140+
d = self.record_dawg()
141+
assert d.similar_item_values(word, self.REPLACES) == prediction

0 commit comments

Comments
 (0)