Commit 9907cb3

perf: improve word tokenizer speed
1 parent cae175c · commit 9907cb3

4 files changed: +38 −5 lines changed

pythainlp/tokenize/attacut.py

Lines changed: 7 additions & 3 deletions
@@ -25,6 +25,8 @@ def __init__(self, model="attacut-sc"):
     def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)
 
+_tokenizers = {}
+
 
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
@@ -39,7 +41,9 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     if not text or not isinstance(text, str):
         return []
+
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)
 
-    _tokenizer = AttacutTokenizer(model)
-
-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
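The attacut change is a simple memoization: constructing AttacutTokenizer(model) is expensive (it loads a model), so segment() now builds it once per model name and reuses the cached instance on later calls. Below is a self-contained sketch of the same pattern, with hypothetical names (ExpensiveTokenizer, cached_segment) that are not part of pythainlp:

from typing import Dict, List


class ExpensiveTokenizer:
    """Stand-in for a tokenizer whose constructor is slow (e.g. loads a model)."""

    def __init__(self, model: str):
        self.model = model

    def tokenize(self, text: str) -> List[str]:
        return text.split()


_cache: Dict[str, ExpensiveTokenizer] = {}


def cached_segment(text: str, model: str = "default") -> List[str]:
    # Build the tokenizer only on the first call for a given model name,
    # then reuse the cached instance on every later call.
    if model not in _cache:
        _cache[model] = ExpensiveTokenizer(model)
    return _cache[model].tokenize(text)

The cached instances live for the lifetime of the process, which is a reasonable trade for a small, fixed set of model names.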

pythainlp/tokenize/longest.py

Lines changed: 9 additions & 1 deletion
@@ -150,6 +150,8 @@ def tokenize(self, text: str) -> List[str]:
         return tokens
 
 
+_tokenizers = {}
+
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
@@ -166,4 +168,10 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
+    # return LongestMatchTokenizer(custom_dict).tokenize(text)
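Here the cache key is id(custom_dict), so reuse depends on passing the same Trie object across calls; a different object with identical contents gets its own LongestMatchTokenizer. A minimal sketch of identity-keyed caching (none of these names are pythainlp's; build_tokenizer is a hypothetical stand-in for the tokenizer constructor):

_tokenizer_cache = {}


def build_tokenizer(words):
    # Hypothetical stand-in for an expensive constructor that preprocesses the dict.
    return sorted(words, key=len, reverse=True)


def segment_cached(words):
    key = id(words)  # identity of the dict object, not its contents
    if key not in _tokenizer_cache:
        _tokenizer_cache[key] = build_tokenizer(words)
    return _tokenizer_cache[key]


custom_a = {"ปวดเฉียบพลัน", "ทดสอบ"}
custom_b = {"ปวดเฉียบพลัน", "ทดสอบ"}  # equal contents, but a different object
segment_cached(custom_a)
assert segment_cached(custom_a) is _tokenizer_cache[id(custom_a)]  # second call reuses the entry
assert id(custom_a) != id(custom_b)  # custom_b would get a separate entry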

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
@@ -15,9 +15,10 @@
 
 from icu import BreakIterator, Locale
 
+bd = BreakIterator.createWordInstance(Locale("th"))
 
 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
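In pyicu.py the BreakIterator is now created once at import time, and _gen_words() only resets its text. A rough way to compare the two behaviours (an illustration only, not part of the commit; requires PyICU, and numbers vary by machine):

import timeit

setup = """
from icu import BreakIterator, Locale
bd = BreakIterator.createWordInstance(Locale("th"))
text = "ทดสอบการตัดคำภาษาไทย"
"""

# Rebuild the BreakIterator on every call (old behaviour).
per_call = """
it = BreakIterator.createWordInstance(Locale("th"))
it.setText(text)
list(it)
"""

# Reuse the shared instance and only reset its text (new behaviour).
reuse = """
bd.setText(text)
list(bd)
"""

print("construct per call:", timeit.timeit(per_call, setup=setup, number=2000))
print("reuse one instance:", timeit.timeit(reuse, setup=setup, number=2000))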

tests/core/test_tokenize.py

Lines changed: 20 additions & 0 deletions
@@ -403,6 +403,26 @@ def test_longest(self):
             ["ทดสอบ", " ", "ทดสอบ"],
         )
 
+    def test_longest_custom_dict(self):
+        """Test switching the custom dict on longest segment function"""
+
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize("ปวดเฉียบพลัน", engine="longest", custom_dict={'ปวดเฉียบพลัน'}),
+            ["ปวดเฉียบพลัน"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict={'ทดสอบท'}),
+            ['ทดสอบท', 'ดสอบ'],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
         self.assertEqual(multi_cut.segment(""), [])
