
Commit ae4c5fa

Merge pull request #1064 from new5558/dev
[Ready] Reduce reload word tokenizer engine in word_tokenize
2 parents 3aa57c6 + d8d22f6 commit ae4c5fa

File tree: 4 files changed (+43, -9 lines)
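
The pull request applies one idea across the touched modules: instead of constructing a new tokenizer engine on every call to segment(), each module keeps a small module-level cache and reuses the engine it already built. Below is a minimal sketch of that pattern, with a hypothetical ExpensiveTokenizer standing in for an engine that is costly to construct (the class and its behaviour are illustrative, not part of PyThaiNLP):

from typing import Dict, List


class ExpensiveTokenizer:
    """Hypothetical stand-in for an engine that loads a model on construction."""

    def __init__(self, model: str) -> None:
        self.model = model  # imagine heavy model loading happening here

    def tokenize(self, text: str) -> List[str]:
        return text.split()  # placeholder behaviour


_tokenizers: Dict[str, ExpensiveTokenizer] = {}  # module-level cache, keyed by model name


def segment(text: str, model: str = "default") -> List[str]:
    # Build the engine only the first time a given model is requested;
    # later calls with the same model reuse the cached instance.
    if model not in _tokenizers:
        _tokenizers[model] = ExpensiveTokenizer(model)
    return _tokenizers[model].tokenize(text)


print(segment("hello world"))  # builds the "default" tokenizer
print(segment("hello again"))  # reuses it

The per-file diffs below are this pattern applied to the AttaCut, longest-matching, and PyICU engines.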

pythainlp/tokenize/attacut.py

Lines changed: 8 additions & 3 deletions
@@ -8,7 +8,7 @@
 :See Also:
     * `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
 """
-from typing import List
+from typing import Dict, List

 from attacut import Tokenizer

@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)


+_tokenizers: Dict[str, AttacutTokenizer] = {}
+
+
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     if not text or not isinstance(text, str):
        return []

-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)

-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
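
In attacut.py the cache is keyed by the model name string, so each requested AttaCut model is built once per process and reused afterwards. An illustrative call sequence (assumes PyThaiNLP with the attacut package installed; the exact tokens depend on the model):

from pythainlp.tokenize import word_tokenize

# The first call constructs an AttacutTokenizer for the requested model;
# later calls with the same engine reuse the cached instance.
print(word_tokenize("ทดสอบการตัดคำ", engine="attacut"))
print(word_tokenize("ทดสอบอีกครั้ง", engine="attacut"))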

pythainlp/tokenize/longest.py

Lines changed: 11 additions & 5 deletions
@@ -12,7 +12,7 @@

 """
 import re
-from typing import List, Union
+from typing import Dict, List, Union

 from pythainlp import thai_tonemarks
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
@@ -149,9 +149,10 @@ def tokenize(self, text: str) -> List[str]:
         return tokens


-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
     """
     Dictionary-based longest matching word segmentation.

@@ -165,4 +166,9 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE

-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
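
Because segment() in longest.py accepts an arbitrary Trie, its cache is keyed by id(custom_dict): each distinct dictionary object gets its own LongestMatchTokenizer, and passing the same Trie object again reuses the cached one. A self-contained sketch of identity-keyed caching (StubTrie and StubTokenizer are hypothetical stand-ins, not PyThaiNLP types):

from typing import Dict, List


class StubTrie:
    """Hypothetical stand-in for pythainlp.util.Trie."""

    def __init__(self, words: List[str]) -> None:
        self.words = set(words)


class StubTokenizer:
    """Hypothetical stand-in for LongestMatchTokenizer."""

    def __init__(self, trie: StubTrie) -> None:
        self.trie = trie


_cache: Dict[int, StubTokenizer] = {}


def get_tokenizer(trie: StubTrie) -> StubTokenizer:
    # id(trie) identifies this exact object, so a different dictionary
    # object (even with identical words) gets a fresh tokenizer.
    key = id(trie)
    if key not in _cache:
        _cache[key] = StubTokenizer(trie)
    return _cache[key]


a = StubTrie(["ทดสอบ"])
b = StubTrie(["ทดสอบ"])
assert get_tokenizer(a) is get_tokenizer(a)      # same object: cache hit
assert get_tokenizer(a) is not get_tokenizer(b)  # different object: new entry

A side effect of keying by identity is that the cache keeps a reference to every Trie it has seen, so those dictionaries stay alive for the lifetime of the process.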

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
@@ -15,9 +15,10 @@

 from icu import BreakIterator, Locale

+bd = BreakIterator.createWordInstance(Locale("th"))

 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
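
For the ICU engine the costly object is the word BreakIterator, which is now created once at import time and rebound to each input with setText(). A hedged sketch of a word generator built around a shared iterator (the slicing logic is illustrative and may differ from pythainlp's actual _gen_words; it assumes the PyICU package is installed):

from typing import Iterator

from icu import BreakIterator, Locale

_bd = BreakIterator.createWordInstance(Locale("th"))  # built once, reused


def gen_words(text: str) -> Iterator[str]:
    _bd.setText(text)     # point the shared iterator at the new text
    start = _bd.first()
    for end in _bd:       # iterating yields successive boundary offsets
        yield text[start:end]
        start = end


print(list(gen_words("ทดสอบการตัดคำ")))

Sharing one iterator trades per-call construction cost for shared mutable state, so concurrent callers would need their own iterator or a lock.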

tests/core/test_tokenize.py

Lines changed: 22 additions & 0 deletions
@@ -403,6 +403,28 @@ def test_longest(self):
             ["ทดสอบ", " ", "ทดสอบ"],
         )

+    def test_longest_custom_dict(self):
+        """Test switching the custom dict on longest segment function"""
+
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize(
+                "ปวดเฉียบพลัน", engine="longest", custom_dict=dict_trie(["ปวดเฉียบพลัน"])
+            ),
+            ["ปวดเฉียบพลัน"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict=dict_trie(["ทดสอบท"])),
+            ["ทดสอบท", "ดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
         self.assertEqual(multi_cut.segment(""), [])
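
In the new test, the first and last assertions use the default dictionary while the middle two pass custom tries built with dict_trie, so it exercises the identity-keyed cache in longest.py: switching custom_dict between calls must not reuse a tokenizer built for a different dictionary.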
