
Commit ae4c5fa

Merge pull request #1064 from new5558/dev
[Ready] Reduce reload word tokenizer engine in word_tokenize
2 parents 3aa57c6 + d8d22f6 commit ae4c5fa

File tree: 4 files changed (+43, -9 lines)
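
The pull request applies one idea across the touched modules: instead of constructing a new tokenizer engine on every call to segment(), each module keeps a small module-level cache and reuses the engine it already built. Below is a minimal sketch of that pattern, with a hypothetical ExpensiveTokenizer standing in for an engine that is costly to construct (the class and its behaviour are illustrative, not part of PyThaiNLP):

from typing import Dict, List


class ExpensiveTokenizer:
    """Hypothetical stand-in for an engine that loads a model on construction."""

    def __init__(self, model: str) -> None:
        self.model = model  # imagine heavy model loading happening here

    def tokenize(self, text: str) -> List[str]:
        return text.split()  # placeholder behaviour


_tokenizers: Dict[str, ExpensiveTokenizer] = {}  # module-level cache, keyed by model name


def segment(text: str, model: str = "default") -> List[str]:
    # Build the engine only the first time a given model is requested;
    # later calls with the same model reuse the cached instance.
    if model not in _tokenizers:
        _tokenizers[model] = ExpensiveTokenizer(model)
    return _tokenizers[model].tokenize(text)


print(segment("hello world"))  # builds the "default" tokenizer
print(segment("hello again"))  # reuses it

The per-file diffs below are this pattern applied to the AttaCut, longest-matching, and PyICU engines.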

pythainlp/tokenize/attacut.py

Lines changed: 8 additions & 3 deletions
@@ -8,7 +8,7 @@
 :See Also:
     * `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
 """
-from typing import List
+from typing import Dict, List

 from attacut import Tokenizer

@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)


+_tokenizers: Dict[str, AttacutTokenizer] = {}
+
+
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     if not text or not isinstance(text, str):
        return []

-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)

-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
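
In attacut.py the cache is keyed by the model name string, so each requested AttaCut model is built once per process and reused afterwards. An illustrative call sequence (assumes PyThaiNLP with the attacut package installed; the exact tokens depend on the model):

from pythainlp.tokenize import word_tokenize

# The first call constructs an AttacutTokenizer for the requested model;
# later calls with the same engine reuse the cached instance.
print(word_tokenize("ทดสอบการตัดคำ", engine="attacut"))
print(word_tokenize("ทดสอบอีกครั้ง", engine="attacut"))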

pythainlp/tokenize/longest.py

Lines changed: 11 additions & 5 deletions
@@ -12,7 +12,7 @@

 """
 import re
-from typing import List, Union
+from typing import Dict, List, Union

 from pythainlp import thai_tonemarks
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
@@ -149,9 +149,10 @@ def tokenize(self, text: str) -> List[str]:
         return tokens


-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
     """
     Dictionary-based longest matching word segmentation.

@@ -165,4 +166,9 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE

-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
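
Because segment() in longest.py accepts an arbitrary Trie, its cache is keyed by id(custom_dict): each distinct dictionary object gets its own LongestMatchTokenizer, and passing the same Trie object again reuses the cached one. A self-contained sketch of identity-keyed caching (StubTrie and StubTokenizer are hypothetical stand-ins, not PyThaiNLP types):

from typing import Dict, List


class StubTrie:
    """Hypothetical stand-in for pythainlp.util.Trie."""

    def __init__(self, words: List[str]) -> None:
        self.words = set(words)


class StubTokenizer:
    """Hypothetical stand-in for LongestMatchTokenizer."""

    def __init__(self, trie: StubTrie) -> None:
        self.trie = trie


_cache: Dict[int, StubTokenizer] = {}


def get_tokenizer(trie: StubTrie) -> StubTokenizer:
    # id(trie) identifies this exact object, so a different dictionary
    # object (even with identical words) gets a fresh tokenizer.
    key = id(trie)
    if key not in _cache:
        _cache[key] = StubTokenizer(trie)
    return _cache[key]


a = StubTrie(["ทดสอบ"])
b = StubTrie(["ทดสอบ"])
assert get_tokenizer(a) is get_tokenizer(a)      # same object: cache hit
assert get_tokenizer(a) is not get_tokenizer(b)  # different object: new entry

A side effect of keying by identity is that the cache keeps a reference to every Trie it has seen, so those dictionaries stay alive for the lifetime of the process.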

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
@@ -15,9 +15,10 @@

 from icu import BreakIterator, Locale

+bd = BreakIterator.createWordInstance(Locale("th"))

 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
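
For the ICU engine the costly object is the word BreakIterator, which is now created once at import time and rebound to each input with setText(). A hedged sketch of a word generator built around a shared iterator (the slicing logic is illustrative and may differ from pythainlp's actual _gen_words; it assumes the PyICU package is installed):

from typing import Iterator

from icu import BreakIterator, Locale

_bd = BreakIterator.createWordInstance(Locale("th"))  # built once, reused


def gen_words(text: str) -> Iterator[str]:
    _bd.setText(text)     # point the shared iterator at the new text
    start = _bd.first()
    for end in _bd:       # iterating yields successive boundary offsets
        yield text[start:end]
        start = end


print(list(gen_words("ทดสอบการตัดคำ")))

Sharing one iterator trades per-call construction cost for shared mutable state, so concurrent callers would need their own iterator or a lock.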

tests/core/test_tokenize.py

Lines changed: 22 additions & 0 deletions
@@ -403,6 +403,28 @@ def test_longest(self):
             ["ทดสอบ", " ", "ทดสอบ"],
         )

+    def test_longest_custom_dict(self):
+        """Test switching the custom dict on longest segment function"""
+
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize(
+                "ปวดเฉียบพลัน", engine="longest", custom_dict=dict_trie(["ปวดเฉียบพลัน"])
+            ),
+            ["ปวดเฉียบพลัน"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict=dict_trie(["ทดสอบท"])),
+            ["ทดสอบท", "ดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
         self.assertEqual(multi_cut.segment(""), [])
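
In the new test, the first and last assertions use the default dictionary while the middle two pass custom tries built with dict_trie, so it exercises the identity-keyed cache in longest.py: switching custom_dict between calls must not reuse a tokenizer built for a different dictionary.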
