Commit 9907cb3

perf: improve word tokenizer speed
1 parent cae175c · commit 9907cb3

4 files changed: +38 −5 lines changed

pythainlp/tokenize/attacut.py

Lines changed: 7 additions & 3 deletions
@@ -25,6 +25,8 @@ def __init__(self, model="attacut-sc"):
     def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)
 
+_tokenizers = {}
+
 
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
@@ -39,7 +41,9 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     if not text or not isinstance(text, str):
         return []
+
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)
 
-    _tokenizer = AttacutTokenizer(model)
-
-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
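The attacut change is a simple memoization: constructing AttacutTokenizer(model) is expensive (it loads a model), so segment() now builds it once per model name and reuses the cached instance on later calls. Below is a self-contained sketch of the same pattern, with hypothetical names (ExpensiveTokenizer, cached_segment) that are not part of pythainlp:

from typing import Dict, List


class ExpensiveTokenizer:
    """Stand-in for a tokenizer whose constructor is slow (e.g. loads a model)."""

    def __init__(self, model: str):
        self.model = model

    def tokenize(self, text: str) -> List[str]:
        return text.split()


_cache: Dict[str, ExpensiveTokenizer] = {}


def cached_segment(text: str, model: str = "default") -> List[str]:
    # Build the tokenizer only on the first call for a given model name,
    # then reuse the cached instance on every later call.
    if model not in _cache:
        _cache[model] = ExpensiveTokenizer(model)
    return _cache[model].tokenize(text)

The cached instances live for the lifetime of the process, which is a reasonable trade for a small, fixed set of model names.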

pythainlp/tokenize/longest.py

Lines changed: 9 additions & 1 deletion
@@ -150,6 +150,8 @@ def tokenize(self, text: str) -> List[str]:
         return tokens
 
 
+_tokenizers = {}
+
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
@@ -166,4 +168,10 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
+    # return LongestMatchTokenizer(custom_dict).tokenize(text)
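Here the cache key is id(custom_dict), so reuse depends on passing the same Trie object across calls; a different object with identical contents gets its own LongestMatchTokenizer. A minimal sketch of identity-keyed caching (none of these names are pythainlp's; build_tokenizer is a hypothetical stand-in for the tokenizer constructor):

_tokenizer_cache = {}


def build_tokenizer(words):
    # Hypothetical stand-in for an expensive constructor that preprocesses the dict.
    return sorted(words, key=len, reverse=True)


def segment_cached(words):
    key = id(words)  # identity of the dict object, not its contents
    if key not in _tokenizer_cache:
        _tokenizer_cache[key] = build_tokenizer(words)
    return _tokenizer_cache[key]


custom_a = {"ปวดเฉียบพลัน", "ทดสอบ"}
custom_b = {"ปวดเฉียบพลัน", "ทดสอบ"}  # equal contents, but a different object
segment_cached(custom_a)
assert segment_cached(custom_a) is _tokenizer_cache[id(custom_a)]  # second call reuses the entry
assert id(custom_a) != id(custom_b)  # custom_b would get a separate entry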

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
@@ -15,9 +15,10 @@
 
 from icu import BreakIterator, Locale
 
+bd = BreakIterator.createWordInstance(Locale("th"))
 
 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
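In pyicu.py the BreakIterator is now created once at import time, and _gen_words() only resets its text. A rough way to compare the two behaviours (an illustration only, not part of the commit; requires PyICU, and numbers vary by machine):

import timeit

setup = """
from icu import BreakIterator, Locale
bd = BreakIterator.createWordInstance(Locale("th"))
text = "ทดสอบการตัดคำภาษาไทย"
"""

# Rebuild the BreakIterator on every call (old behaviour).
per_call = """
it = BreakIterator.createWordInstance(Locale("th"))
it.setText(text)
list(it)
"""

# Reuse the shared instance and only reset its text (new behaviour).
reuse = """
bd.setText(text)
list(bd)
"""

print("construct per call:", timeit.timeit(per_call, setup=setup, number=2000))
print("reuse one instance:", timeit.timeit(reuse, setup=setup, number=2000))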

tests/core/test_tokenize.py

Lines changed: 20 additions & 0 deletions
@@ -403,6 +403,26 @@ def test_longest(self):
             ["ทดสอบ", " ", "ทดสอบ"],
         )
 
+    def test_longest_custom_dict(self):
+        """Test switching the custom dict on longest segment function"""
+
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize("ปวดเฉียบพลัน", engine="longest", custom_dict={'ปวดเฉียบพลัน'}),
+            ["ปวดเฉียบพลัน"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict={'ทดสอบท'}),
+            ['ทดสอบท', 'ดสอบ'],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
         self.assertEqual(multi_cut.segment(""), [])
