Commit 1271452

Merge pull request #1066 from PyThaiNLP/wannaphong/fix-custom-dict-error
Fix custom dict error for unsupported tokenization engines
2 parents 0850aa8 + 9393cb8 commit 1271452

File tree

2 files changed: 23 additions & 6 deletions

pythainlp/tokenize/core.py

Lines changed: 18 additions & 6 deletions
@@ -111,10 +111,10 @@ def word_tokenize(
     :param str engine: name of the tokenizer to be used
     :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support)
     :param bool keep_whitespace: True to keep whitespace, a common mark
-        for end of phrase in Thai.
-        Otherwise, whitespace is omitted.
+        for end of phrase in Thai.
+        Otherwise, whitespace is omitted.
     :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
-        Otherwise, formatted numeric could be wrongly separated.
+        Otherwise, formatted numeric could be wrongly separated.
 
     :return: list of words
     :rtype: List[str]
@@ -221,6 +221,18 @@ def word_tokenize(
 
     segments = []
 
+    if custom_dict and engine in (
+        "attacut",
+        "icu",
+        "nercut",
+        "sefr_cut",
+        "tltk",
+        "oskut"
+    ):
+        raise NotImplementedError(
+            f"The {engine} engine does not support custom dictionaries."
+        )
+
     if engine in ("newmm", "onecut"):
         from pythainlp.tokenize.newmm import segment
 
@@ -366,7 +378,7 @@ def sent_tokenize(
         and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
     * *whitespace+newline* - split by whitespace and newline.
     * *whitespace* - split by whitespace, specifically with \
-      :class:`regex` pattern ``r" +"``
+      :class:`regex` pattern ``r" +"``
     :Example:
 
     Split the text based on *whitespace*::
@@ -854,9 +866,9 @@ def __init__(
             used to create a trie, or an instantiated
             :class:`pythainlp.util.Trie` object.
         :param str engine: choose between different options of tokenizer engines
-            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
+            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
         :param bool keep_whitespace: True to keep whitespace, a common mark
-            for end of phrase in Thai
+            for end of phrase in Thai
         """
         self.__trie_dict = Trie([])
         if custom_dict:
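
For context, a minimal usage sketch of the behavior this guard introduces; it is not part of the patch and assumes a PyThaiNLP build that includes this commit, with dict_trie importable from pythainlp.util as in the accompanying test. A dictionary-aware engine such as newmm keeps accepting custom_dict, while an engine from the list above now raises NotImplementedError before any engine-specific code is reached:

    # Sketch only: "newmm" is dictionary-aware, "icu" is on the new unsupported list.
    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    trie = dict_trie(["ไฟ"])

    # custom_dict is honored by newmm.
    print(word_tokenize("รถไฟฟ้า", custom_dict=trie, engine="newmm"))

    # The guard runs before engine dispatch, so this raises immediately.
    try:
        word_tokenize("รถไฟฟ้า", custom_dict=trie, engine="icu")
    except NotImplementedError as exc:
        print(exc)  # "The icu engine does not support custom dictionaries."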

tests/core/test_tokenize.py

Lines changed: 5 additions & 0 deletions
@@ -356,6 +356,11 @@ def test_word_tokenize(self):
             "ไฟ", word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
         )
 
+        with self.assertRaises(NotImplementedError):
+            word_tokenize(
+                "รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]), engine="icu"
+            )
+
     def test_etcc(self):
         self.assertEqual(etcc.segment(None), [])
         self.assertEqual(etcc.segment(""), [])
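
To run just the updated test module locally, one hedged option (assuming the tests are executed from the repository root with PyThaiNLP and its test dependencies installed; the project's own CI invocation may differ) is the standard-library loader:

    # Hypothetical runner, roughly equivalent to "python -m unittest tests.core.test_tokenize";
    # not part of the repository.
    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName("tests.core.test_tokenize")
    unittest.TextTestRunner(verbosity=2).run(suite)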
