@@ -111,10 +111,10 @@ def word_tokenize(
111111 :param str engine: name of the tokenizer to be used
112112 :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support)
113113 :param bool keep_whitespace: True to keep whitespace, a common mark
114- for end of phrase in Thai.
115- Otherwise, whitespace is omitted.
114+ for end of phrase in Thai.
115+ Otherwise, whitespace is omitted.
116116 :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
117- Otherwise, formatted numeric could be wrongly separated.
117+ Otherwise, formatted numeric could be wrongly separated.
118118
119119 :return: list of words
120120 :rtype: List[str]
@@ -221,6 +221,18 @@ def word_tokenize(
221221
222222 segments = []
223223
224+ if custom_dict and engine in (
225+ "attacut",
226+ "icu",
227+ "nercut",
228+ "sefr_cut",
229+ "tltk",
230+ "oskut"
231+ ):
232+ raise NotImplementedError(
233+ f"The {engine} engine does not support custom dictionaries."
234+ )
235+
224236 if engine in ("newmm", "onecut"):
225237 from pythainlp.tokenize.newmm import segment
226238
@@ -366,7 +378,7 @@ def sent_tokenize(
366378 and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
367379 * *whitespace+newline* - split by whitespace and newline.
368380 * *whitespace* - split by whitespace, specifically with \
369- :class:`regex` pattern ``r" +"``
381+ :class:`regex` pattern ``r" +"``
370382 :Example:
371383
372384 Split the text based on *whitespace*::
@@ -854,9 +866,9 @@ def __init__(
854866 used to create a trie, or an instantiated
855867 :class:`pythainlp.util.Trie` object.
856868 :param str engine: choose between different options of tokenizer engines
857- (i.e. *newmm*, *mm*, *longest*, *deepcut*)
869+ (i.e. *newmm*, *mm*, *longest*, *deepcut*)
858870 :param bool keep_whitespace: True to keep whitespace, a common mark
859- for end of phrase in Thai
871+ for end of phrase in Thai
860872 """
861873 self.__trie_dict = Trie([])
862874 if custom_dict:
0 commit comments