Commit 1271452

Merge pull request #1066 from PyThaiNLP/wannaphong/fix-custom-dict-error
Fix custom dict error for unsupported tokenization engines
2 parents 0850aa8 + 9393cb8 commit 1271452

File tree

2 files changed: 23 additions & 6 deletions

pythainlp/tokenize/core.py

Lines changed: 18 additions & 6 deletions
@@ -111,10 +111,10 @@ def word_tokenize(
     :param str engine: name of the tokenizer to be used
     :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support)
     :param bool keep_whitespace: True to keep whitespace, a common mark
-        for end of phrase in Thai.
-        Otherwise, whitespace is omitted.
+        for end of phrase in Thai.
+        Otherwise, whitespace is omitted.
     :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
-        Otherwise, formatted numeric could be wrongly separated.
+        Otherwise, formatted numeric could be wrongly separated.
 
     :return: list of words
     :rtype: List[str]
@@ -221,6 +221,18 @@ def word_tokenize(
 
     segments = []
 
+    if custom_dict and engine in (
+        "attacut",
+        "icu",
+        "nercut",
+        "sefr_cut",
+        "tltk",
+        "oskut"
+    ):
+        raise NotImplementedError(
+            f"The {engine} engine does not support custom dictionaries."
+        )
+
     if engine in ("newmm", "onecut"):
         from pythainlp.tokenize.newmm import segment
 
@@ -366,7 +378,7 @@ def sent_tokenize(
         and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
     * *whitespace+newline* - split by whitespace and newline.
     * *whitespace* - split by whitespace, specifically with \
-      :class:`regex` pattern ``r" +"``
+      :class:`regex` pattern ``r" +"``
     :Example:
 
     Split the text based on *whitespace*::
@@ -854,9 +866,9 @@ def __init__(
             used to create a trie, or an instantiated
             :class:`pythainlp.util.Trie` object.
         :param str engine: choose between different options of tokenizer engines
-            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
+            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
         :param bool keep_whitespace: True to keep whitespace, a common mark
-            for end of phrase in Thai
+            for end of phrase in Thai
         """
         self.__trie_dict = Trie([])
         if custom_dict:
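
For context, a minimal usage sketch of the behavior this guard introduces; it is not part of the patch and assumes a PyThaiNLP build that includes this commit, with dict_trie importable from pythainlp.util as in the accompanying test. A dictionary-aware engine such as newmm keeps accepting custom_dict, while an engine from the list above now raises NotImplementedError before any engine-specific code is reached:

    # Sketch only: "newmm" is dictionary-aware, "icu" is on the new unsupported list.
    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    trie = dict_trie(["ไฟ"])

    # custom_dict is honored by newmm.
    print(word_tokenize("รถไฟฟ้า", custom_dict=trie, engine="newmm"))

    # The guard runs before engine dispatch, so this raises immediately.
    try:
        word_tokenize("รถไฟฟ้า", custom_dict=trie, engine="icu")
    except NotImplementedError as exc:
        print(exc)  # "The icu engine does not support custom dictionaries."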

tests/core/test_tokenize.py

Lines changed: 5 additions & 0 deletions
@@ -356,6 +356,11 @@ def test_word_tokenize(self):
             "ไฟ", word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
         )
 
+        with self.assertRaises(NotImplementedError):
+            word_tokenize(
+                "รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]), engine="icu"
+            )
+
     def test_etcc(self):
         self.assertEqual(etcc.segment(None), [])
         self.assertEqual(etcc.segment(""), [])
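
To run just the updated test module locally, one hedged option (assuming the tests are executed from the repository root with PyThaiNLP and its test dependencies installed; the project's own CI invocation may differ) is the standard-library loader:

    # Hypothetical runner, roughly equivalent to "python -m unittest tests.core.test_tokenize";
    # not part of the repository.
    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName("tests.core.test_tokenize")
    unittest.TextTestRunner(verbosity=2).run(suite)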
