1414from pythainlp import thai_lead_vowels as lead_v
1515from pythainlp import thai_tonemarks as tonemarks
1616from pythainlp .tokenize import word_tokenize
17+ from pythainlp .tools import warn_deprecation
1718
1819_DANGLING_CHARS = f"{ above_v } { below_v } { tonemarks } \u0e3a \u0e4c \u0e4d \u0e4e "
1920_RE_REMOVE_DANGLINGS = re .compile (f"^[{ _DANGLING_CHARS } ]+" )
@@ -249,12 +250,13 @@ def normalize(text: str) -> str:
249250 return text
250251
251252
252- def maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
253+ def expand_maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
253254 """
254- Thai MaiYaMok
255+ Expand Maiyamok.
256+
257+ Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
258+ repetition. This function preprocesses Thai text by expanding Maiyamok
255259
256- MaiYaMok (ๆ) is the mark of duplicate word in Thai language.
257- This function is preprocessing MaiYaMok in Thai sentence.
258260
259261 :param Union[str, List[str]] sent: input sentence (list or str)
260262 :return: list of words
@@ -265,15 +267,12 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
265267
266268 from pythainlp.util import maiyamok
267269
268- maiyamok("เด็กๆชอบไปโรงเรียน")
269- # output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
270-
271- maiyamok(["ทำไม", "คน", "ดี", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"])
272- # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
270+ maiyamok("เด็กๆกิน")
271+ # output: ['เด็ก', 'เด็ก', 'กิน']
273272 """
274273 if isinstance (sent , str ):
275274 sent = word_tokenize (sent )
276- _list_word = []
275+ _list_word : list [ str ] = []
277276 i = 0
278277 for j , text in enumerate (sent ):
279278 if text .isspace () and "ๆ" in sent [j + 1 ]:
@@ -292,3 +291,28 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
292291 _list_word .append (text )
293292 i += 1
294293 return _list_word
294+
295+
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok (deprecated name; delegates to ``expand_maiyamok``).

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function first calls ``warn_deprecation`` with the
    names ``pythainlp.util.maiyamok`` and ``pythainlp.util.expand_maiyamok``,
    then passes ``sent`` to ``expand_maiyamok`` and returns its result
    unchanged.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import maiyamok

        maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']
    """
    # Name of this function and of the function it forwards to.
    old_name = "pythainlp.util.maiyamok"
    new_name = "pythainlp.util.expand_maiyamok"
    warn_deprecation(old_name, new_name)

    # All actual work happens in expand_maiyamok; nothing is altered here.
    expanded = expand_maiyamok(sent)
    return expanded
0 commit comments