44"""
55Text normalization
66"""
7+
78import re
89from typing import List , Union
910
@@ -76,7 +77,7 @@ def remove_dangling(text: str) -> str:
7677
7778 from pythainlp.util import remove_dangling
7879
79- remove_dangling('๊ก' )
80+ remove_dangling("๊ก" )
8081 # output: 'ก'
8182 """
8283 return _RE_REMOVE_DANGLINGS .sub ("" , text )
@@ -98,7 +99,7 @@ def remove_dup_spaces(text: str) -> str:
9899
99100 from pythainlp.util import remove_dup_spaces
100101
101- remove_dup_spaces(' ก ข ค' )
102+ remove_dup_spaces(" ก ข ค" )
102103 # output: 'ก ข ค'
103104 """
104105 while " " in text :
@@ -132,7 +133,7 @@ def remove_tonemark(text: str) -> str:
132133
133134 from pythainlp.util import remove_tonemark
134135
135- remove_tonemark(' สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด' )
136+ remove_tonemark(" สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด" )
136137 # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
137138 """
138139 for ch in tonemarks :
@@ -235,10 +236,10 @@ def normalize(text: str) -> str:
235236
236237 from pythainlp.util import normalize
237238
238- normalize(' เเปลก' ) # starts with two Sara E
239+ normalize(" เเปลก" ) # starts with two Sara E
239240 # output: แปลก
240241
241- normalize(' นานาาา' )
242+ normalize(" นานาาา" )
242243 # output: นานา
243244 """
244245 text = remove_zw (text )
@@ -249,46 +250,87 @@ def normalize(text: str) -> str:
249250 return text
250251
251252
def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function preprocesses Thai text by replacing each
    Maiyamok with the word being repeated.

    A string input is tokenized with ``word_tokenize`` first; a list input
    is used as the token sequence. Whitespace-only tokens sitting between
    a word and the Maiyamok that follows it are dropped, and a Maiyamok
    that has no word in front of it is discarded. The input list is not
    modified.

    :param Union[str, List[str]] sent: sentence (list or string)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("คนๆนก")
        # output: ['คน', 'คน', 'นก']

        expand_maiyamok(
            ["ทำไม", "คน", "ดี", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"]
        )
        # output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    maiyamok_char = "ๆ"

    # Break up tokens that have Maiyamok attached to other text,
    # e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน". The capturing group keeps each
    # Maiyamok as its own token; empty strings from the split are dropped.
    tokens: List[str] = []
    for token in sent:
        tokens.extend(part for part in re.split(r"(ๆ)", token) if part)

    # Walk backward so that by the time a word is reached we already know
    # how many Maiyamok follow it.
    expanded_reversed: List[str] = []
    pending = 0  # Maiyamok seen since the last repeated word
    for token in reversed(tokens):
        if token == maiyamok_char:
            pending += 1
        elif pending == 0:
            expanded_reversed.append(token)
        elif token.isspace():
            # Whitespace between a word and its Maiyamok is removed.
            continue
        else:
            expanded_reversed.extend([token] * (pending + 1))
            pending = 0

    expanded_reversed.reverse()
    return expanded_reversed
309+
310+
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Deprecated. Use expand_maiyamok() instead.

    This function calls ``warn_deprecation`` and then returns the result
    of ``expand_maiyamok(sent)`` unchanged.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. Expanding it replaces each Maiyamok with the word being
    repeated.

    :param Union[str, List[str]] sent: sentence (list or string)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("คนๆนก")
        # output: ['คน', 'คน', 'นก']
    """
    warn_deprecation(
        "pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok", "5.2"
    )
    return expand_maiyamok(sent)
0 commit comments