Skip to content

Commit b3e2d6e

Browse files
committed
Fix maiyamok()
1 parent 15ec737 commit b3e2d6e

File tree

2 files changed

+81
-36
lines changed

2 files changed

+81
-36
lines changed

pythainlp/util/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@
1919
"display_thai_char",
2020
"emoji_to_thai",
2121
"eng_to_thai",
22+
"expand_maiyamok",
2223
"find_keyword",
2324
"ipa_to_rtgs",
2425
"is_native_thai",
2526
"isthai",
2627
"isthaichar",
28+
"maiyamok",
2729
"nectec_to_ipa",
2830
"normalize",
2931
"now_reign_year",
@@ -85,8 +87,9 @@
8587
from pythainlp.util.emojiconv import emoji_to_thai
8688
from pythainlp.util.keywords import find_keyword, rank
8789
from pythainlp.util.normalize import (
88-
normalize,
90+
expand_maiyamok,
8991
maiyamok,
92+
normalize,
9093
remove_dangling,
9194
remove_dup_spaces,
9295
remove_repeat_vowels,

pythainlp/util/normalize.py

Lines changed: 77 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
Text normalization
66
"""
7+
78
import re
89
from typing import List, Union
910

@@ -76,7 +77,7 @@ def remove_dangling(text: str) -> str:
7677
7778
from pythainlp.util import remove_dangling
7879
79-
remove_dangling('๊ก')
80+
remove_dangling("๊ก")
8081
# output: 'ก'
8182
"""
8283
return _RE_REMOVE_DANGLINGS.sub("", text)
@@ -98,7 +99,7 @@ def remove_dup_spaces(text: str) -> str:
9899
99100
from pythainlp.util import remove_dup_spaces
100101
101-
remove_dup_spaces('ก ข ค')
102+
remove_dup_spaces("ก ข ค")
102103
# output: 'ก ข ค'
103104
"""
104105
while " " in text:
@@ -132,7 +133,7 @@ def remove_tonemark(text: str) -> str:
132133
133134
from pythainlp.util import remove_tonemark
134135
135-
remove_tonemark('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
136+
remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด")
136137
# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
137138
"""
138139
for ch in tonemarks:
@@ -235,10 +236,10 @@ def normalize(text: str) -> str:
235236
236237
from pythainlp.util import normalize
237238
238-
normalize('เเปลก') # starts with two Sara E
239+
normalize("เเปลก") # starts with two Sara E
239240
# output: แปลก
240241
241-
normalize('นานาาา')
242+
normalize("นานาาา")
242243
# output: นานา
243244
"""
244245
text = remove_zw(text)
@@ -249,46 +250,87 @@ def normalize(text: str) -> str:
249250
return text
250251

251252

252-
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
253+
def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
253254
"""
254-
Thai MaiYaMok
255+
Expand Maiyamok.
255256
256-
MaiYaMok (ๆ) is the mark of duplicate word in Thai language.
257-
This function is preprocessing MaiYaMok in Thai sentence.
257+
Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
258+
repetition. This function preprocesses Thai text by replacing
259+
Maiyamok with the word being repeated.
258260
259-
:param Union[str, List[str]] sent: input sentence (list or str)
261+
:param Union[str, List[str]] sent: sentence (list or string)
260262
:return: list of words
261263
:rtype: List[str]
262264
263265
:Example:
264266
::
267+
from pythainlp.util import expand_maiyamok
265268
266-
from pythainlp.util import maiyamok
267-
268-
maiyamok("เด็กๆชอบไปโรงเรียน")
269-
# output: ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
270-
271-
maiyamok(["ทำไม","คน","ดี"," ","ๆ","ๆ"," ","ถึง","ทำ","ไม่ได้"])
272-
# output: ['ทำไม', 'คน', 'ดี', 'ดี', 'ดี', ' ', 'ถึง', 'ทำ', 'ไม่ได้']
269+
expand_maiyamok("คนๆนก")
270+
# output: ['คน', 'คน', 'นก']
273271
"""
274272
if isinstance(sent, str):
275273
sent = word_tokenize(sent)
276-
_list_word = []
277-
i = 0
278-
for j, text in enumerate(sent):
279-
if text.isspace() and "ๆ" in sent[j + 1]:
280-
continue
281-
if " ๆ" in text:
282-
text = text.replace(" ๆ", "ๆ")
283-
if "ๆ" == text:
284-
text = _list_word[i - 1]
285-
elif "ๆ" in text:
286-
count = text.count("ๆ")
287-
text = _list_word[i - 1]
288-
for _ in range(count):
289-
_list_word.append(text)
290-
i += 1
274+
275+
# Split off Maiyamok characters that are attached to other text, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน"
276+
temp_toks: list[str] = []
277+
for _, token in enumerate(sent):
278+
toks = re.split(r"(ๆ)", token)
279+
toks = [tok for tok in toks if tok] # remove empty string ("")
280+
temp_toks.extend(toks)
281+
sent = temp_toks
282+
283+
output_toks: list[str] = []
284+
285+
yamok = "ๆ"
286+
yamok_count = 0
287+
len_sent = len(sent)
288+
for i in range(len_sent - 1, -1, -1): # do it backward
289+
if yamok_count == 0 or (i + 1 >= len_sent):
290+
if sent[i] == yamok:
291+
yamok_count = yamok_count + 1
292+
else:
293+
output_toks.append(sent[i])
291294
continue
292-
_list_word.append(text)
293-
i += 1
294-
return _list_word
295+
296+
if sent[i] == yamok:
297+
yamok_count = yamok_count + 1
298+
else:
299+
if sent[i].isspace():
300+
if yamok_count > 0: # remove space before yamok
301+
continue
302+
else: # with preprocessing above, this should not happen
303+
output_toks.append(sent[i])
304+
else:
305+
output_toks.extend([sent[i]] * (yamok_count + 1))
306+
yamok_count = 0
307+
308+
return output_toks[::-1]
309+
310+
311+
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
312+
"""
313+
Expand Maiyamok.
314+
315+
Deprecated. Use expand_maiyamok() instead.
316+
317+
Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
318+
repetition. This function preprocesses Thai text by replacing
319+
Maiyamok with the word being repeated.
320+
321+
:param Union[str, List[str]] sent: sentence (list or string)
322+
:return: list of words
323+
:rtype: List[str]
324+
325+
:Example:
326+
::
327+
328+
from pythainlp.util import expand_maiyamok
329+
330+
expand_maiyamok("คนๆนก")
331+
# output: ['คน', 'คน', 'นก']
332+
"""
333+
warn_deprecation(
334+
"pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok", "5.2"
335+
)
336+
return expand_maiyamok(sent)

0 commit comments

Comments
 (0)