
Commit e5c6fee

Merge branch 'fix-expand-maiyamok' of https://github.com/bact/pythainlp into fix-expand-maiyamok
2 parents: 19a3474 + f9783d3

4 files changed: +25 −32 lines changed

pythainlp/tokenize/core.py

Lines changed: 1 addition & 1 deletion
@@ -538,7 +538,7 @@ def sent_tokenize(
         result = map_indices_to_words(word_indices, segments)
         return result
     else:
-        return [segments]
+        return segments


 def paragraph_tokenize(
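
For reference, the one-line change above only affects what sent_tokenize() returns from the else branch, which (as the surrounding context suggests) handles plain string input, while list input goes through map_indices_to_words(). A minimal sketch of the intended behavior after this commit; the sample sentence and its exact segmentation are illustrative only:

    from pythainlp.tokenize import sent_tokenize

    # Before this commit the else branch returned [segments], i.e. the list of
    # sentence strings wrapped in one extra list; after the fix it returns the
    # flat list directly.
    sentences = sent_tokenize("ผมกินข้าว เธอเล่นเกม")  # default engine: crfcut
    print(sentences)
    # now (illustrative):    ['ผมกินข้าว ', 'เธอเล่นเกม']
    # before (illustrative): [['ผมกินข้าว ', 'เธอเล่นเกม']]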

pythainlp/util/normalize.py

Lines changed: 4 additions & 3 deletions
@@ -255,8 +255,8 @@ def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
     Expand Maiyamok.

     Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
-    repetition. This function preprocesses Thai text by expanding Maiyamok
-
+    repetition. This function preprocesses Thai text by replacing
+    Maiyamok with a word being repeated.

     :param Union[str, List[str]] sent: input sentence (list or str)
     :return: list of words
@@ -300,7 +300,8 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
     Use expand_maiyamok() instead.

     Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
-    repetition. This function preprocesses Thai text by expanding Maiyamok
+    repetition. This function preprocesses Thai text by replacing
+    Maiyamok with a word being repeated.

     :param Union[str, List[str]] sent: input sentence (list or str)
     :return: list of words
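
To make the reworded docstring concrete, here is a small usage sketch of expand_maiyamok(). The sample sentence and its exact word list are illustrative, and the import path assumes the function is re-exported from pythainlp.util like the other normalize.py helpers:

    from pythainlp.util import expand_maiyamok

    # "เด็กๆ" contains Maiyamok (ๆ, U+0E46); instead of leaving the ๆ token in
    # the output, the function repeats the word that precedes it.
    print(expand_maiyamok("เด็กๆชอบไปโรงเรียน"))
    # e.g. ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']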

tests/test_tokenize.py

Lines changed: 0 additions & 14 deletions
@@ -217,8 +217,6 @@ def test_Tokenizer(self):
             Tokenizer(engine="catcut888")

     def test_sent_tokenize(self):
-        self.assertEqual(sent_tokenize(None), [])
-        self.assertEqual(sent_tokenize(""), [])
         self.assertEqual(
             sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"),
             ["รักน้ำ", "รักปลา", ""],
@@ -227,18 +225,6 @@ def test_sent_tokenize(self):
             sent_tokenize("รักน้ำ รักปลา ", engine="whitespace+newline"),
             ["รักน้ำ", "รักปลา"],
         )
-        self.assertEqual(
-            sent_tokenize(SENT_1),
-            SENT_1_TOKS,
-        )
-        self.assertEqual(
-            sent_tokenize(SENT_2),
-            SENT_2_TOKS,
-        )
-        self.assertEqual(
-            sent_tokenize(SENT_3),
-            SENT_3_TOKS,
-        )
         self.assertIsNotNone(
             sent_tokenize(
                 SENT_1,

tests/testx_tokenize.py

Lines changed: 20 additions & 14 deletions
@@ -89,30 +89,38 @@ def testx_subword_tokenize(self):
         )

     def testx_sent_tokenize(self):
+        # Use default engine (crfcut)
+        self.assertEqual(sent_tokenize(None), [])
+        self.assertEqual(sent_tokenize(""), [])
         self.assertEqual(
-            sent_tokenize(SENT_1, engine="crfcut"),
+            sent_tokenize(SENT_1),
             SENT_1_TOKS,
         )
         self.assertEqual(
-            sent_tokenize(SENT_2, engine="crfcut"),
+            sent_tokenize(SENT_2),
             SENT_2_TOKS,
         )
         self.assertEqual(
-            sent_tokenize(SENT_3, engine="crfcut"),
+            sent_tokenize(SENT_3),
             SENT_3_TOKS,
         )
+
         self.assertEqual(
-            sent_tokenize(SENT_1),
+            sent_tokenize(SENT_1, engine="crfcut"),
             SENT_1_TOKS,
         )
         self.assertEqual(
-            sent_tokenize(SENT_2),
+            sent_tokenize(SENT_2, engine="crfcut"),
             SENT_2_TOKS,
         )
         self.assertEqual(
-            sent_tokenize(SENT_3),
+            sent_tokenize(SENT_3, engine="crfcut"),
             SENT_3_TOKS,
         )
+        self.assertEqual(
+            sent_tokenize(SENT_4, engine="crfcut"),
+            [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
+        )
         self.assertIsNotNone(
             sent_tokenize(
                 SENT_1,
@@ -131,6 +139,7 @@ def testx_sent_tokenize(self):
                 engine="tltk",
             ),
         )
+
         self.assertIsNotNone(
             sent_tokenize(
                 SENT_1,
@@ -149,6 +158,11 @@ def testx_sent_tokenize(self):
                 engine="thaisum",
             ),
         )
+        self.assertEqual(
+            sent_tokenize(SENT_4, engine="thaisum"),
+            [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]],
+        )
+
         self.assertIsNotNone(
             sent_tokenize(
                 SENT_3,
@@ -173,14 +187,6 @@ def testx_sent_tokenize(self):
         # engine="wtp-large",
         # ),
         # )
-        self.assertEqual(
-            sent_tokenize(SENT_4, engine="crfcut"),
-            [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
-        )
-        self.assertEqual(
-            sent_tokenize(SENT_4, engine="thaisum"),
-            [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]],
-        )

     def testx_word_tokenize(self):
         self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
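
The relocated SENT_4 assertions exercise the list-input path of sent_tokenize(): for a pre-tokenized list of words the function returns a list of sentences, each itself a list of words. A minimal sketch, using a hypothetical stand-in for the SENT_4 fixture inferred from the expected value in the test (the real fixtures are defined elsewhere in the test module):

    from pythainlp.tokenize import sent_tokenize

    # Hypothetical stand-in for the SENT_4 fixture used in the tests above.
    sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]

    # With list input, crfcut returns a list of sentences, each a list of words;
    # the test above expects a single sentence that keeps the "\n" token.
    print(sent_tokenize(sent_4, engine="crfcut"))
    # expected in the test: [['ผม', 'กิน', 'ข้าว', ' ', '\n', 'เธอ', 'เล่น', 'เกม']]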
