Skip to content

Commit 41ed2c8

Browse files
Copilot and wannaphong authored
Fix check_marttra() Thai final consonant classification (#1173)
* Initial plan

* Fix check_marttra function to handle Thai final consonants correctly
  - Fixed handle_karun_sound_silence to correctly remove only silent consonants marked by ์
  - Fixed handling of ำ at the end to return กม (not กา)
  - Fixed handling of consonant clusters with ร (e.g., กร, ขร, คร) in compound words
  - Fixed handling of words with ไ/ใ vowels to distinguish true final consonants ย/ล from vowel components
  - Fixed typo in consonant list (", ณ" -> "ณ")
  - Uncommented all previously failing test cases

  Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Address code review feedback
  - Remove duplicate ล from กน consonant list (already classified as เกย on line 282)
  - Use imported thai_consonants constant instead of duplicating the string
  - Improves code maintainability and eliminates redundancy

  Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Optimize helper function by moving to class method
  - Moved has_true_final_yl from inline function to class method _has_true_final_yl
  - Avoids recreating function object on every check_marttra call
  - Improves performance and code organization

  Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
1 parent 0264e67 commit 41ed2c8

File tree

2 files changed

+60
-26
lines changed

2 files changed

+60
-26
lines changed

pythainlp/khavee/core.py

Lines changed: 50 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66

77
from typing import List, Union
88

9+
from pythainlp import thai_consonants
910
from pythainlp.tokenize import subword_tokenize
1011
from pythainlp.util import remove_tonemark, sound_syllable
1112

@@ -16,6 +17,22 @@ def __init__(self):
1617
KhaveeVerifier: Thai Poetry verifier
1718
"""
1819

20+
def _has_true_final_yl(self, word: str) -> bool:
    """
    Tell whether a trailing ย or ล is a real final consonant.

    Used for words written with ไ/ใ, where a trailing ย/ล may only be
    part of the vowel sound rather than a separate final consonant.

    :param str word: Thai word
    :return: True when the word ends with ย or ล and contains at
             least two Thai consonants in total
    :rtype: bool
    """
    # Empty words and words not ending in ย/ล can never qualify.
    if not word.endswith(("ย", "ล")):
        return False
    # A lone consonant cannot be both the initial and the final, so a
    # true final needs at least two consonants in the word.
    consonants_seen = [ch for ch in word if ch in thai_consonants]
    return len(consonants_seen) >= 2
35+
1936
def check_sara(self, word: str) -> str:
2037
"""
2138
Check the vowels in the Thai word.
@@ -223,15 +240,35 @@ def check_marttra(self, word: str) -> str:
223240
print(kv.check_marttra("สาว"))
224241
# output: 'เกอว'
225242
"""
226-
if word[-1] == "ร" and word[-2] in ["ต", "ท"]:
227-
word = word[:-1]
243+
# Handle consonant clusters ending with ร
244+
# ตร, ทร → remove ร (treat as final ต/ท sound)
245+
# กร, ขร, คร, ฆร in compound words → remove ร (treat as final ก/ข/ค/ฆ sound)
246+
# But single syllable words like "กร" should keep ร
247+
if len(word) >= 3 and word[-1] == "ร":
248+
if word[-2] in ["ต", "ท"]:
249+
word = word[:-1]
250+
elif word[-2] in ["ก", "ข", "ค", "ฆ"]:
251+
word = word[:-1]
252+
228253
word = self.handle_karun_sound_silence(word)
229254
word = remove_tonemark(word)
255+
256+
# Check for ำ at the end (represents "am" sound, ends with m)
257+
if word[-1] == "ำ":
258+
return "กม"
259+
260+
# Check for vowels and special patterns that indicate open syllables (กา)
261+
# For words with ไ/ใ, check if ย/ล is a true final or just part of vowel
262+
if "ไ" in word or "ใ" in word:
263+
if word[-1] not in ["ย", "ล"]:
264+
return "กา"
265+
elif not self._has_true_final_yl(word):
266+
# ย/ล is part of the vowel sound, not a true final
267+
return "กา"
268+
# else: ย/ล is a true final, continue to consonant classification below
269+
230270
if (
231-
"ำ" in word
232-
or ("ํ" in word and "า" in word)
233-
or "ไ" in word
234-
or "ใ" in word
271+
("ํ" in word and "า" in word)
235272
):
236273
return "กา"
237274
elif (
@@ -245,10 +282,9 @@ def check_marttra(self, word: str) -> str:
245282
elif word[-1] in ["ม"]:
246283
return "กม"
247284
elif word[-1] in ["ย"]:
248-
if "ั" in word:
249-
return "กา"
250-
else:
251-
return "เกย"
285+
return "เกย"
286+
elif word[-1] in ["ล"]:
287+
return "เกย"
252288
elif word[-1] in ["ว"]:
253289
return "เกอว"
254290
elif word[-1] in ["ก", "ข", "ค", "ฆ"]:
@@ -272,7 +308,7 @@ def check_marttra(self, word: str) -> str:
272308
"ส",
273309
]:
274310
return "กด"
275-
elif word[-1] in ["ญ", ", ณ", "น", "ร", "ล", "ฬ"]:
311+
elif word[-1] in ["ญ", "ณ", "น", "ร", "ฬ"]:
276312
return "กน"
277313
elif word[-1] in ["บ", "ป", "พ", "ฟ", "ภ"]:
278314
return "กบ"
@@ -649,9 +685,7 @@ def handle_karun_sound_silence(self, word: str) -> str:
649685
sound_silenced = word.endswith("์")
650686
if not sound_silenced:
651687
return word
652-
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"
653-
locate_silenced = word.rfind("์") - 1
654-
can_silence_two = word[locate_silenced - 2] in thai_consonants
655-
cut_off = 2 if can_silence_two else 1
656-
word = word[: locate_silenced + 1 - cut_off]
688+
# Remove ์ and the silent consonant before it
689+
# การันต์ (์) marks the consonant immediately before it as silent
690+
word = word[:-2]
657691
return word

tests/core/test_khavee.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -28,18 +28,18 @@ def test_check_marttra(self):
2828
self.assertEqual(kv.check_marttra("จาม"), "กม")
2929
self.assertEqual(kv.check_marttra("ยิ้ม"), "กม")
3030
self.assertEqual(kv.check_marttra("เกม"), "กม")
31-
# self.assertEqual(kv.check_marttra("ขำ"), "กม")
32-
# self.assertEqual(kv.check_marttra("รมย์"), "กม")
31+
self.assertEqual(kv.check_marttra("ขำ"), "กม")
32+
self.assertEqual(kv.check_marttra("รมย์"), "กม")
3333

3434
self.assertEqual(kv.check_marttra("สวย"), "เกย")
3535
self.assertEqual(kv.check_marttra("โปรย"), "เกย")
3636
self.assertEqual(kv.check_marttra("เนย"), "เกย")
3737
self.assertEqual(kv.check_marttra("คอย"), "เกย")
3838
self.assertEqual(kv.check_marttra("ง่าย"), "เกย")
39-
# self.assertEqual(kv.check_marttra("ทัย"), "เกย")
40-
# self.assertEqual(kv.check_marttra("ไทย"), "เกย")
41-
# self.assertEqual(kv.check_marttra("ไกล"), "เกย")
42-
# self.assertEqual(kv.check_marttra("ใกล้"), "เกย")
39+
self.assertEqual(kv.check_marttra("ทัย"), "เกย")
40+
self.assertEqual(kv.check_marttra("ไทย"), "เกย")
41+
self.assertEqual(kv.check_marttra("ไกล"), "เกย")
42+
self.assertEqual(kv.check_marttra("ใกล้"), "เกย")
4343

4444
self.assertEqual(kv.check_marttra("สาว"), "เกอว")
4545
self.assertEqual(kv.check_marttra("นิ้ว"), "เกอว")
@@ -51,15 +51,15 @@ def test_check_marttra(self):
5151
self.assertEqual(kv.check_marttra("โรค"), "กก")
5252
self.assertEqual(kv.check_marttra("ลาก"), "กก")
5353
self.assertEqual(kv.check_marttra("นัข"), "กก")
54-
# self.assertEqual(kv.check_marttra("จักร"), "กก")
54+
self.assertEqual(kv.check_marttra("จักร"), "กก")
5555

5656
self.assertEqual(kv.check_marttra("จด"), "กด")
5757
self.assertEqual(kv.check_marttra("ตรวจ"), "กด")
5858
self.assertEqual(kv.check_marttra("เสริฐ"), "กด")
5959
self.assertEqual(kv.check_marttra("บุตร"), "กด")
6060
self.assertEqual(kv.check_marttra("ตรุษ"), "กด")
6161
self.assertEqual(kv.check_marttra("มืด"), "กด")
62-
# self.assertEqual(kv.check_marttra("โยชน์"), "กด")
62+
self.assertEqual(kv.check_marttra("โยชน์"), "กด")
6363

6464
self.assertEqual(kv.check_marttra("มึน"), "กน")
6565
self.assertEqual(kv.check_marttra("ร้าน"), "กน")
@@ -70,8 +70,8 @@ def test_check_marttra(self):
7070
self.assertEqual(kv.check_marttra("บรร"), "กน")
7171
self.assertEqual(kv.check_marttra("กร"), "กน")
7272
self.assertEqual(kv.check_marttra("เณร"), "กน")
73-
# self.assertEqual(kv.check_marttra("ยนต์"), "กน")
74-
# self.assertEqual(kv.check_marttra("กรรณ"), "กน")
73+
self.assertEqual(kv.check_marttra("ยนต์"), "กน")
74+
self.assertEqual(kv.check_marttra("กรรณ"), "กน")
7575

7676
self.assertEqual(kv.check_marttra("ชอบ"), "กบ")
7777
self.assertEqual(kv.check_marttra("ภาพ"), "กบ")

0 commit comments

Comments
 (0)