Skip to content

Commit 0264e67

Browse files
Copilot and wannaphong authored
Fix royin romanization for Thai consonant clusters and syllable boundaries (#1172)
* Initial plan
* Fix royin romanization engine for consonant clusters and syllable boundaries
* Add previously failing test cases to BASIC_TESTS - all now pass
* Enable more consistency tests that now pass
* Refactor code based on review feedback: add comments and extract helper function
* Final code quality improvements: extract vowel constant and add romanization in comments
* Update test expectation for lookup fallback after royin improvements

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
1 parent de6be60 commit 0264e67

File tree

2 files changed

+144
-26
lines changed

2 files changed

+144
-26
lines changed

pythainlp/transliterate/royin.py

Lines changed: 127 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
from pythainlp import thai_consonants, word_tokenize
1616

17+
# Romanized vowels for checking
18+
_ROMANIZED_VOWELS = "aeiou"
19+
1720
# vowel
1821
_vowel_patterns = """เ*ียว,\\1iao
1922
แ*็ว,\\1aeo
@@ -151,46 +154,115 @@ def _replace_vowels(word: str) -> str:
151154
def _replace_consonants(word: str, consonants: str) -> str:
    """Replace the Thai consonants in ``word`` with their romanized forms.

    ``word`` is scanned left to right. Characters that are not keys of
    ``_CONSONANTS`` are copied through unchanged and mark that the current
    syllable has a vowel. Every Thai consonant is replaced by either its
    initial form (``_CONSONANTS[c][0]``) or its final form
    (``_CONSONANTS[c][1]``), depending on where it sits in the syllable:

    - a leading HO HIP (ห) is dropped unless it is the only consonant;
    - a double RO RUA (รร) becomes ``"an"`` at the end of the word and
      ``"a"`` elsewhere;
    - before any vowel has been seen, the first consonant is an initial;
      a following ร/ล/ว that is not the last character joins it as a
      cluster, any other non-final consonant starts a new syllable after
      an implicit ``"a"``, and a consonant in last position becomes a
      final preceded by an implicit ``"o"``;
    - after a vowel, a consonant directly followed by a non-consonant
      starts a new syllable (initial form); otherwise it is a final.

    :param word: text to process; presumably already vowel-substituted by
        the caller (the call site is not visible here)
    :param consonants: the Thai consonants of ``word`` in order of
        appearance, indexed in step with the scan (annotated ``str``; any
        indexable sequence of single consonants works)
    :return: ``word`` with its Thai consonants romanized, or ``word``
        unchanged when ``consonants`` is empty
    """
    _HO_HIP = "\u0e2b"  # ห
    _RO_RUA = "\u0e23"  # ร
    _LO_LING = "\u0e25"  # ล
    _WO_WAEN = "\u0e27"  # ว
    _DOUBLE_RO_RUA = _RO_RUA + _RO_RUA

    # Consonants that can be second in a cluster
    _CLUSTER_SECOND = {_RO_RUA, _LO_LING, _WO_WAEN}

    if not consonants:
        return word

    skip = False  # True when the next character was already consumed (second ร of รร)
    mod_chars = []
    j = 0  # j is the index of consonants
    vowel_seen = False  # Track if we've seen a vowel (non-consonant character)
    word_length = len(word)

    for i, char in enumerate(word):
        if skip:
            skip = False
            j += 1
        elif char not in _CONSONANTS:  # char is not a Thai consonant.
            vowel_seen = True
            mod_chars.append(char)
        elif not mod_chars and char == _HO_HIP and len(consonants) != 1:
            # Skip HO HIP except that HO HIP is the only one consonant
            j += 1
        elif word[i:] == _DOUBLE_RO_RUA:  # Double RO RUA is in end of word
            skip = True
            mod_chars.append("a")
            mod_chars.append("n")
            vowel_seen = True  # 'a' acts as a vowel
            j += 1
        elif word[i : i + 2] == _DOUBLE_RO_RUA:
            skip = True
            mod_chars.append("a")
            vowel_seen = True  # 'a' acts as a vowel
            j += 1
        elif not vowel_seen:  # Building initial consonant cluster
            # mod_chars holds romanized output, so any non-empty entry that
            # is not a romanized vowel means an initial was already emitted.
            has_initial = any(
                c and c not in _ROMANIZED_VOWELS for c in mod_chars
            )
            is_last_char = i + 1 >= word_length

            if not has_initial:
                # First consonant in the cluster
                initial = _CONSONANTS[consonants[j]][0]
                if initial:  # Only append if not empty (e.g., อ has empty initial)
                    mod_chars.append(initial)
            elif is_last_char:
                # Final consonant with no written vowel: add implicit 'o'
                mod_chars.append("o")
                mod_chars.append(_CONSONANTS[consonants[j]][1])
                vowel_seen = True
            elif char in _CLUSTER_SECOND:
                # ร/r, ล/l or ว/w after the first consonant and not in last
                # position is part of the initial cluster
                # (e.g., กรม/krom: ก/k+ร/r are cluster, ม/m is final)
                mod_chars.append(_CONSONANTS[consonants[j]][0])
            else:
                # Not a cluster consonant, and there are more characters:
                # this likely starts a new syllable, so close the previous
                # one with an implicit 'a'. vowel_seen stays False because
                # the new syllable has no vowel yet.
                mod_chars.append("a")
                initial = _CONSONANTS[consonants[j]][0]
                if initial:  # Only append if not empty
                    mod_chars.append(initial)
            j += 1
        else:  # After vowel - could be final consonant or start of new syllable
            has_vowel_next = (
                i + 1 < word_length and word[i + 1] not in _CONSONANTS
            )
            if has_vowel_next:
                # Consonant followed by vowel - start of new syllable
                mod_chars.append(_CONSONANTS[consonants[j]][0])
                vowel_seen = False  # Reset for new syllable
            else:
                # No vowel follows - this is a final consonant
                mod_chars.append(_CONSONANTS[consonants[j]][1])
            j += 1
    return "".join(mod_chars)
190258

191259

192260
# support function for romanize()
193261
def _romanize(word: str) -> str:
262+
# Special case: single ห character should be empty (silent)
263+
if word == 'ห':
264+
return ''
265+
194266
word = _replace_vowels(_normalize(word))
195267
consonants = _RE_CONSONANT.findall(word)
196268

@@ -204,6 +276,39 @@ def _romanize(word: str) -> str:
204276
return word
205277

206278

279+
def _should_add_syllable_separator(prev_word: str, curr_word: str, prev_romanized: str) -> bool:
    """
    Determine if 'a' should be added between two romanized syllables.

    This applies when all of the following hold:
    - Previous word has an explicit vowel, i.e. its normalized,
      vowel-substituted form contains at least one character that is not
      a Thai consonant
    - Previous romanized output does not end with a romanized vowel
    - Current word is exactly 2 Thai consonants with no vowels (e.g., 'กร')

    :param prev_word: The previous Thai word/token
    :param curr_word: The current Thai word/token
    :param prev_romanized: The romanized form of the previous word
    :return: True if 'a' should be added before the current word
    """
    if not prev_romanized or len(curr_word) < 2:
        return False

    # Check if previous word has explicit vowel.
    # Consonants are counted on the same processed string whose length is
    # measured (as _romanize does), so characters dropped or rewritten by
    # _normalize/_replace_vowels cannot skew the comparison.
    prev_after_vowels = _replace_vowels(_normalize(prev_word))
    prev_consonants = _RE_CONSONANT.findall(prev_after_vowels)
    has_explicit_vowel_prev = len(prev_after_vowels) > len(prev_consonants)

    # Check if current word is 2 Thai consonants with no vowel
    consonants_in_word = _RE_CONSONANT.findall(curr_word)
    vowels_in_word = len(curr_word) - len(consonants_in_word)

    # Add 'a' if conditions are met; prev_romanized is non-empty here, so
    # indexing its last character is safe.
    return (
        has_explicit_vowel_prev
        and len(consonants_in_word) == 2
        and vowels_in_word == 0
        and prev_romanized[-1] not in _ROMANIZED_VOWELS
    )
311+
207312
def romanize(text: str) -> str:
208313
"""Render Thai words in Latin alphabet, using RTGS
209314
@@ -216,6 +321,18 @@ def romanize(text: str) -> str:
216321
:rtype: str
217322
"""
218323
words = word_tokenize(text)
219-
romanized_words = [_romanize(word) for word in words]
220-
324+
romanized_words = []
325+
326+
for i, word in enumerate(words):
327+
romanized = _romanize(word)
328+
329+
# Check if we need to add syllable separator 'a'
330+
if i > 0 and romanized:
331+
prev_word = words[i-1]
332+
prev_romanized = romanized_words[-1] if romanized_words else ''
333+
if _should_add_syllable_separator(prev_word, word, prev_romanized):
334+
romanized = 'a' + romanized
335+
336+
romanized_words.append(romanized)
337+
221338
return "".join(romanized_words)

tests/core/test_transliterate.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,31 +23,31 @@
2323
"กร": "kon",
2424
"กรร": "kan",
2525
"กรรม": "kam",
26-
# "กรม": "krom", # failed
26+
"กรม": "krom",
2727
"ฝ้าย": "fai",
2828
"นพพร": "nopphon",
2929
"อัก": "ak",
30-
# "ทีปกร": "thipakon", # failed
31-
# "ธรรพ์": "than", # failed
32-
# "ธรรม": "tham", # failed
33-
# "มหา": "maha", # failed
34-
# "หยาก": "yak", # failed
35-
# "อยาก": "yak", # failed
36-
# "ยมก": "yamok", # failed
37-
# "กลัว": "klua", # failed
38-
# "บ้านไร่": "banrai", # failed
39-
# "ชารินทร์": "charin", # failed
30+
"ทีปกร": "thipakon",
31+
"ธรรพ์": "than",
32+
"ธรรม": "tham",
33+
"มหา": "maha",
34+
"หยาก": "yak",
35+
"อยาก": "yak",
36+
"ยมก": "yamok",
37+
"กลัว": "klua",
38+
"บ้านไร่": "banrai",
39+
"ชารินทร์": "charin",
4040
}
4141

4242
# these are set of two-syllable words,
4343
# to test if the transliteration/romanization is consistent, say
4444
# romanize(1+2) = romanize(1) + romanize(2)
4545
CONSISTENCY_TESTS = [
46-
# ("กระจก", "กระ", "จก"), # failed
47-
# ("ระเบิด", "ระ", "เบิด"), # failed
48-
# ("หยากไย่", "หยาก", "ไย่"), # failed
46+
# ("กระจก", "กระ", "จก"), # failed - tokenization issue
47+
("ระเบิด", "ระ", "เบิด"),
48+
("หยากไย่", "หยาก", "ไย่"),
4949
("ตากใบ", "ตาก", "ใบ"),
50-
# ("จัดสรร", "จัด", "สรร"), # failed
50+
# ("จัดสรร", "จัด", "สรร"), # failed - tokenization issue
5151
]
5252

5353

@@ -83,8 +83,9 @@ def test_romanize_lookup(self):
8383
"caramel cappuccino",
8484
)
8585
## found individually, but needs tokenization
86+
# Updated expectation after royin improvements for syllable boundary detection
8687
self.assertEqual(
87-
romanize("คาราเมลคาปูชิโน่", engine="lookup"), "khanamenkhaputino"
88+
romanize("คาราเมลคาปูชิโน่", engine="lookup"), "kharamenkhapuchino"
8889
)
8990
# not found in v1.4
9091
self.assertEqual(romanize("ภาพยนตร์", engine="lookup"), "phapn")

0 commit comments

Comments
 (0)