Fix royin romanization for Thai consonant clusters and syllable boundaries (#1172)

Copilot · wannaphong · web-flow · commit 0264e676cd16 · 2026-01-06T20:53:42.000+07:00
* Initial plan

* Fix royin romanization engine for consonant clusters and syllable boundaries

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Add previously failing test cases to BASIC_TESTS - all now pass

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Enable more consistency tests that now pass

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Refactor code based on review feedback: add comments and extract helper function

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Final code quality improvements: extract vowel constant and add romanization in comments

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

* Update test expectation for lookup fallback after royin improvements

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py
@@ -14,6 +14,9 @@
 
 from pythainlp import thai_consonants, word_tokenize
 
+# Romanized vowels for checking
+_ROMANIZED_VOWELS = "aeiou"
+
 # vowel
 _vowel_patterns = """เ*ียว,\\1iao
 แ*็ว,\\1aeo
@@ -151,46 +154,115 @@ def _replace_vowels(word: str) -> str:
 def _replace_consonants(word: str, consonants: str) -> str:
     _HO_HIP = "\u0e2b"  # ห
     _RO_RUA = "\u0e23"  # ร
+    _LO_LING = "\u0e25"  # ล
+    _WO_WAEN = "\u0e27"  # ว
     _DOUBLE_RO_RUA = _RO_RUA + _RO_RUA
+    
+    # Consonants that can be second in a cluster
+    _CLUSTER_SECOND = {_RO_RUA, _LO_LING, _WO_WAEN}
 
     if not consonants:
         return word
 
     skip = False
     mod_chars = []
     j = 0  # j is the index of consonants
+    vowel_seen = False  # Track if we've seen a vowel (non-consonant character)
+    
     for i in range(len(word)):
         if skip:
             skip = False
             j += 1
         elif word[i] not in _CONSONANTS:  # word[i] is not a Thai consonant.
+            vowel_seen = True
             mod_chars.append(word[i])
         elif (
             len(mod_chars) == 0 and word[i] == _HO_HIP and len(consonants) != 1
         ):  # Skip HO HIP except that HO HIP is the only one consonant
             j += 1
-        elif (
-            len(mod_chars) == 0
-        ):  # The first character must be an initial consonant.
-            mod_chars.append(_CONSONANTS[consonants[j]][0])
-            j += 1
         elif word[i:] == _DOUBLE_RO_RUA:  # Double RO RUA is in end of word
             skip = True
             mod_chars.append("a")
             mod_chars.append("n")
+            vowel_seen = True  # 'a' acts as a vowel
             j += 1
         elif word[i : i + 2] == _DOUBLE_RO_RUA:
             skip = True
             mod_chars.append("a")
+            vowel_seen = True  # 'a' acts as a vowel
             j += 1
-        else:  # Assume that the rest are final consonants.
-            mod_chars.append(_CONSONANTS[consonants[j]][1])
-            j += 1
+        elif not vowel_seen:  # Building initial consonant cluster
+            # Check if we've added any actual initial consonants (non-empty romanized characters)
+            # We check for non-vowel characters since mod_chars contains romanized output
+            has_initial = any(c and c not in _ROMANIZED_VOWELS for c in mod_chars)
+            
+            if not has_initial:
+                # First consonant in the cluster
+                initial = _CONSONANTS[consonants[j]][0]
+                if initial:  # Only append if not empty (e.g., อ has empty initial)
+                    mod_chars.append(initial)
+                j += 1
+            else:
+                # Check if this consonant can be part of a cluster
+                is_cluster_consonant = word[i] in _CLUSTER_SECOND
+                is_last_char = (i + 1 >= len(word))
+                has_vowel_next = not is_last_char and word[i+1] not in _CONSONANTS
+                
+                # Cluster consonants (ร/r, ล/l, ว/w) are part of the initial cluster
+                # whenever they are not the last character; "followed by a vowel" already
+                # implies that (e.g., กรม/krom: ก/k+ร/r are cluster, ม/m is final)
+                if is_cluster_consonant and (has_vowel_next or not is_last_char):
+                    # This is part of initial cluster (ร/r, ล/l, or ว/w after first consonant)
+                    mod_chars.append(_CONSONANTS[consonants[j]][0])
+                    j += 1
+                elif not is_cluster_consonant and not is_last_char:
+                    # Not a cluster consonant, and there are more characters
+                    # This likely starts a new syllable, so add implicit 'a' to previous syllable
+                    mod_chars.append("a")
+                    vowel_seen = True
+                    # Now process this consonant as start of new syllable
+                    initial = _CONSONANTS[consonants[j]][0]
+                    if initial:  # Only append if not empty
+                        mod_chars.append(initial)
+                    vowel_seen = False  # Reset for new syllable
+                    j += 1
+                elif has_vowel_next:
+                    # Not a cluster consonant, but vowel follows - still initial (unreachable: a following vowel implies not is_last_char, handled above)
+                    mod_chars.append(_CONSONANTS[consonants[j]][0])
+                    j += 1
+                elif is_last_char:
+                    # This is a final consonant with no vowel, need to add 'o'
+                    mod_chars.append("o")
+                    mod_chars.append(_CONSONANTS[consonants[j]][1])
+                    vowel_seen = True
+                    j += 1
+                else:
+                    # There's another consonant after this one (unreachable: the branches
+                    # above cover every case); add implicit 'o' and treat this as final
+                    mod_chars.append("o")
+                    mod_chars.append(_CONSONANTS[consonants[j]][1])
+                    vowel_seen = True
+                    j += 1
+        else:  # After vowel - could be final consonant or start of new syllable
+            has_vowel_next = (i + 1 < len(word) and word[i+1] not in _CONSONANTS)
+            if has_vowel_next:
+                # Consonant followed by vowel - start of new syllable
+                mod_chars.append(_CONSONANTS[consonants[j]][0])
+                vowel_seen = False  # Reset for new syllable
+                j += 1
+            else:
+                # No vowel follows - this is a final consonant
+                mod_chars.append(_CONSONANTS[consonants[j]][1])
+                j += 1
     return "".join(mod_chars)
 
 
 # support function for romanize()
 def _romanize(word: str) -> str:
+    # Special case: single ห character should be empty (silent)
+    if word == 'ห':
+        return ''
+    
     word = _replace_vowels(_normalize(word))
     consonants = _RE_CONSONANT.findall(word)
 
@@ -204,6 +276,39 @@ def _romanize(word: str) -> str:
     return word
 
 
+def _should_add_syllable_separator(prev_word: str, curr_word: str, prev_romanized: str) -> bool:
+    """
+    Determine if 'a' should be added between two romanized syllables.
+    
+    This applies when:
+    - Previous word has explicit vowel and ends with consonant
+    - Current word is a 2-consonant cluster with no vowels (e.g., 'กร')
+    
+    :param prev_word: The previous Thai word/token
+    :param curr_word: The current Thai word/token
+    :param prev_romanized: The romanized form of the previous word
+    :return: True if 'a' should be added before the current word
+    """
+    if not prev_romanized or len(curr_word) < 2:
+        return False
+    
+    # Check if previous word has explicit vowel
+    prev_normalized = _normalize(prev_word)
+    prev_after_vowels = _replace_vowels(prev_normalized)
+    prev_consonants = _RE_CONSONANT.findall(prev_word)
+    has_explicit_vowel_prev = len(prev_after_vowels) > len(prev_consonants)
+    
+    # Check if current word is 2 Thai consonants with no vowel
+    consonants_in_word = _RE_CONSONANT.findall(curr_word)
+    vowels_in_word = len(curr_word) - len(consonants_in_word)
+    
+    # Add 'a' if conditions are met
+    return (has_explicit_vowel_prev and 
+            len(consonants_in_word) == 2 and 
+            vowels_in_word == 0 and
+            prev_romanized[-1] not in _ROMANIZED_VOWELS)
+
+
 def romanize(text: str) -> str:
     """Render Thai words in Latin alphabet, using RTGS
 
@@ -216,6 +321,18 @@ def romanize(text: str) -> str:
     :rtype: str
     """
     words = word_tokenize(text)
-    romanized_words = [_romanize(word) for word in words]
-
+    romanized_words = []
+    
+    for i, word in enumerate(words):
+        romanized = _romanize(word)
+        
+        # Check if we need to add syllable separator 'a'
+        if i > 0 and romanized:
+            prev_word = words[i-1]
+            prev_romanized = romanized_words[-1] if romanized_words else ''
+            if _should_add_syllable_separator(prev_word, word, prev_romanized):
+                romanized = 'a' + romanized
+        
+        romanized_words.append(romanized)
+    
     return "".join(romanized_words)
diff --git a/tests/core/test_transliterate.py b/tests/core/test_transliterate.py
@@ -23,31 +23,31 @@
     "กร": "kon",
     "กรร": "kan",
     "กรรม": "kam",
-    # "กรม": "krom",  # failed
+    "กรม": "krom",
     "ฝ้าย": "fai",
     "นพพร": "nopphon",
     "อัก": "ak",
-    # "ทีปกร": "thipakon",  # failed
-    # "ธรรพ์": "than",  # failed
-    # "ธรรม": "tham",  # failed
-    # "มหา": "maha",  # failed
-    # "หยาก": "yak",  # failed
-    # "อยาก": "yak",  # failed
-    # "ยมก": "yamok",  # failed
-    # "กลัว": "klua",  # failed
-    # "บ้านไร่": "banrai",  # failed
-    # "ชารินทร์": "charin",  # failed
+    "ทีปกร": "thipakon",
+    "ธรรพ์": "than",
+    "ธรรม": "tham",
+    "มหา": "maha",
+    "หยาก": "yak",
+    "อยาก": "yak",
+    "ยมก": "yamok",
+    "กลัว": "klua",
+    "บ้านไร่": "banrai",
+    "ชารินทร์": "charin",
 }
 
 # these are set of two-syllable words,
 # to test if the transliteration/romanization is consistent, say
 # romanize(1+2) = romanize(1) + romanize(2)
 CONSISTENCY_TESTS = [
-    # ("กระจก", "กระ", "จก"),  # failed
-    # ("ระเบิด", "ระ", "เบิด"),  # failed
-    # ("หยากไย่", "หยาก", "ไย่"),  # failed
+    # ("กระจก", "กระ", "จก"),  # failed - tokenization issue
+    ("ระเบิด", "ระ", "เบิด"),
+    ("หยากไย่", "หยาก", "ไย่"),
     ("ตากใบ", "ตาก", "ใบ"),
-    # ("จัดสรร", "จัด", "สรร"),  # failed
+    # ("จัดสรร", "จัด", "สรร"),  # failed - tokenization issue
 ]
 
 
@@ -83,8 +83,9 @@ def test_romanize_lookup(self):
             "caramel cappuccino",
         )
         ## found individually, but needs tokenization
+        # Updated expectation after royin improvements for syllable boundary detection
         self.assertEqual(
-            romanize("คาราเมลคาปูชิโน่", engine="lookup"), "khanamenkhaputino"
+            romanize("คาราเมลคาปูชิโน่", engine="lookup"), "kharamenkhapuchino"
         )
         # not found in v1.4
         self.assertEqual(romanize("ภาพยนตร์", engine="lookup"), "phapn")