1414
1515from pythainlp import thai_consonants , word_tokenize
1616
17+ # Romanized vowels for checking
18+ _ROMANIZED_VOWELS = "aeiou"
19+
1720# vowel
1821_vowel_patterns = """เ*ียว,\\ 1iao
1922แ*็ว,\\ 1aeo
@@ -151,46 +154,115 @@ def _replace_vowels(word: str) -> str:
def _replace_consonants(word: str, consonants: str) -> str:
    """Replace the Thai consonants in *word* with their romanized forms.

    *word* is walked one character at a time. Characters that are not keys
    of ``_CONSONANTS`` are copied through unchanged and mark that a vowel
    has been seen for the current syllable. Every consonant is looked up
    through ``consonants[j]`` and rendered with either its initial form
    (``_CONSONANTS[...][0]``) or its final form (``_CONSONANTS[...][1]``),
    depending on where it sits relative to the vowel:

    * a leading HO HIP is dropped unless it is the only consonant;
    * a double RO RUA becomes ``"an"`` at the end of the word and ``"a"``
      elsewhere;
    * before any vowel, the first consonant is an initial; a following
      RO RUA / LO LING / WO WAEN that is not the last character joins the
      initial cluster; any other non-last consonant closes the previous
      syllable with an implicit ``"a"`` and opens a new one; a consonant
      in last position is written as implicit ``"o"`` plus its final form;
    * after a vowel, a consonant directly followed by a non-consonant
      opens a new syllable (initial form), otherwise it is a final.

    :param word: text to convert
        (NOTE(review): presumably already passed through
        ``_replace_vowels`` -- confirm against the caller)
    :param consonants: the consonants of *word*, in order of appearance;
        only ``len()`` and indexing are used, so any sequence of
        single-consonant strings works despite the ``str`` annotation
    :return: *word* with consonants romanized, or *word* unchanged when
        *consonants* is empty
    """
    # Single code points: each one is compared against one character of
    # ``word``, so the literals must not carry any padding.
    _HO_HIP = "\u0e2b"  # ห
    _RO_RUA = "\u0e23"  # ร
    _LO_LING = "\u0e25"  # ล
    _WO_WAEN = "\u0e27"  # ว
    _DOUBLE_RO_RUA = _RO_RUA + _RO_RUA

    # Consonants that can be second in an initial cluster
    _CLUSTER_SECOND = {_RO_RUA, _LO_LING, _WO_WAEN}

    if not consonants:
        return word

    skip = False  # True while standing on the second half of a double RO RUA
    mod_chars = []
    j = 0  # j is the index of consonants
    vowel_seen = False  # has the current syllable produced a vowel yet?
    last_index = len(word) - 1

    for i, char in enumerate(word):
        if skip:
            # Second RO RUA of a pair: already rendered, just consume it.
            skip = False
            j += 1
        elif char not in _CONSONANTS:  # char is not a Thai consonant.
            vowel_seen = True
            mod_chars.append(char)
        elif not mod_chars and char == _HO_HIP and len(consonants) != 1:
            # Skip HO HIP except that HO HIP is the only one consonant
            j += 1
        elif word[i:] == _DOUBLE_RO_RUA:  # Double RO RUA is in end of word
            skip = True
            mod_chars.append("a")
            mod_chars.append("n")
            vowel_seen = True  # 'a' acts as a vowel
            j += 1
        elif word[i:i + 2] == _DOUBLE_RO_RUA:
            skip = True
            mod_chars.append("a")
            vowel_seen = True  # 'a' acts as a vowel
            j += 1
        elif not vowel_seen:  # Still building the initial consonant cluster
            # Has any non-vowel romanized output been emitted so far?
            # ``not in`` is a substring test against the vowel string, so a
            # multi-character entry is treated as a consonant as well.
            has_initial = any(
                c and c not in _ROMANIZED_VOWELS for c in mod_chars
            )

            if not has_initial:
                # First consonant of the word; its initial form may be
                # empty, in which case nothing is emitted.
                initial = _CONSONANTS[consonants[j]][0]
                if initial:
                    mod_chars.append(initial)
            elif i == last_index:
                # Last character with no vowel written: supply the
                # implicit 'o' and close the syllable with the final form.
                mod_chars.append("o")
                mod_chars.append(_CONSONANTS[consonants[j]][1])
                vowel_seen = True
            elif char in _CLUSTER_SECOND:
                # ร / ล / ว after the first consonant, with more characters
                # to come: part of the initial cluster (e.g. กรม -> krom).
                mod_chars.append(_CONSONANTS[consonants[j]][0])
            else:
                # Any other consonant with more characters to come starts a
                # new syllable, so the previous one gets an implicit 'a'.
                mod_chars.append("a")
                vowel_seen = True
                initial = _CONSONANTS[consonants[j]][0]
                if initial:
                    mod_chars.append(initial)
                    vowel_seen = False  # Reset for the new syllable
            j += 1
        else:  # After a vowel: final consonant or start of a new syllable
            has_vowel_next = i < last_index and word[i + 1] not in _CONSONANTS
            if has_vowel_next:
                # Consonant followed by a vowel: start of a new syllable
                mod_chars.append(_CONSONANTS[consonants[j]][0])
                vowel_seen = False  # Reset for the new syllable
            else:
                # No vowel follows: this is a final consonant
                mod_chars.append(_CONSONANTS[consonants[j]][1])
            j += 1
    return "".join(mod_chars)
190258
191259
192260# support function for romanize()
193261def _romanize (word : str ) -> str :
262+ # Special case: single ห character should be empty (silent)
263+ if word == 'ห' :
264+ return ''
265+
194266 word = _replace_vowels (_normalize (word ))
195267 consonants = _RE_CONSONANT .findall (word )
196268
@@ -204,6 +276,39 @@ def _romanize(word: str) -> str:
204276 return word
205277
206278
def _should_add_syllable_separator(prev_word: str, curr_word: str, prev_romanized: str) -> bool:
    """
    Determine if 'a' should be added between two romanized syllables.

    The separator is added only when all of the following hold:

    - the previous token produced romanized output that does not end in
      one of ``_ROMANIZED_VOWELS``;
    - the current token is exactly two Thai consonants and nothing else
      (e.g., 'กร');
    - the previous token contains an explicit vowel, i.e. after
      ``_normalize`` and ``_replace_vowels`` it holds more characters
      than consonants.

    :param prev_word: The previous Thai word/token
    :param curr_word: The current Thai word/token
    :param prev_romanized: The romanized form of the previous word
    :return: True if 'a' should be added before the current word
    """
    if not prev_romanized or len(curr_word) < 2:
        return False

    # Cheap checks first, so the previous token is only re-processed when
    # the current token can actually take a separator.
    if prev_romanized[-1] in _ROMANIZED_VOWELS:
        return False

    # Current token must be two consonants with no other character.
    curr_consonants = _RE_CONSONANT.findall(curr_word)
    if len(curr_consonants) != 2 or len(curr_word) != len(curr_consonants):
        return False

    # Count consonants on the same normalized, vowel-replaced string whose
    # length is measured. Counting them on the raw token would include
    # characters that normalization or vowel replacement has already
    # removed, hiding an explicit vowel.
    prev_after_vowels = _replace_vowels(_normalize(prev_word))
    prev_consonants = _RE_CONSONANT.findall(prev_after_vowels)
    return len(prev_after_vowels) > len(prev_consonants)
310+
311+
def romanize(text: str) -> str:
    """Render Thai words in Latin alphabet, using RTGS

    The text is split with ``word_tokenize`` and every token is romanized
    on its own. A token additionally gets an ``'a'`` prefix when
    ``_should_add_syllable_separator`` asks for one, given the preceding
    token and its romanized form. The pieces are joined without any
    separator.

    :param text: Thai text to be romanized
    :type text: str
    :return: the romanized tokens concatenated into one string
    :rtype: str
    """
    pieces = []
    prev_token = None  # None only before the first token has been handled

    for token in word_tokenize(text):
        latin = _romanize(token)

        # The separator needs a preceding token and a non-empty result for
        # the current one; pieces[-1] is that preceding token's output.
        if (
            prev_token is not None
            and latin
            and _should_add_syllable_separator(prev_token, token, pieces[-1])
        ):
            latin = 'a' + latin

        pieces.append(latin)
        prev_token = token

    return "".join(pieces)
0 commit comments