@@ -25,7 +25,7 @@ public class ArpasingPlusPhonemizer : SyllableBasedPhonemizer {
2525 "aam" , "am" , "axm" , "aem" , "ahm" , "aom" , "om" , "awm" , "aum" , "aym" , "aim" , "ehm" , "em" , "eym" , "eim" , "ihm" , "iym" , "im" , "owm" , "oum" , "oym" , "oim" , "uhm" , "uwm" , "um" , "oh" ,
2626 "eu" , "oe" , "yw" , "yx" , "wx"
2727 } ;
28- private readonly string [ ] consonants = "b,ch,d,dh,dr, dx,f,g,hh,jh,k,l,m,n,ng,p,q,r,s,sh,t,th,tr ,v,w,y,z,zh" . Split ( ',' ) ;
28+ private readonly string [ ] consonants = "b,ch,d,dh,dx,f,g,hh,jh,k,l,m,n,ng,p,q,r,s,sh,t,th,v,w,y,z,zh" . Split ( ',' ) ;
2929 private readonly string [ ] affricates = "ch,jh,j" . Split ( ',' ) ;
3030 private readonly string [ ] tapConsonant = "dx" . Split ( "," ) ;
3131 private readonly string [ ] semilongConsonants = "ng,n,m,v,z,q,hh" . Split ( "," ) ;
@@ -204,24 +204,24 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
204204 var firstC = 0 ;
205205
206206 foreach ( var entry in missingVphonemes ) {
207- if ( HasOto ( entry . Key , syllable . tone ) ) {
207+ if ( ! HasOto ( "ax" , syllable . tone ) || ! HasOto ( "b ax" , syllable . tone ) || ! HasOto ( "ax b" , syllable . tone ) ) {
208208 isMissingVPhonemes = true ;
209209 break ;
210210 }
211211 }
212212 foreach ( var entry in missingCphonemes ) {
213- if ( HasOto ( entry . Key , syllable . tone ) ) {
213+ if ( ! HasOto ( "wh" , syllable . tone ) || ! HasOto ( "zh er" , syllable . tone ) || ! HasOto ( "ah dx" , syllable . tone ) ) {
214214 isMissingCPhonemes = true ;
215215 break ;
216216 }
217217 }
218218 foreach ( var entry in timitphonemes ) {
219- if ( HasOto ( entry . Key , syllable . tone ) ) {
219+ if ( ! HasOto ( "gcl" , syllable . tone ) || ! HasOto ( "f axh" , syllable . tone ) || ! HasOto ( "ih tcl" , syllable . tone ) ) {
220220 isTimitPhonemes = true ;
221221 break ;
222222 }
223223 }
224-
224+
225225 // STARTING V
226226 if ( syllable . IsStartingV ) {
227227 // TRIES - V THEN V
@@ -335,40 +335,38 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
335335 for ( var i = firstC ; i < cc . Length - 1 ; i ++ ) {
336336 var ccv = $ "{ string . Join ( "" , cc ) } { v } ";
337337 var ccv1 = string . Join ( "" , cc . Skip ( i ) ) + " " + v ;
338- if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
339- basePhoneme = ccv ;
340- lastC = i ;
341- break ;
342- } else {
343- if ( HasOto ( ccv1 , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv1 ) , syllable . vowelTone ) ) {
344- basePhoneme = ccv1 ;
338+ if ( syllable . CurrentWordCc . Length >= 2 ) {
339+ if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
340+ basePhoneme = ccv ;
341+ lastC = i ;
345342 break ;
346- } else if ( HasOto ( crv , syllable . vowelTone ) || HasOto ( ValidateAlias ( crv ) , syllable . vowelTone ) ) {
347- basePhoneme = crv ;
348- break ;
349- } else {
350- basePhoneme = $ "{ cc . Last ( ) } { v } ";
343+ } else if ( HasOto ( ccv1 , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv1 ) , syllable . vowelTone ) ) {
344+ basePhoneme = ccv1 ;
351345 }
346+ break ;
347+ } else if ( syllable . CurrentWordCc . Length >= 1 && syllable . PreviousWordCc . Length == 1 ) {
348+ basePhoneme = crv ;
352349 }
353350 }
354351 // try [V C], [V CC], [V -][- C]
355352 for ( var i = lastC + 1 ; i >= 0 ; i -- ) {
356353 var vr = $ "{ prevV } -";
357- var vcc = $ "{ prevV } { string . Join ( "" , cc . Take ( 2 ) ) } "; // bug on vcc, sequence of [{vowel} v][v f][f {vowel}] turns in to [{vowel} q/t][- {vowel}] which is odd
354+ var vcc = $ "{ prevV } { string . Join ( "" , cc . Take ( 2 ) ) } "; // bug on vcc, sequence of [{vowel} v][v f][f {vowel}] turns into [{vowel} q/t][- {vowel}] which is odd
358355 var vc = $ "{ prevV } { cc [ 0 ] } ";
359356 if ( i == 0 && ( HasOto ( vr , syllable . tone ) || HasOto ( ValidateAlias ( vr ) , syllable . tone ) ) && ! HasOto ( vc , syllable . tone ) ) {
360357 phonemes . Add ( vr ) ;
361358 phonemes . Add ( $ "- { cc [ 0 ] } ") ;
362359 break ;
363- }
364- if ( HasOto ( vcc , syllable . tone ) || HasOto ( ValidateAlias ( vcc ) , syllable . tone ) ) {
360+ } else if ( cc . Length > 2 && HasOto ( vcc , syllable . tone ) || HasOto ( ValidateAlias ( vcc ) , syllable . tone ) ) {
365361 phonemes . Add ( vcc ) ;
366362 firstC = 1 ;
367363 break ;
368- }
369- if ( HasOto ( vc , syllable . tone ) || HasOto ( ValidateAlias ( vc ) , syllable . tone ) ) {
364+ } else if ( HasOto ( vc , syllable . tone ) || HasOto ( ValidateAlias ( vc ) , syllable . tone ) ) {
370365 phonemes . Add ( vc ) ;
371366 break ;
367+ } else {
368+ // If none of the conditions are met, continue the loop
369+ continue ;
372370 }
373371 }
374372 }
@@ -379,10 +377,6 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
379377 if ( ! HasOto ( cc1 , syllable . tone ) ) {
380378 cc1 = ValidateAlias ( cc1 ) ;
381379 }
382- // [C1 C2C3]
383- if ( HasOto ( $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ", syllable . tone ) ) {
384- cc1 = ( $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ") ;
385- }
386380 // [C1 C2]
387381 if ( ! HasOto ( cc1 , syllable . tone ) ) {
388382 cc1 = $ "{ cc [ i ] } { cc [ i + 1 ] } ";
@@ -399,22 +393,31 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
399393 if ( ! HasOto ( cc1 , syllable . tone ) ) {
400394 cc1 = ValidateAlias ( cc1 ) ;
401395 }
402- // CC V on multiple consonants ex [s tr ao]
403- if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
404- basePhoneme = ccv ;
405- lastC = i ;
406- break ;
407- } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) ) && HasOto ( cc1 , syllable . vowelTone ) && ! cc1 . Contains ( $ "{ cc [ i ] } { cc [ i + 1 ] } ") ) {
396+ // CC V on multiple consonants ex [s tr ao] (only if the word starts with a CC)
397+ if ( syllable . CurrentWordCc . Length >= 2 && syllable . PreviousWordCc . Length >= 1 ) {
398+ if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
399+ basePhoneme = ccv ;
400+ lastC = i ;
401+ break ;
402+ } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) )
403+ && HasOto ( cc1 , syllable . vowelTone ) && ! HasOto ( ccv , syllable . vowelTone ) ) {
404+ basePhoneme = lcv ;
405+ }
406+ // [C1 C2C3]
407+ if ( HasOto ( $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ", syllable . tone ) ) {
408+ cc1 = $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ";
409+ }
410+ } else if ( syllable . CurrentWordCc . Length >= 1 && syllable . PreviousWordCc . Length == 1 ) {
408411 basePhoneme = lcv ;
412+ // [C1 C2]
413+ if ( ! HasOto ( cc1 , syllable . tone ) ) {
414+ cc1 = $ "{ cc [ i ] } { cc [ i + 1 ] } ";
415+ }
409416 }
410417 if ( i + 1 < lastC ) {
411418 if ( ! HasOto ( cc1 , syllable . tone ) ) {
412419 cc1 = ValidateAlias ( cc1 ) ;
413420 }
414- // [C1 C2C3]
415- if ( HasOto ( $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ", syllable . tone ) ) {
416- cc1 = ( $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ") ;
417- }
418421 // [C1 C2]
419422 if ( ! HasOto ( cc1 , syllable . tone ) ) {
420423 cc1 = $ "{ cc [ i ] } { cc [ i + 1 ] } ";
@@ -431,13 +434,26 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
431434 if ( ! HasOto ( cc1 , syllable . tone ) ) {
432435 cc1 = ValidateAlias ( cc1 ) ;
433436 }
434- // CC V on multiple consonants ex [s tr ao]
435- if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
436- basePhoneme = ccv ;
437- lastC = i ;
438- break ;
439- } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) ) && HasOto ( cc1 , syllable . vowelTone ) && ! cc1 . Contains ( $ "{ cc [ i ] } { cc [ i + 1 ] } ") ) {
437+ // CC V on multiple consonants ex [s tr ao] (only if the word starts with a CC)
438+ if ( syllable . CurrentWordCc . Length >= 2 && syllable . PreviousWordCc . Length >= 1 ) {
439+ if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
440+ basePhoneme = ccv ;
441+ lastC = i ;
442+ break ;
443+ } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) )
444+ && HasOto ( cc1 , syllable . vowelTone ) && ! HasOto ( ccv , syllable . vowelTone ) ) {
445+ basePhoneme = lcv ;
446+ }
447+ // [C1 C2C3]
448+ if ( HasOto ( $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ", syllable . tone ) ) {
449+ cc1 = $ "{ cc [ i ] } { string . Join ( "" , cc . Skip ( i + 1 ) ) } ";
450+ }
451+ } else if ( syllable . CurrentWordCc . Length >= 1 && syllable . PreviousWordCc . Length == 1 ) {
440452 basePhoneme = lcv ;
453+ // [C1 C2]
454+ if ( ! HasOto ( cc1 , syllable . tone ) ) {
455+ cc1 = $ "{ cc [ i ] } { cc [ i + 1 ] } ";
456+ }
441457 }
442458 if ( HasOto ( cc1 , syllable . tone ) && HasOto ( cc1 , syllable . tone ) && ! cc1 . Contains ( $ "{ string . Join ( "" , cc . Skip ( i ) ) } ") ) {
443459 // like [V C1] [C1 C2] [C2 C3] [C3 ..]
@@ -455,6 +471,7 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
455471 TryAddPhoneme ( phonemes , syllable . tone , cc1 ) ;
456472 }
457473 }
474+
458475 phonemes . Add ( basePhoneme ) ;
459476 return phonemes ;
460477 }
@@ -938,10 +955,10 @@ protected override string ValidateAlias(string alias) {
938955 return alias . Replace ( "ao dx" , "ah d" ) ;
939956 }
940957 if ( alias == "ao q" ) {
941- return alias . Replace ( "ao q" , "ah t" ) ;
958+ return alias . Replace ( "ao q" , "ao t" ) ;
942959 }
943960 if ( alias == "ao tr" ) {
944- return alias . Replace ( "ao tr" , "ah t" ) ;
961+ return alias . Replace ( "ao tr" , "ao t" ) ;
945962 }
946963 if ( alias == "ao y" ) {
947964 return alias . Replace ( "ao y" , "ow y" ) ;
@@ -1649,80 +1666,98 @@ protected override string ValidateAlias(string alias) {
16491666 }
16501667
16511668 // glottal
1652- foreach ( var c1 in new [ ] { "q" } ) {
1653- foreach ( var v1 in vowels ) {
1654-
1655- alias = alias . Replace ( c1 + " " + v1 , "-" + " " + v1 ) ;
1669+ foreach ( var v1 in vowels ) {
1670+ if ( ! alias . Contains ( "cl " + v1 ) || ! alias . Contains ( "q " + v1 ) ) {
1671+ alias = alias . Replace ( "q " + v1 , "- " + v1 ) ;
16561672 }
16571673 }
1658- foreach ( var c1 in new [ ] { "q" } ) {
1659- foreach ( var c2 in consonants ) {
1660- alias = alias . Replace ( c2 + " " + c1 , $ "{ c2 } -") ;
1674+ foreach ( var c2 in consonants ) {
1675+ if ( ! alias . Contains ( c2 + " cl" ) || ! alias . Contains ( c2 + " q" ) ) {
1676+ alias = alias . Replace ( c2 + " q" , $ "{ c2 } -") ;
16611677 }
16621678 }
1663- foreach ( var c1 in new [ ] { "q" } ) {
1664- foreach ( var c2 in consonants ) {
1665- alias = alias . Replace ( c1 + " " + c2 , $ "- { c2 } " ) ;
1679+ foreach ( var c2 in consonants ) {
1680+ if ( ! alias . Contains ( "cl " + c2 ) || ! alias . Contains ( "q " + c2 ) ) {
1681+ alias = alias . Replace ( "q " + c2 , "- " + c2 ) ;
16661682 }
16671683 }
1668-
1684+
16691685 // C -'s
16701686 foreach ( var c1 in new [ ] { "d" , "dh" , "g" , "p" , "jh" , "b" , "s" , "ch" , "t" , "r" , "n" , "l" , "ng" , "sh" , "zh" , "th" , "z" , "f" , "k" , "s" , "hh" } ) {
16711687 foreach ( var s in new [ ] { "-" } ) {
1672- switch ( c1 + " " + s ) {
1673- case var str when alias . Contains ( str ) :
1674- if ( c1 == "d" || c1 == "dh" || c1 == "g" || c1 == "p" ) {
1688+ var str = c1 + " " + s ;
1689+ if ( alias . Contains ( str ) ) {
1690+ switch ( c1 ) {
1691+ case "d" when c1 == "d" || c1 == "dh" || c1 == "g" || c1 == "p" :
16751692 alias = alias . Replace ( str , "b" + " " + s ) ;
1676- } else if ( c1 == "jh" ) {
1693+ break ;
1694+ case "jh" when c1 == "jh" :
16771695 alias = alias . Replace ( str , "ch" + " " + s ) ;
1678- } else if ( c1 == "b" ) {
1696+ break ;
1697+ case "b" when c1 == "b" :
16791698 alias = alias . Replace ( str , "d" + " " + s ) ;
1680- } else if ( c1 == "s" ) {
1699+ break ;
1700+ case "s" when c1 == "s" :
16811701 alias = alias . Replace ( str , "f" + " " + s ) ;
1682- } else if ( c1 == "ch" ) {
1702+ break ;
1703+ case "ch" when c1 == "ch" :
16831704 alias = alias . Replace ( str , "jh" + " " + s ) ;
1684- } else if ( c1 == "t" ) {
1705+ break ;
1706+ case "t" when c1 == "t" :
16851707 alias = alias . Replace ( str , "k" + " " + s ) ;
1686- } else if ( c1 == "r" ) {
1708+ break ;
1709+ case "r" when c1 == "r" :
16871710 alias = alias . Replace ( str , "er" + " " + s ) ;
1688- } else if ( c1 == "n" ) {
1711+ break ;
1712+ case "n" when c1 == "n" :
16891713 alias = alias . Replace ( str , "m" + " " + s ) ;
1690- } else if ( c1 == "ng" || c1 == "m" ) {
1714+ break ;
1715+ case "ng" when c1 == "ng" || c1 == "m" :
16911716 alias = alias . Replace ( str , "n" + " " + s ) ;
1692- } else if ( c1 == "sh" || c1 == "zh" || c1 == "th" || c1 == "z" || c1 == "f" ) {
1717+ break ;
1718+ case "sh" when c1 == "sh" || c1 == "zh" || c1 == "th" || c1 == "z" || c1 == "f" :
16931719 alias = alias . Replace ( str , "s" + " " + s ) ;
1694- } else if ( c1 == "k" ) {
1720+ break ;
1721+ case "k" when c1 == "k" :
16951722 alias = alias . Replace ( str , "t" + " " + s ) ;
1696- } else if ( c1 == "s" ) {
1723+ break ;
1724+ case "s" when c1 == "s" :
16971725 alias = alias . Replace ( str , "z" + " " + s ) ;
1698- } else if ( c1 == "hh" ) {
1726+ break ;
1727+ case "hh" when c1 == "hh" :
16991728 alias = alias . Replace ( str , null ) ;
1700- }
1701- break ;
1729+ break ;
1730+ }
17021731 }
17031732 }
17041733 }
17051734 // CC's
17061735 foreach ( var c1 in new [ ] { "f" , "z" , "hh" , "k" , "p" , "d" , "dh" , "g" , "b" , "m" , "r" } ) {
17071736 foreach ( var c2 in consonants ) {
1708- switch ( c1 + " " + c2 ) {
1709- case var str when alias . Contains ( str ) :
1710- if ( ccSpecific ) {
1711- if ( c1 == "f" || c1 == "z" ) {
1737+ var str = c1 + " " + c2 ;
1738+ if ( alias . Contains ( str ) ) {
1739+ if ( ccSpecific ) {
1740+ switch ( c1 ) {
1741+ case "f" when c1 == "f" || c1 == "z" :
17121742 alias = alias . Replace ( str , "s" + " " + c2 ) ;
1713- } else if ( c1 == "k" || c1 == "p" || c1 == "d" ) {
1743+ break ;
1744+ case "k" when c1 == "k" || c1 == "p" || c1 == "d" :
17141745 alias = alias . Replace ( str , "t" + " " + c2 ) ;
1715- } else if ( c1 == "dh" || c1 == "g" || c1 == "b" ) {
1746+ break ;
1747+ case "dh" when c1 == "dh" || c1 == "g" || c1 == "b" :
17161748 alias = alias . Replace ( str , "d" + " " + c2 ) ;
1717- } else if ( c1 == "m" ) {
1749+ break ;
1750+ case "m" when c1 == "m" :
17181751 alias = alias . Replace ( str , "n" + " " + c2 ) ;
1719- } else if ( c1 == "hh" ) {
1752+ break ;
1753+ case "hh" when c1 == "hh" :
17201754 alias = alias . Replace ( str , "f" + " " + c2 ) ;
1721- } else if ( c1 == "r" ) {
1755+ break ;
1756+ case "r" when c1 == "r" :
17221757 alias = alias . Replace ( str , "er" + " " + c2 ) ;
1723- }
1758+ break ;
17241759 }
1725- break ;
1760+ }
17261761 }
17271762 }
17281763 }
0 commit comments