@@ -92,7 +92,9 @@ public class ArpasingPlusPhonemizer : SyllableBasedPhonemizer {
9292 { "awng" , "ng" } ,
9393 { "el" , "l" } ,
9494 } ;
95-
95+ private readonly string [ ] ccvException = { "ch" , "dh" , "dx" , "fh" , "gh" , "hh" , "jh" , "kh" , "ph" , "ng" , "sh" , "th" , "vh" , "wh" , "zh" } ;
96+ private readonly string [ ] vc_cException = { "r" , "l" } ;
97+ private readonly string [ ] RomajiException = { "a" , "e" , "i" , "o" , "u" } ;
9698
9799 protected override string [ ] GetSymbols ( Note note ) {
98100 string [ ] original = base . GetSymbols ( note ) ;
@@ -298,14 +300,14 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
298300 var rccv1 = $ "- { string . Join ( "" , cc ) } { v } ";
299301 var crv = $ "{ cc . Last ( ) } { v } ";
300302 var ccv = $ "{ string . Join ( "" , cc ) } { v } ";
301- if ( HasOto ( rccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( rccv ) , syllable . vowelTone ) ) {
303+ if ( HasOto ( rccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( rccv ) , syllable . vowelTone ) && ! ccvException . Contains ( cc [ 0 ] ) ) {
302304 basePhoneme = rccv ;
303305 lastC = 0 ;
304- } else if ( HasOto ( rccv1 , syllable . vowelTone ) || HasOto ( ValidateAlias ( rccv1 ) , syllable . vowelTone ) ) {
306+ } else if ( HasOto ( rccv1 , syllable . vowelTone ) || HasOto ( ValidateAlias ( rccv1 ) , syllable . vowelTone ) && ! ccvException . Contains ( cc [ 0 ] ) ) {
305307 basePhoneme = rccv1 ;
306308 lastC = 0 ;
307309 } else {
308- if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
310+ if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) && ! ccvException . Contains ( cc [ 0 ] ) ) {
309311 basePhoneme = ccv ;
310312 } else if ( HasOto ( crv , syllable . vowelTone ) || HasOto ( ValidateAlias ( crv ) , syllable . vowelTone ) ) {
311313 basePhoneme = crv ;
@@ -314,9 +316,11 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
314316 }
315317 // TRY RCC [- CC]
316318 for ( var i = cc . Length ; i > 1 ; i -- ) {
317- if ( TryAddPhoneme ( phonemes , syllable . tone , $ "- { string . Join ( "" , cc . Take ( i ) ) } ", ValidateAlias ( $ "- { string . Join ( "" , cc . Take ( i ) ) } ") ) ) {
318- firstC = i - 1 ;
319- break ;
319+ if ( ! ccvException . Contains ( cc [ 0 ] ) ) {
320+ if ( TryAddPhoneme ( phonemes , syllable . tone , $ "- { string . Join ( "" , cc . Take ( i ) ) } ", ValidateAlias ( $ "- { string . Join ( "" , cc . Take ( i ) ) } ") ) ) {
321+ firstC = i - 1 ;
322+ break ;
323+ }
320324 }
321325 }
322326 // [- C]
@@ -335,11 +339,11 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
335339 for ( var i = firstC ; i < cc . Length - 1 ; i ++ ) {
336340 var ccv = $ "{ string . Join ( "" , cc ) } { v } ";
337341 var ccv1 = string . Join ( "" , cc . Skip ( i ) ) + " " + v ;
338- if ( syllable . CurrentWordCc . Length >= 2 ) {
342+ if ( syllable . CurrentWordCc . Length >= 2 && ! ccvException . Contains ( cc [ 0 ] ) ) {
339343 if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
340344 basePhoneme = ccv ;
341345 lastC = i ;
342- break ;
346+ break ;
343347 } else if ( HasOto ( ccv1 , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv1 ) , syllable . vowelTone ) ) {
344348 basePhoneme = ccv1 ;
345349 }
@@ -348,17 +352,31 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
348352 basePhoneme = crv ;
349353 }
350354 }
351- // try [V C], [V CC], [V -][- C]
355+ // try [V C], [V CC], [V C C], [V -][- C]
352356 for ( var i = lastC + 1 ; i >= 0 ; i -- ) {
353357 var vr = $ "{ prevV } -";
354- var vcc = $ "{ prevV } { string . Join ( "" , cc . Take ( 2 ) ) } "; // bug on vcc, sequence of [{vowel} v][v f][f {vowel}] turns into [{vowel} q/t][- {vowel}] which is odd
358+ var vc_c = $ "{ prevV } { string . Join ( " " , cc . Take ( 2 ) ) } ";
359+ var vcc = $ "{ prevV } { string . Join ( "" , cc . Take ( 2 ) ) } ";
355360 var vc = $ "{ prevV } { cc [ 0 ] } ";
361+ // If a [CCV] alias exists for this consonant cluster, the [V CC] transition is used ahead of it
362+ bool CCV = false ;
363+ if ( syllable . CurrentWordCc . Length >= 2 && ! ccvException . Contains ( cc [ 0 ] ) ) {
364+ if ( HasOto ( $ "{ string . Join ( "" , cc ) } { v } ", syllable . vowelTone ) || HasOto ( ValidateAlias ( $ "{ string . Join ( "" , cc ) } { v } ") , syllable . vowelTone ) ) {
365+ CCV = true ;
366+ }
367+ }
356368 if ( i == 0 && ( HasOto ( vr , syllable . tone ) || HasOto ( ValidateAlias ( vr ) , syllable . tone ) ) && ! HasOto ( vc , syllable . tone ) ) {
357369 phonemes . Add ( vr ) ;
358370 phonemes . Add ( $ "- { cc [ 0 ] } ") ;
359371 break ;
360- } else if ( syllable . IsStartingCVWithMoreThanOneConsonant && syllable . CurrentWordCc . Length >= 2 && HasOto ( vcc , syllable . tone ) || HasOto ( ValidateAlias ( vcc ) , syllable . tone ) ) {
361- phonemes . Add ( vcc ) ;
372+ } else if ( CCV ) {
373+ if ( ! ( ccvException . Contains ( cc [ 0 ] ) && ( HasOto ( vcc , syllable . tone ) || HasOto ( ValidateAlias ( vcc ) , syllable . tone ) ) ) ) {
374+ phonemes . Add ( vcc ) ;
375+ firstC = 1 ;
376+ break ;
377+ }
378+ } else if ( vc_cException . Contains ( cc [ 0 ] ) && HasOto ( vc_c , syllable . tone ) || HasOto ( ValidateAlias ( vc_c ) , syllable . tone ) ) {
379+ phonemes . Add ( vc_c ) ;
362380 firstC = 1 ;
363381 break ;
364382 } else if ( HasOto ( vc , syllable . tone ) || HasOto ( ValidateAlias ( vc ) , syllable . tone ) ) {
@@ -394,13 +412,12 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
394412 cc1 = ValidateAlias ( cc1 ) ;
395413 }
396414 // CC V on multiple consonants ex [s tr ao] (only if the word starts with a CC)
397- if ( syllable . CurrentWordCc . Length >= 2 ) {
415+ if ( syllable . CurrentWordCc . Length >= 2 && ! ccvException . Contains ( cc [ 0 ] ) ) {
398416 if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
399417 basePhoneme = ccv ;
400418 lastC = i ;
401419 break ;
402- } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) )
403- && HasOto ( cc1 , syllable . vowelTone ) && ! HasOto ( ccv , syllable . vowelTone ) ) {
420+ } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) ) && HasOto ( cc1 , syllable . vowelTone ) && ! HasOto ( ccv , syllable . vowelTone ) ) {
404421 basePhoneme = lcv ;
405422 }
406423 // [C1 C2C3]
@@ -435,13 +452,12 @@ protected override List<string> ProcessSyllable(Syllable syllable) {
435452 cc1 = ValidateAlias ( cc1 ) ;
436453 }
437454 // CC V on multiple consonants ex [s tr ao] (only if the word starts with a CC)
438- if ( syllable . CurrentWordCc . Length >= 2 ) {
455+ if ( syllable . CurrentWordCc . Length >= 2 && ! ccvException . Contains ( cc [ 0 ] ) ) {
439456 if ( HasOto ( ccv , syllable . vowelTone ) || HasOto ( ValidateAlias ( ccv ) , syllable . vowelTone ) ) {
440457 basePhoneme = ccv ;
441458 lastC = i ;
442459 break ;
443- } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) )
444- && HasOto ( cc1 , syllable . vowelTone ) && ! HasOto ( ccv , syllable . vowelTone ) ) {
460+ } else if ( ( HasOto ( lcv , syllable . vowelTone ) || HasOto ( ValidateAlias ( lcv ) , syllable . vowelTone ) ) && HasOto ( cc1 , syllable . vowelTone ) && ! HasOto ( ccv , syllable . vowelTone ) ) {
445461 basePhoneme = lcv ;
446462 }
447463 // [C1 C2C3]
@@ -496,21 +512,23 @@ protected override List<string> ProcessEnding(Ending ending) {
496512 } else {
497513 phonemes . Add ( vR ) ;
498514 }
499- } else if ( ending . IsEndingVCWithOneConsonant ) { // fix endings that ends with [v] turns into romaji vowel if the vb have them
515+ } else if ( ending . IsEndingVCWithOneConsonant ) { // fix endings that ends with [v] consonant turns into romaji vowel if the vb have them
500516 var vc = $ "{ v } { cc [ 0 ] } ";
501517 var vcr = $ "{ v } { cc [ 0 ] } -";
502518 var vcr2 = $ "{ v } { cc [ 0 ] } -";
503519 var vcr3 = $ "{ v } { cc [ 0 ] } -";
504- if ( HasOto ( vcr , ending . tone ) || HasOto ( ValidateAlias ( vcr ) , ending . tone ) ) {
505- phonemes . Add ( vcr ) ;
506- } else if ( ! HasOto ( vcr , ending . tone ) && ! HasOto ( ValidateAlias ( vcr ) , ending . tone ) && ( HasOto ( vcr2 , ending . tone ) || HasOto ( ValidateAlias ( vcr2 ) , ending . tone ) ) ) {
507- phonemes . Add ( vcr2 ) ;
508- } else if ( ! HasOto ( vcr2 , ending . tone ) && ! HasOto ( ValidateAlias ( vcr2 ) , ending . tone ) && ( HasOto ( vcr3 , ending . tone ) || HasOto ( ValidateAlias ( vcr3 ) , ending . tone ) ) ) {
509- phonemes . Add ( vcr3 ) ;
510- } else {
511- phonemes . Add ( vc ) ;
512- if ( vc . Contains ( cc [ 0 ] ) ) {
513- phonemes . Add ( $ "{ cc [ 0 ] } -") ;
520+ if ( ! RomajiException . Contains ( cc [ 0 ] ) ) {
521+ if ( HasOto ( vcr , ending . tone ) || HasOto ( ValidateAlias ( vcr ) , ending . tone ) ) {
522+ phonemes . Add ( vcr ) ;
523+ } else if ( ! HasOto ( vcr , ending . tone ) && ! HasOto ( ValidateAlias ( vcr ) , ending . tone ) && ( HasOto ( vcr2 , ending . tone ) || HasOto ( ValidateAlias ( vcr2 ) , ending . tone ) ) ) {
524+ phonemes . Add ( vcr2 ) ;
525+ } else if ( ! HasOto ( vcr2 , ending . tone ) && ! HasOto ( ValidateAlias ( vcr2 ) , ending . tone ) && ( HasOto ( vcr3 , ending . tone ) || HasOto ( ValidateAlias ( vcr3 ) , ending . tone ) ) ) {
526+ phonemes . Add ( vcr3 ) ;
527+ } else {
528+ phonemes . Add ( vc ) ;
529+ if ( vc . Contains ( cc [ 0 ] ) ) {
530+ phonemes . Add ( $ "{ cc [ 0 ] } -") ;
531+ }
514532 }
515533 }
516534 } else {
@@ -521,44 +539,47 @@ protected override List<string> ProcessEnding(Ending ending) {
521539 var vcc3 = $ "{ v } { string . Join ( " " , cc . Take ( 2 ) ) } ";
522540 var vcc4 = $ "{ v } { string . Join ( "" , cc . Take ( 2 ) ) } ";
523541 var vc = $ "{ v } { cc [ 0 ] } ";
524- if ( i == 0 ) {
525- if ( HasOto ( vr , ending . tone ) || HasOto ( ValidateAlias ( vr ) , ending . tone ) && ! HasOto ( vc , ending . tone ) ) {
526- phonemes . Add ( vr ) ;
527- }
528- break ;
529- } else if ( ( HasOto ( vcc , ending . tone ) || HasOto ( ValidateAlias ( vcc ) , ending . tone ) ) && lastC == 1 ) {
530- phonemes . Add ( vcc ) ;
531- firstC = 1 ;
532- break ;
533- } else if ( ( HasOto ( vcc2 , ending . tone ) || HasOto ( ValidateAlias ( vcc2 ) , ending . tone ) ) && lastC == 1 ) {
534- phonemes . Add ( vcc2 ) ;
535- firstC = 1 ;
536- break ;
537- } else if ( HasOto ( vcc3 , ending . tone ) || HasOto ( ValidateAlias ( vcc3 ) , ending . tone ) ) {
538- phonemes . Add ( vcc3 ) ;
539- if ( vcc3 . EndsWith ( cc . Last ( ) ) && lastC == 1 ) {
540- if ( affricates . Contains ( cc . Last ( ) ) ) {
541- TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") , cc . Last ( ) , ValidateAlias ( cc . Last ( ) ) ) ;
542- } else {
543- TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") ) ;
542+ if ( ! RomajiException . Contains ( cc [ 0 ] ) ) {
543+ if ( i == 0 ) {
544+ if ( HasOto ( vr , ending . tone ) || HasOto ( ValidateAlias ( vr ) , ending . tone ) && ! HasOto ( vc , ending . tone ) ) {
545+ phonemes . Add ( vr ) ;
544546 }
545- }
546- firstC = 1 ;
547- break ;
548- } else if ( HasOto ( vcc4 , ending . tone ) || HasOto ( ValidateAlias ( vcc4 ) , ending . tone ) ) {
549- phonemes . Add ( vcc4 ) ;
550- if ( vcc4 . EndsWith ( cc . Last ( ) ) && lastC == 1 ) {
551- if ( affricates . Contains ( cc . Last ( ) ) ) {
552- TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") , cc . Last ( ) , ValidateAlias ( cc . Last ( ) ) ) ;
553- } else {
554- TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") ) ;
547+ break ;
548+ } else if ( ( HasOto ( vcc , ending . tone ) || HasOto ( ValidateAlias ( vcc ) , ending . tone ) ) && lastC == 1 && ! ccvException . Contains ( cc [ 0 ] ) ) {
549+ phonemes . Add ( vcc ) ;
550+ firstC = 1 ;
551+ break ;
552+ } else if ( ( HasOto ( vcc2 , ending . tone ) || HasOto ( ValidateAlias ( vcc2 ) , ending . tone ) ) && lastC == 1 && ! ccvException . Contains ( cc [ 0 ] ) ) {
553+
554+ phonemes . Add ( vcc2 ) ;
555+ firstC = 1 ;
556+ break ;
557+ } else if ( HasOto ( vcc3 , ending . tone ) || HasOto ( ValidateAlias ( vcc3 ) , ending . tone ) && ! ccvException . Contains ( cc [ 0 ] ) ) {
558+ phonemes . Add ( vcc3 ) ;
559+ if ( vcc3 . EndsWith ( cc . Last ( ) ) && lastC == 1 ) {
560+ if ( consonants . Contains ( cc . Last ( ) ) ) {
561+ TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") , cc . Last ( ) , ValidateAlias ( cc . Last ( ) ) ) ;
562+ } else {
563+ TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") ) ;
564+ }
555565 }
566+ firstC = 1 ;
567+ break ;
568+ } else if ( HasOto ( vcc4 , ending . tone ) || HasOto ( ValidateAlias ( vcc4 ) , ending . tone ) && ! ccvException . Contains ( cc [ 0 ] ) ) {
569+ phonemes . Add ( vcc4 ) ;
570+ if ( vcc4 . EndsWith ( cc . Last ( ) ) && lastC == 1 ) {
571+ if ( consonants . Contains ( cc . Last ( ) ) ) {
572+ TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") , cc . Last ( ) , ValidateAlias ( cc . Last ( ) ) ) ;
573+ } else {
574+ TryAddPhoneme ( phonemes , ending . tone , $ "{ cc . Last ( ) } -", ValidateAlias ( $ "{ cc . Last ( ) } -") ) ;
575+ }
576+ }
577+ firstC = 1 ;
578+ break ;
579+ } else {
580+ phonemes . Add ( vc ) ;
581+ break ;
556582 }
557- firstC = 1 ;
558- break ;
559- } else {
560- phonemes . Add ( vc ) ;
561- break ;
562583 }
563584 }
564585 for ( var i = firstC ; i < lastC ; i ++ ) {
0 commit comments