11import os
2+ import re
23import numpy as np
34import librosa
45import soundfile as sf
1011
# Symbol inventory for the phoneme front end.  The order of the pieces below
# fixes every symbol's numeric ID, so changing it changes the ID mapping.
_pad = ['_']
_eos = ['~']

# Silence plus the prosody marks '#0'..'#3' that the phoneme aligner emits
# between syllables.
_pause = ['sil', '#0', '#1', '#2', '#3']

# '^' is the placeholder initial that pinyin_dict assigns to syllables
# such as 'a', 'wu' or 'yi'.
_initials = ['^', 'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
             'p', 'q', 'r', 's', 'sh', 't', 'x', 'z', 'zh']

_tones = ['1', '2', '3', '4', '5']

_finals = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i',
           'ia', 'ian', 'iang', 'iao', 'ie', 'ii', 'iii', 'in', 'ing', 'iong',
           'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei',
           'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn']

# Every final is combined with every tone digit ('a1', 'a2', ..., 'vn5').
symbols = (
    _pad
    + _pause
    + _initials
    + [final + tone for final in _finals for tone in _tones]
    + _eos
)

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
2629
2730
28-
2931pinyin_dict = {
30- 'a' : ('' , 'a' ),
31- 'ai' : ('' , 'ai' ),
32- 'an' : ('' , 'an' ),
33- 'ang' : ('' , 'ang' ),
34- 'ao' : ('' , 'ao' ),
32+ 'a' : ('^ ' , 'a' ),
33+ 'ai' : ('^ ' , 'ai' ),
34+ 'an' : ('^ ' , 'an' ),
35+ 'ang' : ('^ ' , 'ang' ),
36+ 'ao' : ('^ ' , 'ao' ),
3537 'ba' : ('b' , 'a' ),
3638 'bai' : ('b' , 'ai' ),
3739 'ban' : ('b' , 'an' ),
107109 'dui' : ('d' , 'uei' ),
108110 'dun' : ('d' , 'uen' ),
109111 'duo' : ('d' , 'uo' ),
110- 'e' : ('' , 'e' ),
111- 'ei' : ('' , 'ei' ),
112- 'en' : ('' , 'en' ),
113- 'ng' : ('' , 'en' ),
114- 'eng' : ('' , 'eng' ),
115- 'er' : ('' , 'er' ),
112+ 'e' : ('^ ' , 'e' ),
113+ 'ei' : ('^ ' , 'ei' ),
114+ 'en' : ('^ ' , 'en' ),
115+ 'ng' : ('^ ' , 'en' ),
116+ 'eng' : ('^ ' , 'eng' ),
117+ 'er' : ('^ ' , 'er' ),
116118 'fa' : ('f' , 'a' ),
117119 'fan' : ('f' , 'an' ),
118120 'fang' : ('f' , 'ang' ),
265267 'nve' : ('n' , 've' ),
266268 'nue' : ('n' , 've' ),
267269 'nuo' : ('n' , 'uo' ),
268- 'o' : ('' , 'o' ),
269- 'ou' : ('' , 'ou' ),
270+ 'o' : ('^ ' , 'o' ),
271+ 'ou' : ('^ ' , 'ou' ),
270272 'pa' : ('p' , 'a' ),
271273 'pai' : ('p' , 'ai' ),
272274 'pan' : ('p' , 'an' ),
369371 'tui' : ('t' , 'uei' ),
370372 'tun' : ('t' , 'uen' ),
371373 'tuo' : ('t' , 'uo' ),
372- 'wa' : ('' , 'ua' ),
373- 'wai' : ('' , 'uai' ),
374- 'wan' : ('' , 'uan' ),
375- 'wang' : ('' , 'uang' ),
376- 'wei' : ('' , 'uei' ),
377- 'wen' : ('' , 'uen' ),
378- 'weng' : ('' , 'ueng' ),
379- 'wo' : ('' , 'uo' ),
380- 'wu' : ('' , 'u' ),
374+ 'wa' : ('^ ' , 'ua' ),
375+ 'wai' : ('^ ' , 'uai' ),
376+ 'wan' : ('^ ' , 'uan' ),
377+ 'wang' : ('^ ' , 'uang' ),
378+ 'wei' : ('^ ' , 'uei' ),
379+ 'wen' : ('^ ' , 'uen' ),
380+ 'weng' : ('^ ' , 'ueng' ),
381+ 'wo' : ('^ ' , 'uo' ),
382+ 'wu' : ('^ ' , 'u' ),
381383 'xi' : ('x' , 'i' ),
382384 'xia' : ('x' , 'ia' ),
383385 'xian' : ('x' , 'ian' ),
392394 'xuan' : ('x' , 'van' ),
393395 'xue' : ('x' , 've' ),
394396 'xun' : ('x' , 'vn' ),
395- 'ya' : ('' , 'ia' ),
396- 'yan' : ('' , 'ian' ),
397- 'yang' : ('' , 'iang' ),
398- 'yao' : ('' , 'iao' ),
399- 'ye' : ('' , 'ie' ),
400- 'yi' : ('' , 'i' ),
401- 'yin' : ('' , 'in' ),
402- 'ying' : ('' , 'ing' ),
403- 'yo' : ('' , 'iou' ),
404- 'yong' : ('' , 'iong' ),
405- 'you' : ('' , 'iou' ),
406- 'yu' : ('' , 'v' ),
407- 'yuan' : ('' , 'van' ),
408- 'yue' : ('' , 've' ),
409- 'yun' : ('' , 'vn' ),
397+ 'ya' : ('^ ' , 'ia' ),
398+ 'yan' : ('^ ' , 'ian' ),
399+ 'yang' : ('^ ' , 'iang' ),
400+ 'yao' : ('^ ' , 'iao' ),
401+ 'ye' : ('^ ' , 'ie' ),
402+ 'yi' : ('^ ' , 'i' ),
403+ 'yin' : ('^ ' , 'in' ),
404+ 'ying' : ('^ ' , 'ing' ),
405+ 'yo' : ('^ ' , 'iou' ),
406+ 'yong' : ('^ ' , 'iong' ),
407+ 'you' : ('^ ' , 'iou' ),
408+ 'yu' : ('^ ' , 'v' ),
409+ 'yuan' : ('^ ' , 'van' ),
410+ 'yue' : ('^ ' , 've' ),
411+ 'yun' : ('^ ' , 'vn' ),
410412 'za' : ('z' , 'a' ),
411413 'zai' : ('z' , 'ai' ),
412414 'zan' : ('z' , 'an' ),
447449}
448450
449451
450- def process_phonelabel (label_file ):
451- with open (label_file , 'r' , encoding = 'utf-8' ) as f :
452- lines = f .readlines ()[12 :]
453- assert len (lines ) % 3 == 0
454-
455- text = []
456- for i in range (0 , len (lines ), 3 ):
457- begin = float (lines [i ].strip ())
458- if i == 0 :
459- assert begin == 0.
460- phone = lines [i + 2 ].strip ()
461- text .append (phone .replace ('"' , '' ))
462-
463- return text
# Matches one character in the range U+4E00..U+9FA5.  The character class
# must not contain spaces: a stray space would turn the range into
# ' '..U+9FA5 and make the pattern match ASCII text as well.
zh_pattern = re.compile('[\u4e00-\u9fa5]')


def is_zh(word):
    """Return True if *word* contains at least one character in U+4E00..U+9FA5.

    Args:
        word: The string to inspect (may be a single character or empty).

    Returns:
        bool: True when the pattern matches anywhere in *word*, else False.
    """
    return zh_pattern.search(word) is not None
464457
465458
466459class MyConverter (NeutralToneWith5Mixin , DefaultConverter ):
@@ -479,41 +472,61 @@ def __init__(self, data_dir, target_rate=24000, cleaner_names=None):
479472 with open (os .path .join (data_dir , 'ProsodyLabeling/000001-010000.txt' ), encoding = 'utf-8' ) as ttf :
480473 lines = ttf .readlines ()
481474 for idx in range (0 , len (lines ), 2 ):
482- utt_id , _ = lines [idx ].strip ().split ()
483- phonemes = process_phonelabel (os .path .join (data_dir , f'PhoneLabeling/{ utt_id } .interval' ))
484- phonemes = self .deal_r (phonemes )
485- if 'pl' in phonemes or 'ng1' in phonemes :
486- print (f'Skip this: { utt_id } { phonemes } ' )
475+ utt_id , chn_char = lines [idx ].strip ().split ()
476+ pinyin = lines [idx + 1 ].strip ().split ()
477+ if 'IY1' in pinyin or 'B' in chn_char :
478+ print (f'Skip this: { utt_id } { chn_char } { pinyin } ' )
487479 continue
480+ phonemes = self .get_phoneme_from_char_and_pinyin (chn_char , pinyin )
488481 wav_path = os .path .join (data_dir , 'Wave' , '%s.wav' % utt_id )
489482 items .append ([' ' .join (phonemes ), wav_path , self .speaker_name , utt_id ])
490483 self .items = items
491-
492- self .pinyin = self .get_pinyin ()
484+ self .pinyin_parser = self .get_pinyin_parser ()
493485
@staticmethod
def get_phoneme_from_char_and_pinyin(chn_char, pinyin):
    """Align annotated Chinese text with its pinyin and build a phoneme list.

    The text is scanned character by character.  Each Chinese character
    consumes one entry of *pinyin*, which is split into an initial and a
    toned final through ``pinyin_dict``.  A syllable whose toneless form is
    not in ``pinyin_dict`` is treated as an erhua syllable: it must end in
    ``r`` + tone, the next character must be '儿', and both characters are
    consumed for that single pinyin entry (emitting an extra ``'er5'``).
    A ``'#'`` is copied to the output together with the character that
    follows it.  Any other character is ignored.

    Args:
        chn_char: Text containing Chinese characters, ``#N`` prosody marks
            and possibly punctuation or other characters.
        pinyin: Sequence of tone-numbered syllables (e.g. ``'ba1'``), one
            per pronounced syllable of *chn_char*.

    Returns:
        list[str]: Phoneme symbols, always starting and ending with
        ``'sil'``.  ``'#0'`` is inserted after a syllable that is followed
        by a character other than ``'#'``; a trailing ``'#0'`` is removed.

    Raises:
        ValueError: If the text and the pinyin sequence cannot be aligned
            (too few or too many syllables, or a syllable that is neither
            in ``pinyin_dict`` nor a valid erhua form).
    """
    # '#4' marks are simply dropped; the utterance is closed with 'sil'
    # at the end of this function regardless.
    chn_char = chn_char.replace('#4', '')
    char_len = len(chn_char)
    i, j = 0, 0
    result = ['sil']
    while i < char_len:
        cur_char = chn_char[i]
        if is_zh(cur_char):
            if j >= len(pinyin):
                raise ValueError(
                    f'ran out of pinyin at character {i} ({cur_char!r}) '
                    f'of {chn_char!r}')
            syllable = pinyin[j]
            if syllable[:-1] in pinyin_dict:
                base = syllable[:-1]
                is_erhua = False
            else:
                # Erhua: "<base>r<tone>" covers this character and the
                # following '儿'.  Slices are used so malformed input is
                # reported below instead of raising IndexError.
                base = syllable[:-2]
                is_erhua = True
                if (chn_char[i + 1:i + 2] != '儿'
                        or syllable[-2:-1] != 'r'
                        or base not in pinyin_dict):
                    raise ValueError(
                        f'cannot align pinyin {syllable!r} with character '
                        f'{i} ({cur_char!r}) of {chn_char!r}')

            tone = syllable[-1]
            initial, final = pinyin_dict[base]
            result += [initial, final + tone]
            if is_erhua:
                result.append('er5')

            i += 2 if is_erhua else 1
            j += 1

            # No explicit prosody mark follows: insert the default '#0'.
            if i < char_len and chn_char[i] != '#':
                result.append('#0')
        elif cur_char == '#':
            # Copy the mark together with the character after it ('#1', ...).
            result.append(chn_char[i:i + 2])
            i += 2
        else:
            # ignore the unknown char and punctuation
            i += 1

    # A '#0' can be left dangling when only ignored characters follow the
    # last syllable; drop it before closing with silence.
    if result[-1] == '#0':
        result = result[:-1]
    result.append('sil')

    if j != len(pinyin):
        raise ValueError(
            f'{len(pinyin) - j} pinyin syllable(s) left unused for '
            f'{chn_char!r}')
    return result
518531
519532 def get_one_sample (self , item ):
@@ -545,7 +558,7 @@ def get_one_sample(self, item):
545558
546559 return sample
547560
def get_pinyin_parser(self):
    """Build the callable used to convert text to pinyin.

    Returns:
        The ``pinyin`` attribute of a ``Pinyin`` object constructed with a
        fresh ``MyConverter`` instance.
    """
    return Pinyin(MyConverter()).pinyin
@@ -554,12 +567,15 @@ def text_to_sequence(self, text, inference=False):
554567 global _symbol_to_id
555568
556569 if inference :
557- text = self .pinyin (text , style = Style .TONE3 )
558- new_text = []
559- for x in text :
560- new_text .append ('' .join (x ))
561- text = self .get_initials_and_finals (' ' .join (new_text ))
562- print (text )
570+ pinyin = self .pinyin_parser (text , style = Style .TONE3 )
571+ new_pinyin = []
572+ for x in pinyin :
573+ x = '' .join (x )
574+ if '#' not in x :
575+ new_pinyin .append (x )
576+ phonemes = self .get_phoneme_from_char_and_pinyin (text , new_pinyin )
577+ text = ' ' .join (phonemes )
578+ print (f'phoneme seq: { text } ' )
563579
564580 sequence = []
565581 for symbol in text .split ():
0 commit comments