@@ -30,6 +30,8 @@ def __init__(self, lexion_filename: str, tokens_filename: str):
3030 tones = [int (t ) for t in tones ]
3131
3232 lexicon [word_or_phrase ] = (phones , tones )
33+ lexicon ["呣" ] = lexicon ["母" ]
34+ lexicon ["嗯" ] = lexicon ["恩" ]
3335 self .lexicon = lexicon
3436
3537 punctuation = ["!" , "?" , "…" , "," , "." , "'" , "-" ]
@@ -98,20 +100,16 @@ def __init__(self, filename):
98100 self .lang_id = int (meta ["lang_id" ])
99101 self .sample_rate = int (meta ["sample_rate" ])
100102
101- def __call__ (self , x , tones , lang ):
103+ def __call__ (self , x , tones ):
102104 """
103105 Args:
104106 x: 1-D int64 torch tensor
105107 tones: 1-D int64 torch tensor
106- lang: 1-D int64 torch tensor
107108 """
108109 x = x .unsqueeze (0 )
109110 tones = tones .unsqueeze (0 )
110- lang = lang .unsqueeze (0 )
111111
112- print (x .shape , tones .shape , lang .shape )
113- bert = torch .zeros (1 , self .bert_dim , x .shape [- 1 ])
114- ja_bert = torch .zeros (1 , self .ja_bert_dim , x .shape [- 1 ])
112+ print (x .shape , tones .shape )
115113 sid = torch .tensor ([self .speaker_id ], dtype = torch .int64 )
116114 noise_scale = torch .tensor ([0.6 ], dtype = torch .float32 )
117115 length_scale = torch .tensor ([1.0 ], dtype = torch .float32 )
@@ -125,9 +123,6 @@ def __call__(self, x, tones, lang):
125123 "x" : x .numpy (),
126124 "x_lengths" : x_lengths .numpy (),
127125 "tones" : tones .numpy (),
128- "lang_id" : lang .numpy (),
129- "bert" : bert .numpy (),
130- "ja_bert" : ja_bert .numpy (),
131126 "sid" : sid .numpy (),
132127 "noise_scale" : noise_scale .numpy (),
133128 "noise_scale_w" : noise_scale_w .numpy (),
@@ -140,34 +135,46 @@ def __call__(self, x, tones, lang):
def main():
    """Synthesize a mixed Chinese/English test utterance and save it as a wav.

    Steps:
      1. Load the lexicon and token table from the current directory.
      2. Convert three text segments (Chinese, English, Chinese) into phone
         and tone id sequences and concatenate them.
      3. If the model requests it (``model.add_blank``), interleave a 0
         between (and around) all ids.
      4. Run the ONNX model and write the result to ``./test.wav`` using
         the sample rate reported by the model.
    """
    lexicon = Lexicon(lexion_filename="./lexicon.txt", tokens_filename="./tokens.txt")

    # Segment 1: Chinese, word-segmented by jieba.
    zh_words = jieba.cut("永远相信,美好的事情即将发生。", HMM=True)
    zh_phones, zh_tones = lexicon.convert(zh_words)

    # Segment 2: English, pre-split on whitespace.
    en_words = "how are you ?".split()
    en_phones, en_tones = lexicon.convert(en_words)

    # Accumulate into fresh lists so that whatever lexicon.convert() returned
    # is never mutated in place.
    # NOTE(review): a single 0 id is inserted only at the Chinese -> English
    # boundary (presumably a blank/pause token -- confirm against tokens.txt).
    phones = [*zh_phones, 0, *en_phones]
    tones = [*zh_tones, 0, *en_tones]

    # Segment 3: Chinese text containing polyphonic characters.
    zh_words = jieba.cut("多音字测试, 银行,行不行?长沙长大", HMM=True)
    zh_phones, zh_tones = lexicon.convert(zh_words)
    phones.extend(zh_phones)
    tones.extend(zh_tones)

    model = OnnxModel("./model.onnx")

    if model.add_blank:
        # Place the real ids at the odd positions of a zero-filled list of
        # length 2 * n + 1, i.e. 0, id0, 0, id1, ..., 0.
        blanked_phones = [0] * (2 * len(phones) + 1)
        blanked_tones = [0] * (2 * len(tones) + 1)
        blanked_phones[1::2] = phones
        blanked_tones[1::2] = tones
        phones, tones = blanked_phones, blanked_tones

    phones = torch.tensor(phones, dtype=torch.int64)
    tones = torch.tensor(tones, dtype=torch.int64)

    print(phones.shape, tones.shape)

    y = model(x=phones, tones=tones)
    sf.write("./test.wav", y, model.sample_rate)
172179
173180
0 commit comments