Skip to content

Commit 80c219b

Browse files
authored
Merge pull request #2203 from yt605155624/mix_cli
[TTS]add mix tts cli
2 parents 7550ee2 + 7bd1b42 commit 80c219b

File tree

3 files changed

+42
-16
lines changed

3 files changed

+42
-16
lines changed

paddlespeech/cli/tts/infer.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@
2929
from ..executor import BaseExecutor
3030
from ..log import logger
3131
from ..utils import stats_wrapper
32-
from paddlespeech.t2s.frontend import English
33-
from paddlespeech.t2s.frontend.zh_frontend import Frontend
32+
from paddlespeech.t2s.exps.syn_utils import get_frontend
3433
from paddlespeech.t2s.modules.normalizer import ZScore
3534

3635
__all__ = ['TTSExecutor']
@@ -54,6 +53,7 @@ def __init__(self):
5453
'fastspeech2_ljspeech',
5554
'fastspeech2_aishell3',
5655
'fastspeech2_vctk',
56+
'fastspeech2_mix',
5757
'tacotron2_csmsc',
5858
'tacotron2_ljspeech',
5959
],
@@ -98,7 +98,7 @@ def __init__(self):
9898
self.parser.add_argument(
9999
'--voc',
100100
type=str,
101-
default='pwgan_csmsc',
101+
default='hifigan_csmsc',
102102
choices=[
103103
'pwgan_csmsc',
104104
'pwgan_ljspeech',
@@ -135,7 +135,7 @@ def __init__(self):
135135
'--lang',
136136
type=str,
137137
default='zh',
138-
help='Choose model language. zh or en')
138+
help='Choose model language. zh or en or mix')
139139
self.parser.add_argument(
140140
'--device',
141141
type=str,
@@ -231,8 +231,11 @@ def _init_from_path(
231231
use_pretrained_voc = True
232232
else:
233233
use_pretrained_voc = False
234-
235-
voc_tag = voc + '-' + lang
234+
voc_lang = lang
235+
# the mix acoustic model must use ljspeech's vocoder for now, so look the vocoder up under 'en'
236+
if lang == 'mix':
237+
voc_lang = 'en'
238+
voc_tag = voc + '-' + voc_lang
236239
self.task_resource.set_task_model(
237240
model_tag=voc_tag,
238241
model_type=1, # vocoder
@@ -281,13 +284,8 @@ def _init_from_path(
281284
spk_num = len(spk_id)
282285

283286
# frontend
284-
if lang == 'zh':
285-
self.frontend = Frontend(
286-
phone_vocab_path=self.phones_dict,
287-
tone_vocab_path=self.tones_dict)
288-
289-
elif lang == 'en':
290-
self.frontend = English(phone_vocab_path=self.phones_dict)
287+
self.frontend = get_frontend(
288+
lang=lang, phones_dict=self.phones_dict, tones_dict=self.tones_dict)
291289

292290
# acoustic model
293291
odim = self.am_config.n_mels
@@ -381,8 +379,12 @@ def infer(self,
381379
input_ids = self.frontend.get_input_ids(
382380
text, merge_sentences=merge_sentences)
383381
phone_ids = input_ids["phone_ids"]
382+
elif lang == 'mix':
383+
input_ids = self.frontend.get_input_ids(
384+
text, merge_sentences=merge_sentences)
385+
phone_ids = input_ids["phone_ids"]
384386
else:
385-
logger.error("lang should in {'zh', 'en'}!")
387+
logger.error("lang should in {'zh', 'en', 'mix'}!")
386388
self.frontend_time = time.time() - frontend_st
387389

388390
self.am_time = 0
@@ -398,7 +400,7 @@ def infer(self,
398400
# fastspeech2
399401
else:
400402
# multi speaker
401-
if am_dataset in {"aishell3", "vctk"}:
403+
if am_dataset in {'aishell3', 'vctk', 'mix'}:
402404
mel = self.am_inference(
403405
part_phone_ids, spk_id=paddle.to_tensor(spk_id))
404406
else:

paddlespeech/resource/pretrained_models.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,24 @@
655655
'phone_id_map.txt',
656656
},
657657
},
658+
"fastspeech2_mix-mix": {
659+
'1.0': {
660+
'url':
661+
'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip',
662+
'md5':
663+
'77d9d4b5a79ed6203339ead7ef6c74f9',
664+
'config':
665+
'default.yaml',
666+
'ckpt':
667+
'snapshot_iter_94000.pdz',
668+
'speech_stats':
669+
'speech_stats.npy',
670+
'phones_dict':
671+
'phone_id_map.txt',
672+
'speaker_dict':
673+
'speaker_id_map.txt',
674+
},
675+
},
658676
# tacotron2
659677
"tacotron2_csmsc-zh": {
660678
'1.0': {

tests/unit/cli/test_cli.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!
4343
paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
4444
paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
4545
paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
46-
paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
46+
paddlespeech tts --voc pwgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
4747
paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
4848
paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
4949
paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
@@ -53,6 +53,12 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like
5353
paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
5454
paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
5555
paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
56+
# mix tts
57+
# The `am` must be `fastspeech2_mix`!
58+
# The `lang` must be `mix`!
59+
# The `voc` must be `hifigan_ljspeech` or `pwgan_ljspeech` for `fastspeech2_mix` now!
60+
paddlespeech tts --am fastspeech2_mix --voc hifigan_ljspeech --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --spk_id 0 --output mix_spk0.wav
61+
paddlespeech tts --am fastspeech2_mix --voc pwgan_ljspeech --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 1 --output mix_spk1.wav
5662

5763
# Speech Translation (only support linux)
5864
paddlespeech st --input ./en.wav

0 commit comments

Comments
 (0)