Merge pull request #2203 from yt605155624/mix_cli

yt605155624 · web-flow · commit 80c219b774e5 · 2022-07-29T16:20:34.000+08:00
[TTS]add mix tts cli
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
@@ -29,8 +29,7 @@
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddlespeech.t2s.frontend import English
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 
 __all__ = ['TTSExecutor']
@@ -54,6 +53,7 @@ def __init__(self):
                 'fastspeech2_ljspeech',
                 'fastspeech2_aishell3',
                 'fastspeech2_vctk',
+                'fastspeech2_mix',
                 'tacotron2_csmsc',
                 'tacotron2_ljspeech',
             ],
@@ -98,7 +98,7 @@ def __init__(self):
         self.parser.add_argument(
             '--voc',
             type=str,
-            default='pwgan_csmsc',
+            default='hifigan_csmsc',
             choices=[
                 'pwgan_csmsc',
                 'pwgan_ljspeech',
@@ -135,7 +135,7 @@ def __init__(self):
             '--lang',
             type=str,
             default='zh',
-            help='Choose model language. zh or en')
+            help='Choose model language. zh or en or mix')
         self.parser.add_argument(
             '--device',
             type=str,
@@ -231,8 +231,11 @@ def _init_from_path(
             use_pretrained_voc = True
         else:
             use_pretrained_voc = False
-
-        voc_tag = voc + '-' + lang
+        voc_lang = lang
+        # the mix am only works with ljspeech's vocoders for now, so look up the 'en' voc tag
+        if lang == 'mix':
+            voc_lang = 'en'
+        voc_tag = voc + '-' + voc_lang
         self.task_resource.set_task_model(
             model_tag=voc_tag,
             model_type=1,  # vocoder
@@ -281,13 +284,8 @@ def _init_from_path(
             spk_num = len(spk_id)
 
         # frontend
-        if lang == 'zh':
-            self.frontend = Frontend(
-                phone_vocab_path=self.phones_dict,
-                tone_vocab_path=self.tones_dict)
-
-        elif lang == 'en':
-            self.frontend = English(phone_vocab_path=self.phones_dict)
+        self.frontend = get_frontend(
+            lang=lang, phones_dict=self.phones_dict, tones_dict=self.tones_dict)
 
         # acoustic model
         odim = self.am_config.n_mels
@@ -381,8 +379,12 @@ def infer(self,
             input_ids = self.frontend.get_input_ids(
                 text, merge_sentences=merge_sentences)
             phone_ids = input_ids["phone_ids"]
+        elif lang == 'mix':
+            input_ids = self.frontend.get_input_ids(
+                text, merge_sentences=merge_sentences)
+            phone_ids = input_ids["phone_ids"]
         else:
-            logger.error("lang should in {'zh', 'en'}!")
+            logger.error("lang should in {'zh', 'en', 'mix'}!")
         self.frontend_time = time.time() - frontend_st
 
         self.am_time = 0
@@ -398,7 +400,7 @@ def infer(self,
             # fastspeech2
             else:
                 # multi speaker
-                if am_dataset in {"aishell3", "vctk"}:
+                if am_dataset in {'aishell3', 'vctk', 'mix'}:
                     mel = self.am_inference(
                         part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                 else:
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
@@ -655,6 +655,24 @@
             'phone_id_map.txt',
         },
     },
+    "fastspeech2_mix-mix": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip',
+            'md5':
+            '77d9d4b5a79ed6203339ead7ef6c74f9',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_94000.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+        },
+    },
     # tacotron2
     "tacotron2_csmsc-zh": {
         '1.0': {
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
@@ -43,7 +43,7 @@ paddlespeech tts --input "你好，欢迎使用百度飞桨深度学习框架！
 paddlespeech tts --am speedyspeech_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --voc mb_melgan_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --voc style_melgan_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
-paddlespeech tts --voc hifigan_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
+paddlespeech tts --voc pwgan_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好，欢迎使用百度飞桨深度学习框架！" --spk_id 0
 paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好，欢迎使用百度飞桨深度学习框架！" --spk_id 0
 paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
@@ -53,6 +53,12 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like
 paddlespeech tts --am tacotron2_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+# mix tts
+# The `am` must be `fastspeech2_mix`!
+# The `lang` must be `mix`!
+# The `voc` must be `hifigan_ljspeech` or `pwgan_ljspeech` for `fastspeech2_mix` now!
+paddlespeech tts --am fastspeech2_mix --voc hifigan_ljspeech --lang mix  --input "热烈欢迎您在 Discussions 中提交问题，并在 Issues 中指出发现的 bug。此外，我们非常希望您参与到 Paddle Speech 的开发中！" --spk_id 0  --output mix_spk0.wav
+paddlespeech tts --am fastspeech2_mix --voc pwgan_ljspeech --lang mix  --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 1  --output mix_spk1.wav
 
 # Speech Translation (only support linux)
 paddlespeech st --input ./en.wav