diff --git a/requirements.txt b/requirements.txt index f908d8c..68e01b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -58,8 +58,10 @@ openai-whisper==20231117 onnxruntime==1.18.1 inflect==7.3.1 unidecode==1.3.8 -# NOTE: 这个似乎不需要 +# NOTE: this is required; otherwise the CosyVoice service cannot start # matcha-tts +matcha-tts==0.0.7.2 + # whisper faster_whisper==1.0.3 diff --git a/scripts/dl_cosyvoice.py b/scripts/dl_cosyvoice.py new file mode 100644 index 0000000..c123438 --- /dev/null +++ b/scripts/dl_cosyvoice.py @@ -0,0 +1,41 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class CosyVoiceDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "campplus.onnx", + "configuration.json", + "cosyvoice.yaml", + "flow.pt", + "hift.pt", + "llm.pt", + "speech_tokenizer_v2.onnx", + "CosyVoice-BlankEN/model.safetensors", + "CosyVoice-BlankEN/config.json", + "CosyVoice-BlankEN/generation_config.json", + "CosyVoice-BlankEN/merges.txt", + "CosyVoice-BlankEN/tokenizer_config.json", + "CosyVoice-BlankEN/vocab.json", + ] + super().__init__( + model_name="CosyVoice2-0.5B", + # modelscope_repo="iic/CosyVoice2-0.5B", + # NOTE: switched to this repo because the file names in the iic repo above do not match the ones on HF... 
+ modelscope_repo="aiwantaozi/CosyVoice2-0.5B", + huggingface_repo="FunAudioLLM/CosyVoice2-0.5B", + required_files=required_files, + just_download_required_files=True, + ) + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + CosyVoiceDownloader()(source=args.source) diff --git a/scripts/dl_cosyvoice_base.py b/scripts/dl_cosyvoice_base.py index 0f1ed2e..4561e1b 100644 --- a/scripts/dl_cosyvoice_base.py +++ b/scripts/dl_cosyvoice_base.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -class CosyVoiceInstructDownloader(BaseModelDownloader): +class CosyVoiceBaseDownloader(BaseModelDownloader): def __init__(self): required_files = [ "campplus.onnx", @@ -14,8 +14,7 @@ def __init__(self): "flow.pt", "hift.pt", "llm.pt", - "speech_tokenizer_v1.onnx", - "spk2info.pt", + "speech_tokenizer_v1.onnx" ] super().__init__( model_name="CosyVoice_300M", @@ -30,4 +29,4 @@ def __init__(self): from scripts.dl_args import parser_args args = parser_args() - CosyVoiceInstructDownloader()(source=args.source) + CosyVoiceBaseDownloader()(source=args.source) diff --git a/scripts/dl_f5_tts.py b/scripts/dl_f5_tts.py new file mode 100644 index 0000000..b53d4e7 --- /dev/null +++ b/scripts/dl_f5_tts.py @@ -0,0 +1,27 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class F5TTSDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "F5TTS_Base/model_1200000.safetensors", + ] + super().__init__( + model_name="F5-TTS", + modelscope_repo="AI-ModelScope/F5-TTS", + huggingface_repo="SWivid/F5-TTS", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + F5TTSDownloader()(source=args.source) diff --git a/scripts/dl_faster_whisper.py b/scripts/dl_faster_whisper.py new file mode 100644 index 0000000..948832a --- /dev/null +++ 
b/scripts/dl_faster_whisper.py @@ -0,0 +1,31 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class FasterWhisperDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "model.bin", + "tokenizer.json", + "vocabulary.json", + "preprocessor_config.json", + "config.json", + ] + super().__init__( + model_name="faster-whisper-large-v3", + modelscope_repo="keepitsimple/faster-whisper-large-v3", + huggingface_repo="Systran/faster-whisper-large-v3", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + FasterWhisperDownloader()(source=args.source) diff --git a/scripts/dl_fire_red_tts.py b/scripts/dl_fire_red_tts.py new file mode 100644 index 0000000..835be2d --- /dev/null +++ b/scripts/dl_fire_red_tts.py @@ -0,0 +1,29 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class FireRedTTSDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "gpt.pt", + "speaker.bin", + "token2wav.pt", + ] + super().__init__( + model_name="FireRedTTS", + modelscope_repo="pengzhendong/FireRedTTS", + huggingface_repo="fireredteam/FireRedTTS", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + FireRedTTSDownloader()(source=args.source) diff --git a/scripts/dl_fish_speech_1_2sft.py b/scripts/dl_fish_speech_1_2sft.py new file mode 100644 index 0000000..120c29f --- /dev/null +++ b/scripts/dl_fish_speech_1_2sft.py @@ -0,0 +1,32 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class FishSpeechDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "config.json", + "firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + "model.pth", 
+ "special_tokens_map.json", + "tokenizer.json", + "tokenizer_config.json", + ] + super().__init__( + model_name="fish-speech-1.2-sft", + modelscope_repo="fishaudio/fish-speech-1.2-sft", + huggingface_repo="fishaudio/fish-speech-1.2-sft", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + FishSpeechDownloader()(source=args.source) diff --git a/scripts/dl_fish_speech_1_4.py b/scripts/dl_fish_speech_1_4.py new file mode 100644 index 0000000..2232051 --- /dev/null +++ b/scripts/dl_fish_speech_1_4.py @@ -0,0 +1,32 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class FishSpeech14Downloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "config.json", + "firefly-gan-vq-fsq-8x1024-21hz-generator.pth", + "model.pth", + "special_tokens_map.json", + "tokenizer.json", + "tokenizer_config.json", + ] + super().__init__( + model_name="fish-speech-1_4", + modelscope_repo="AI-ModelScope/fish-speech-1.4", + huggingface_repo="fishaudio/fish-speech-1.4", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + FishSpeech14Downloader()(source=args.source) diff --git a/scripts/dl_index_tts.py b/scripts/dl_index_tts.py new file mode 100644 index 0000000..671bbf9 --- /dev/null +++ b/scripts/dl_index_tts.py @@ -0,0 +1,34 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class IndexTTSDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "bigvgan_discriminator.pth", + "bigvgan_generator.pth", + "bpe.model", + "config.yaml", + "dvae.pth", + "gpt.pth", + "unigram_12000.vocab", + ] + super().__init__( + model_name="Index-TTS", + modelscope_repo="IndexTeam/Index-TTS", + 
huggingface_repo="IndexTeam/Index-TTS", + required_files=required_files, + just_download_required_files=True, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + IndexTTSDownloader()(source=args.source) diff --git a/scripts/dl_open_voice.py b/scripts/dl_open_voice.py new file mode 100644 index 0000000..74ff73c --- /dev/null +++ b/scripts/dl_open_voice.py @@ -0,0 +1,28 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class OpenVoiceDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "converter/checkpoint.pth", + "converter/config.json", + ] + super().__init__( + model_name="OpenVoiceV2", + modelscope_repo="myshell-ai/OpenVoiceV2", + huggingface_repo="myshell-ai/OpenVoiceV2", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + OpenVoiceDownloader()(source=args.source) diff --git a/scripts/dl_vocos_mel_24khz.py b/scripts/dl_vocos_mel_24khz.py new file mode 100644 index 0000000..e055158 --- /dev/null +++ b/scripts/dl_vocos_mel_24khz.py @@ -0,0 +1,28 @@ +import logging + +from scripts.dl_base import BaseModelDownloader + +logger = logging.getLogger(__name__) + + +class VocosMel24khzDownloader(BaseModelDownloader): + def __init__(self): + required_files = [ + "config.yaml", + "pytorch_model.bin", + ] + super().__init__( + model_name="vocos-mel-24khz", + modelscope_repo="pengzhendong/vocos-mel-24khz", + huggingface_repo="charactr/vocos-mel-24khz", + required_files=required_files, + ) + + self.logger = logger + + +if __name__ == "__main__": + from scripts.dl_args import parser_args + + args = parser_args() + VocosMel24khzDownloader()(source=args.source) diff --git a/scripts/download_models.py b/scripts/download_models.py index 5e3fd8c..19aabe3 100644 --- a/scripts/download_models.py +++ 
b/scripts/download_models.py @@ -8,6 +8,18 @@ from scripts.dl_args import parser_args from scripts.dl_chattts import ChatTTSDownloader from scripts.dl_enhance import ResembleEnhanceDownloader +from scripts.dl_cosyvoice import CosyVoiceDownloader +from scripts.dl_cosyvoice_base import CosyVoiceBaseDownloader +from scripts.dl_cosyvoice_instruct import CosyVoiceInstructDownloader +from scripts.dl_f5_tts import F5TTSDownloader +from scripts.dl_faster_whisper import FasterWhisperDownloader +from scripts.dl_fire_red_tts import FireRedTTSDownloader +from scripts.dl_fish_speech_1_2sft import FishSpeechDownloader +from scripts.dl_fish_speech_1_4 import FishSpeech14Downloader +from scripts.dl_open_voice import OpenVoiceDownloader +from scripts.dl_vocos_mel_24khz import VocosMel24khzDownloader +from scripts.dl_index_tts import IndexTTSDownloader + from scripts.ModelDownloader import ModelDownloader @@ -17,6 +29,16 @@ def main(): downloaders: list[ModelDownloader] = [] downloaders.append(ChatTTSDownloader()) downloaders.append(ResembleEnhanceDownloader()) + downloaders.append(CosyVoiceBaseDownloader()) + downloaders.append(CosyVoiceInstructDownloader()) + downloaders.append(F5TTSDownloader()) + downloaders.append(FasterWhisperDownloader()) + downloaders.append(FireRedTTSDownloader()) + downloaders.append(FishSpeechDownloader()) + downloaders.append(FishSpeech14Downloader()) + downloaders.append(OpenVoiceDownloader()) + downloaders.append(VocosMel24khzDownloader()) + downloaders.append(IndexTTSDownloader()) for downloader in downloaders: downloader(source=args.source) diff --git a/scripts/downloader/fire_red_tts.py b/scripts/downloader/fire_red_tts.py index bbd550a..835be2d 100644 --- a/scripts/downloader/fire_red_tts.py +++ b/scripts/downloader/fire_red_tts.py @@ -8,9 +8,9 @@ class FireRedTTSDownloader(BaseModelDownloader): def __init__(self): required_files = [ - "fireredtts_gpt.pt", - "fireredtts_speaker.bin", - "fireredtts_token2wav.pt", + "gpt.pt", + "speaker.bin", + 
"token2wav.pt", ] super().__init__( model_name="FireRedTTS", diff --git a/scripts/downloader/index_tts.py b/scripts/downloader/index_tts.py index 1eedf99..fa95c7f 100644 --- a/scripts/downloader/index_tts.py +++ b/scripts/downloader/index_tts.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -class CosyVoice2Downloader(BaseModelDownloader): +class IndexTTSDownloader(BaseModelDownloader): def __init__(self): required_files = [ "bigvgan_discriminator.pth", @@ -33,4 +33,4 @@ def __init__(self): from scripts.dl_args import parser_args args = parser_args() - CosyVoice2Downloader()(source=args.source) + IndexTTSDownloader()(source=args.source)