diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index f98b0d61..288c38d3 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -24,6 +24,7 @@
 import os
 import re
 import inflect
+import copy
 try:
     import ttsfrd
     use_ttsfrd = True
@@ -173,7 +174,11 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_
                            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
-            model_input = self.spk2info[zero_shot_spk_id]
+            model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
+            if prompt_text:
+                prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+                model_input['prompt_text'] = prompt_text_token
+                model_input['prompt_text_len'] = prompt_text_token_len
         model_input['text'] = tts_text_token
         model_input['text_len'] = tts_text_token_len
         return model_input
@@ -181,16 +186,16 @@
     def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
         # in cross lingual mode, we remove prompt in llm
-        del model_input['prompt_text']
-        del model_input['prompt_text_len']
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('prompt_text', None)
+        model_input.pop('prompt_text_len', None)
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input
 
     def frontend_instruct(self, tts_text, spk_id, instruct_text):
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
-        del model_input['llm_embedding']
+        model_input.pop('llm_embedding', None)
         instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
@@ -198,8 +203,8 @@
 
     def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input
 
     def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
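
Why the two fixes matter, as a minimal sketch rather than the actual CosyVoice API (the `spk2info` dict and the `build_input_buggy` / `build_input_fixed` helpers below are hypothetical, simplified stand-ins for the saved-speaker cache and for `frontend_zero_shot` and the mode-specific frontends): returning the cached dict directly aliases it, so per-request writes and the mode-specific key removals corrupt the saved speaker for every later request; and a saved speaker entry may lack optional prompt keys, so `dict.pop(key, None)` degrades gracefully where a bare `del` raises `KeyError`.

```python
import copy

# Simplified stand-in for the frontend's saved-speaker cache (hypothetical
# shapes; the real values are tensors): spk id -> reusable model inputs.
# A saved entry may legitimately lack optional keys such as 'prompt_text'.
spk2info = {
    'my_spk': {'flow_embedding': [0.1, 0.2], 'llm_prompt_speech_token': [7, 8, 9]},
}

def build_input_buggy(tts_text, spk_id):
    """Pre-fix behavior: hands back the cached dict itself."""
    model_input = spk2info[spk_id]
    model_input['text'] = tts_text          # leaks into the shared cache
    return model_input

def build_input_fixed(tts_text, spk_id):
    """Post-fix behavior: mutate a private deep copy only."""
    model_input = copy.deepcopy(spk2info[spk_id])
    model_input['text'] = tts_text
    # pop(key, None) tolerates keys a saved speaker never had,
    # where a bare `del` would raise KeyError.
    model_input.pop('prompt_text', None)
    model_input.pop('llm_prompt_speech_token', None)
    return model_input

build_input_buggy('hello', 'my_spk')
assert 'text' in spk2info['my_spk']         # cache polluted by request state

spk2info['my_spk'].pop('text')              # undo for the comparison
build_input_fixed('hello', 'my_spk')
assert 'text' not in spk2info['my_spk']     # cache left untouched
assert 'llm_prompt_speech_token' in spk2info['my_spk']  # removal stayed local
```

A shallow copy (`dict(...)`) would stop the key-level corruption shown above, but the cached tensor values would still be shared with the returned dict; `copy.deepcopy` is the conservative choice in case downstream code mutates those values in place.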