FunAudioLLM · Clabiyau · Jul 23, 2025 · Jul 23, 2025 · Jul 24, 2025
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
@@ -24,6 +24,7 @@
 import os
 import re
 import inflect
+import copy
 try:
     import ttsfrd
     use_ttsfrd = True
@@ -173,33 +174,38 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_
                            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
-            model_input = self.spk2info[zero_shot_spk_id]
+            # private copy: the per-call keys set below must not leak into the cached speaker entry
+            # NOTE(review): a shallow copy would be cheaper if nothing downstream mutates the stored values in place - confirm
+            model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
+            if prompt_text:
+                # a non-empty prompt_text replaces any prompt text tokens carried by the copied entry
+                text_token, text_token_len = self._extract_text_token(prompt_text)
+                model_input.update(prompt_text=text_token, prompt_text_len=text_token_len)
         model_input['text'] = tts_text_token
         model_input['text_len'] = tts_text_token_len
         return model_input
 
     def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
         # in cross lingual mode, we remove prompt in llm
-        del model_input['prompt_text']
-        del model_input['prompt_text_len']
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        # pop with a default so a key that is already absent is skipped instead of raising KeyError
+        for key in ('prompt_text', 'prompt_text_len', 'llm_prompt_speech_token', 'llm_prompt_speech_token_len'):
+            model_input.pop(key, None)
         return model_input
 
     def frontend_instruct(self, tts_text, spk_id, instruct_text):
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
-        del model_input['llm_embedding']
+        model_input.pop('llm_embedding', None)
         instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input
 
     def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        for key in ('llm_prompt_speech_token', 'llm_prompt_speech_token_len'):
+            model_input.pop(key, None)
         return model_input
 
     def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):