diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index f98b0d61..288c38d3 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -24,6 +24,7 @@
 import os
 import re
 import inflect
+import copy
 try:
     import ttsfrd
     use_ttsfrd = True
@@ -173,7 +174,11 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_
                            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
-            model_input = self.spk2info[zero_shot_spk_id]
+            model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
+            if prompt_text:
+                prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+                model_input['prompt_text'] = prompt_text_token
+                model_input['prompt_text_len'] = prompt_text_token_len
         model_input['text'] = tts_text_token
         model_input['text_len'] = tts_text_token_len
         return model_input
@@ -181,16 +186,16 @@
     def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
         # in cross lingual mode, we remove prompt in llm
-        del model_input['prompt_text']
-        del model_input['prompt_text_len']
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('prompt_text', None)
+        model_input.pop('prompt_text_len', None)
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input
 
     def frontend_instruct(self, tts_text, spk_id, instruct_text):
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
-        del model_input['llm_embedding']
+        model_input.pop('llm_embedding', None)
         instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
@@ -198,8 +203,8 @@
 
     def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input
 
     def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
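
Why the two fixes matter, as a minimal sketch rather than the actual CosyVoice API (the `spk2info` dict and the `build_input_buggy` / `build_input_fixed` helpers below are hypothetical, simplified stand-ins for the saved-speaker cache and for `frontend_zero_shot` and the mode-specific frontends): returning the cached dict directly aliases it, so per-request writes and the mode-specific key removals corrupt the saved speaker for every later request; and a saved speaker entry may lack optional prompt keys, so `dict.pop(key, None)` degrades gracefully where a bare `del` raises `KeyError`.

```python
import copy

# Simplified stand-in for the frontend's saved-speaker cache (hypothetical
# shapes; the real values are tensors): spk id -> reusable model inputs.
# A saved entry may legitimately lack optional keys such as 'prompt_text'.
spk2info = {
    'my_spk': {'flow_embedding': [0.1, 0.2], 'llm_prompt_speech_token': [7, 8, 9]},
}

def build_input_buggy(tts_text, spk_id):
    """Pre-fix behavior: hands back the cached dict itself."""
    model_input = spk2info[spk_id]
    model_input['text'] = tts_text          # leaks into the shared cache
    return model_input

def build_input_fixed(tts_text, spk_id):
    """Post-fix behavior: mutate a private deep copy only."""
    model_input = copy.deepcopy(spk2info[spk_id])
    model_input['text'] = tts_text
    # pop(key, None) tolerates keys a saved speaker never had,
    # where a bare `del` would raise KeyError.
    model_input.pop('prompt_text', None)
    model_input.pop('llm_prompt_speech_token', None)
    return model_input

build_input_buggy('hello', 'my_spk')
assert 'text' in spk2info['my_spk']         # cache polluted by request state

spk2info['my_spk'].pop('text')              # undo for the comparison
build_input_fixed('hello', 'my_spk')
assert 'text' not in spk2info['my_spk']     # cache left untouched
assert 'llm_prompt_speech_token' in spk2info['my_spk']  # removal stayed local
```

A shallow copy (`dict(...)`) would stop the key-level corruption shown above, but the cached tensor values would still be shared with the returned dict; `copy.deepcopy` is the conservative choice in case downstream code mutates those values in place.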