Merge pull request #1901 from xinnan-tech/py_test_tts

openrz · web-flow · commit e53b24ef4745 · 2025-07-25T22:17:59.000+08:00
Py test tts
diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml
@@ -710,6 +710,26 @@ TTS:
     #     voice_id: female-shaonv
     #     weight: 1
     # language_boost: auto
+
+# MinimaxTTSHTTPStream和MinimaxTTSWebSocketStream还在测试，测试完再开放
+#
+#  MinimaxTTSHTTPStream:
+#  # Minimax流式语音合成服务
+#    type: minimax_httpstream
+#    output_dir: tmp/
+#    group_id: 你的minimax平台groupID
+#    api_key: 你的minimax平台接口密钥
+#    model: "speech-01-turbo"
+#    voice_id: "female-shaonv"
+#
+#  MinimaxTTSWebSocketStream:
+#    type: minimax_webSocket
+#    output_dir: tmp/
+#    group_id: 你的minimax平台groupID
+#    api_key: 你的minimax平台接口密钥
+#    model: "speech-01-turbo"
+#    voice_id: "female-shaonv"
+
   AliyunTTS:
     # 阿里云智能语音交互服务，需要先在阿里云平台开通服务，然后获取验证信息
     # 平台地址：https://nls-portal.console.aliyun.com/
diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py
@@ -132,6 +132,8 @@ def __init__(
 
         # tts相关变量
         self.sentence_id = None
+        # 处理TTS响应没有文本返回
+        self.tts_MessageText = ""
 
         # iot相关变量
         self.iot_descriptors = {}
@@ -182,8 +184,13 @@ async def handle_connection(self, ws):
                     await ws.send("端口正常，如需测试连接，请使用test_page.html")
                     await self.close(ws)
                     return
-            # 获取客户端ip地址
-            self.client_ip = ws.remote_address[0]
+            real_ip = self.headers.get("x-real-ip") or self.headers.get(
+                "x-forwarded-for"
+            )
+            if real_ip:
+                self.client_ip = real_ip.split(",")[0].strip()
+            else:
+                self.client_ip = ws.remote_address[0]
             self.logger.bind(tag=TAG).info(
                 f"{self.client_ip} conn - Headers: {self.headers}"
             )
@@ -335,7 +342,7 @@ def _initialize_components(self):
                 self.config.get("selected_module", {})
             )
             self.logger = create_connection_logger(self.selected_module_str)
-            
+
             """初始化组件"""
             if self.config.get("prompt") is not None:
                 user_prompt = self.config["prompt"]
@@ -351,10 +358,10 @@ def _initialize_components(self):
                 self.vad = self._vad
             if self.asr is None:
                 self.asr = self._initialize_asr()
-            
+
             # 初始化声纹识别
             self._initialize_voiceprint()
-            
+
             # 打开语音识别通道
             asyncio.run_coroutine_threadsafe(
                 self.asr.open_audio_channels(self), self.loop
@@ -790,9 +797,9 @@ def chat(self, query, tool_call=False, depth=0):
             if not bHasError:
                 # 如需要大模型先处理一轮，添加相关处理后的日志情况
                 if len(response_message) > 0:
-                    self.dialogue.put(
-                        Message(role="assistant", content="".join(response_message))
-                    )
+                    text_buff = "".join(response_message)
+                    self.tts_MessageText = text_buff
+                    self.dialogue.put(Message(role="assistant", content=text_buff))
                 response_message.clear()
                 self.logger.bind(tag=TAG).debug(
                     f"function_name={function_name}, function_id={function_id}, function_arguments={function_arguments}"
@@ -814,9 +821,9 @@ def chat(self, query, tool_call=False, depth=0):
 
         # 存储对话内容
         if len(response_message) > 0:
-            self.dialogue.put(
-                Message(role="assistant", content="".join(response_message))
-            )
+            text_buff = "".join(response_message)
+            self.tts_MessageText = text_buff
+            self.dialogue.put(Message(role="assistant", content=text_buff))
         if depth == 0:
             self.tts.tts_text_queue.put(
                 TTSMessageDTO(
@@ -893,9 +900,7 @@ def _report_worker(self):
                     if self.executor is None:
                         continue
                     # 提交任务到线程池
-                    self.executor.submit(
-                        self._process_report, *item
-                    )
+                    self.executor.submit(self._process_report, *item)
                 except Exception as e:
                     self.logger.bind(tag=TAG).error(f"聊天记录上报线程异常: {e}")
             except queue.Empty:
diff --git a/main/xiaozhi-server/core/handle/sendAudioHandle.py b/main/xiaozhi-server/core/handle/sendAudioHandle.py
@@ -12,7 +12,7 @@ async def sendAudioMessage(conn, sentenceType, audios, text):
     conn.logger.bind(tag=TAG).info(f"发送音频消息: {sentenceType}, {text}")
 
     pre_buffer = False
-    if conn.tts.tts_audio_first_sentence and text is not None:
+    if conn.tts.tts_audio_first_sentence:
         conn.logger.bind(tag=TAG).info(f"发送第一段语音: {text}")
         conn.tts.tts_audio_first_sentence = False
         pre_buffer = True
@@ -73,7 +73,7 @@ async def send_tts_message(conn, state, text=None):
     """发送 TTS 状态消息"""
     message = {"type": "tts", "state": state, "session_id": conn.session_id}
     if text is not None:
-        message["text"] = text
+        message["text"] = textUtils.check_emoji(text)
 
     # TTS播放结束
     if state == "stop":
diff --git a/main/xiaozhi-server/core/providers/tts/aliyun_stream.py b/main/xiaozhi-server/core/providers/tts/aliyun_stream.py
@@ -127,8 +127,6 @@ def __init__(self, config, delete_audio_file):
 
         # 专属tts设置
         self.message_id = ""
-        self.tts_text = ""
-        self.text_buffer = []
 
         # 创建Opus编码器
         self.opus_encoder = opus_encoder_utils.OpusEncoderUtils(
@@ -229,7 +227,6 @@ def tts_text_priority_thread(self):
 
                         # aliyunStream独有的参数生成
                         self.message_id = str(uuid.uuid4().hex)
-                        self.text_buffer = []
 
                         logger.bind(tag=TAG).info("开始启动TTS会话...")
                         future = asyncio.run_coroutine_threadsafe(
@@ -250,7 +247,6 @@ def tts_text_priority_thread(self):
                             logger.bind(tag=TAG).debug(
                                 f"开始发送TTS文本: {message.content_detail}"
                             )
-                            self.text_buffer.append(message.content_detail)
                             future = asyncio.run_coroutine_threadsafe(
                                 self.text_to_speak(message.content_detail, None),
                                 loop=self.conn.loop,
@@ -275,9 +271,6 @@ def tts_text_priority_thread(self):
                 if message.sentence_type == SentenceType.LAST:
                     try:
                         logger.bind(tag=TAG).info("开始结束TTS会话...")
-                        self.tts_text = textUtils.get_string_no_punctuation_or_emoji(
-                            "".join(self.text_buffer).replace("\n", "")
-                        )
                         future = asyncio.run_coroutine_threadsafe(
                             self.finish_session(self.conn.sentence_id),
                             loop=self.conn.loop,
@@ -444,34 +437,35 @@ async def _start_monitor_tts_response(self):
                             event_name = header.get("name")
                             if event_name == "SynthesisStarted":
                                 logger.bind(tag=TAG).debug("TTS合成已启动")
-                            elif event_name == "SentenceBegin":
-                                logger.bind(tag=TAG).debug(
-                                    f"句子语音生成开始: {self.tts_text}"
-                                )
-                                opus_datas_cache = []
                                 self.tts_audio_queue.put(
-                                    (SentenceType.FIRST, [], self.tts_text)
+                                    (SentenceType.FIRST, [], None)
                                 )
+                            elif event_name == "SentenceBegin":
+                                opus_datas_cache = []
                             elif event_name == "SentenceEnd":
-                                logger.bind(tag=TAG).info(
-                                    f"句子语音生成成功： {self.tts_text}"
-                                )
                                 if (
                                     not is_first_sentence
                                     or first_sentence_segment_count > 10
                                 ):
                                     # 发送缓存的数据
-                                    self.tts_audio_queue.put(
-                                        (SentenceType.MIDDLE, opus_datas_cache, None)
-                                    )
+                                    if self.conn.tts_MessageText:
+                                        logger.bind(tag=TAG).info(
+                                            f"句子语音生成成功： {self.conn.tts_MessageText}"
+                                        )
+                                        self.tts_audio_queue.put(
+                                            (SentenceType.MIDDLE, opus_datas_cache, self.conn.tts_MessageText)
+                                        )
+                                        self.conn.tts_MessageText = None
+                                    else:
+                                        self.tts_audio_queue.put(
+                                            (SentenceType.MIDDLE, opus_datas_cache, None)
+                                        )
                                 # 第一句话结束后，将标志设置为False
                                 is_first_sentence = False
                             elif event_name == "SynthesisCompleted":
                                 logger.bind(tag=TAG).debug(f"会话结束～～")
                                 self._process_before_stop_play_files()
                                 session_finished = True
-                                self.reuse_judgment = time.time()
-                                self.tts_text = ""
                                 break
                         except json.JSONDecodeError:
                             logger.bind(tag=TAG).warning("收到无效的JSON消息")
diff --git a/main/xiaozhi-server/core/providers/tts/huoshan_double_stream.py b/main/xiaozhi-server/core/providers/tts/huoshan_double_stream.py
@@ -232,7 +232,6 @@ def tts_text_priority_thread(self):
                             loop=self.conn.loop,
                         )
                         future.result()
-                        self.tts_audio_first_sentence = True
                         self.before_stop_play_files.clear()
                         logger.bind(tag=TAG).info("TTS会话启动成功")
                     except Exception as e:
diff --git a/main/xiaozhi-server/core/providers/tts/linkerai.py b/main/xiaozhi-server/core/providers/tts/linkerai.py
@@ -52,7 +52,6 @@ def tts_text_priority_thread(self):
                     self.processed_chars = 0
                     self.tts_text_buff = []
                     self.segment_count = 0
-                    self.tts_audio_first_sentence = True
                     self.before_stop_play_files.clear()
                 elif ContentType.TEXT == message.content_type:
                     self.tts_text_buff.append(message.content_detail)
diff --git a/main/xiaozhi-server/core/providers/tts/minimax_httpstream.py b/main/xiaozhi-server/core/providers/tts/minimax_httpstream.py
diff --git a/main/xiaozhi-server/core/providers/tts/minimax_webSocket.py b/main/xiaozhi-server/core/providers/tts/minimax_webSocket.py
diff --git a/main/xiaozhi-server/core/utils/textUtils.py b/main/xiaozhi-server/core/utils/textUtils.py

Original file line number	Diff line number	Diff line change
`@@ -232,7 +232,6 @@ def tts_text_priority_thread(self):`
`232`	`232`	`loop=self.conn.loop,`
`233`	`233`	`)`
`234`	`234`	`future.result()`
`235`		`- self.tts_audio_first_sentence = True`
`236`	`235`	`self.before_stop_play_files.clear()`
`237`	`236`	`logger.bind(tag=TAG).info("TTS会话启动成功")`
`238`	`237`	`except Exception as e:`