[voice agent] Fix RTVI missing bot message (#15068)

stevehuang52 · web-flow · commit 26874a5baaaa · 2025-11-13T18:44:15.000-05:00
* fix RTVI missing bot message, fix diar not passing VAD frames

Signed-off-by: stevehuang52 <heh@nvidia.com>

* revert change to diar

Signed-off-by: stevehuang52 <heh@nvidia.com>

---------

Signed-off-by: stevehuang52 <heh@nvidia.com>
diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py
@@ -20,9 +20,8 @@
 import sys
 
 from loguru import logger
-from omegaconf import OmegaConf
 
-from pipecat.audio.vad.silero import SileroVADAnalyzer, VADParams
+from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.frames.frames import EndTaskFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
@@ -112,7 +111,10 @@ def signal_handler(signum, frame):
     shutdown_event.set()
 
 
-async def run_bot_websocket_server():
+async def run_bot_websocket_server(host: str = "0.0.0.0", port: int = 8765):
+    logger.info(f"Starting websocket server on {host}:{port}")
+    logger.info(f"Server configured to run indefinitely with no timeouts, use Ctrl+C to quit.")
+
     # Set up signal handlers for graceful shutdown
     signal.signal(signal.SIGINT, signal_handler)
     signal.signal(signal.SIGTERM, signal_handler)
@@ -147,8 +149,8 @@ async def run_bot_websocket_server():
             is None,  # if backchannel phrases are disabled, we can use VAD to interrupt the bot immediately
             audio_out_10ms_chunks=TRANSPORT_AUDIO_OUT_10MS_CHUNKS,
         ),
-        host="0.0.0.0",  # Bind to all interfaces
-        port=8765,
+        host=host,
+        port=port,
     )
 
     logger.info("Initializing STT service...")
@@ -279,7 +281,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
 
     pipeline = Pipeline(pipeline)
 
-    rtvi_text_aggregator = SimpleSegmentedTextAggregator("\n?!.", min_sentence_length=5)
+    rtvi_text_aggregator = SimpleSegmentedTextAggregator(punctuation_marks=".!?\n")
     task = PipelineTask(
         pipeline,
         params=PipelineParams(
diff --git a/examples/voice_agent/server/server_configs/default.yaml b/examples/voice_agent/server/server_configs/default.yaml
@@ -15,8 +15,8 @@ vad:
 
 stt:
   type: nemo # choices in ['nemo'] currently only NeMo is supported
-  model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
-  # model: "nvidia/parakeet_realtime_eou_120m-v1"
+  # model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
+  model: "nvidia/parakeet_realtime_eou_120m-v1"
   model_config: "./server_configs/stt_configs/nemo_cache_aware_streaming.yaml"
   device: "cuda"
 
diff --git a/nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py b/nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py
@@ -101,7 +101,7 @@ def find_last_period_index(text: str) -> int:
 class SimpleSegmentedTextAggregator(SimpleTextAggregator):
     def __init__(
         self,
-        punctuation_marks: str | list[str] = ".,!?;:",
+        punctuation_marks: str | list[str] = ".,!?;:\n",
         ignore_marks: str | list[str] = "*",
         min_sentence_length: int = 0,
         use_legacy_eos_detection: bool = False,
@@ -130,9 +130,8 @@ def __init__(
             )
             if "." in punctuation_marks:
                 punctuation_marks.remove(".")
-            punctuation_marks += [
-                "."
-            ]  # put period at the end of the list to ensure it's the last punctuation mark to be matched
+            # put period at the end of the list to ensure it's the last punctuation mark to be matched
+            punctuation_marks += ["."]
             self._punctuation_marks = punctuation_marks
 
     def _find_segment_end(self, text: str) -> Optional[int]:
@@ -144,7 +143,12 @@ def _find_segment_end(self, text: str) -> Optional[int]:
         Returns:
             The index of the end of the segment, or None if the text is too short.
         """
-        if len(text.strip()) < self._min_sentence_length:
+        # drop leading whitespace but keep trailing whitespace to
+        # allow "\n" to trigger the end of the sentence
+        text_len = len(text)
+        text = text.lstrip()
+        offset = text_len - len(text)
+        if len(text) < self._min_sentence_length:
             return None
 
         for punc in self._punctuation_marks:
@@ -153,12 +157,12 @@ def _find_segment_end(self, text: str) -> Optional[int]:
             else:
                 idx = text.find(punc)
             if idx != -1:
-                return idx + 1
+                # add the offset to the index to account for the leading whitespace
+                return idx + 1 + offset
         return None
 
     async def aggregate(self, text: str) -> Optional[str]:
         result: Optional[str] = None
-
         self._text += str(text)
 
         for ignore_mark in self._ignore_marks:
@@ -174,10 +178,12 @@ async def aggregate(self, text: str) -> Optional[str]:
         if eos_end_index:
             result = self._text[:eos_end_index]
             if len(result.strip()) < self._min_sentence_length:
+                logger.debug(
+                    f"Text is too short, skipping: `{result}`, full text: `{self._text}`, input text: `{text}`"
+                )
                 result = None
-                logger.debug(f"Text is too short, skipping: `{result}`, full text: `{self._text}`")
             else:
-                logger.debug(f"Text Aggregator Result: `{result}`, full text: `{self._text}`")
+                logger.debug(f"Text Aggregator Result: `{result}`, full text: `{self._text}`, input text: `{text}`")
                 self._text = self._text[eos_end_index:]
 
         return result