
Commit 208cf2d

[voice agent] Improve tool calling and logging ux (NVIDIA-NeMo#15269)
* refactor
* update tts
* improve ux
* fix linting
* refactor tts tool
* Moving voice agent tests from example to test folder
* Apply isort and black reformatting
* Moving the temporary test file to example folder

Signed-off-by: stevehuang52 <[email protected]>
Signed-off-by: taejinp <[email protected]>
Signed-off-by: tango4j <[email protected]>
Co-authored-by: taejinp <[email protected]>
Co-authored-by: tango4j <[email protected]>
1 parent ab5fabc commit 208cf2d

File tree

14 files changed: +201 / −51 lines changed


examples/voice_agent/README.md

Lines changed: 7 additions & 4 deletions
```diff
@@ -11,7 +11,6 @@ As of now, we only support English input and output, but more languages will be
 - [🚀 Quick Start](#-quick-start)
 - [📑 Supported Models and Features](#-supported-models-and-features)
 - [🤖 LLM](#-llm)
-  - [Thinking/reasoning Mode for LLMs](#thinkingreasoning-mode-for-llms)
 - [🎤 ASR](#-asr)
 - [💬 Speaker Diarization](#-speaker-diarization)
 - [🔉 TTS](#-tts)
@@ -171,6 +170,9 @@ For vLLM server, if you specify `--reasoning_parser` in `vllm_server_params`, th
 
 We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech into text. While new models will be released soon, we use the existing English models for now:
 - [nvidia/parakeet_realtime_eou_120m-v1](https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1) (default)
+  - This model supports EOU prediction and is optimized for the lowest latency, but does not support punctuation and capitalization.
+- [nvidia/nemotron-speech-streaming-en-0.6b](https://huggingface.co/nvidia/nemotron-speech-streaming-en-0.6b)
+  - This model has better ASR accuracy and supports punctuation and capitalization, but does not predict EOU.
 - [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms)
 - [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi)
@@ -244,10 +246,11 @@ The tools are then registered to the LLM via the `register_direct_tools_to_llm`
 
 More details on tool calling with Pipecat can be found in the [Pipecat documentation](https://docs.pipecat.ai/guides/learn/function-calling).
 
-#### Notes on system prompt with tools
+#### Notes on tool calling issues
 
-We notice that sometimes the LLM cannot do anything that's not related to the provided tools, or it might not actually use the tools even though it says it's using them. To alleviate this issue, we insert additional instructions to the system prompt (e.g., in `server/server_configs/llm_configs/nemotron_nano_v2.yaml`):
-- "Before responding to the user, check if the user request requires using external tools, and use the tools if they match with the user's intention. Otherwise, use your internal knowledge to answer the user's question. Do not use tools for casual conversation or when the tools don't fit the use cases. You should still try to address the user's request when it's not related to the provided tools."
+We notice that sometimes the LLM cannot do anything that's not related to the provided tools, or it might not actually use the tools even though it says it's using them. To alleviate this issue, we insert additional instructions into the system prompt to regulate its behavior (e.g., in `server/server_configs/llm_configs/nemotron_nano_v2.yaml`).
+
+Sometimes, after answering a question related to the tools, the LLM might refuse to answer questions that are not related to the tools, or vice versa. This phenomenon is sometimes called "commitment bias" or "tunnel vision". To alleviate this issue, we can insert additional instructions into the system prompt that explicitly ask the LLM to use or not use the tools for the user's query.
 
 
 ## 📝 Notes & FAQ
```
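For readers skimming this commit, here is a minimal sketch of the shape of a direct tool. Only the registration call in the comment is taken from `bot_websocket_server.py` in this diff; the body and signature of `tool_get_city_weather` are illustrative assumptions, not the actual implementation in `basic_tools.py`.

```python
# Hypothetical sketch of a direct tool. The real tool_get_city_weather lives in
# nemo/agents/voice_agent/pipecat/utils/tool_calling/basic_tools.py; its exact
# signature and return format may differ from what is shown here.
async def tool_get_city_weather(city: str) -> dict:
    """Return a weather report for the given city (mocked here)."""
    # A real implementation would query a weather service instead of returning a constant.
    return {"city": city, "condition": "sunny", "temperature_c": 22}

# Registration, as it appears in this commit (llm, context, and tts are the
# service objects constructed earlier in bot_websocket_server.py):
# register_direct_tools_to_llm(llm=llm, context=context, tool_mixins=[tts], tools=[tool_get_city_weather])
```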

examples/voice_agent/client/src/app.ts

Lines changed: 51 additions & 1 deletion
```diff
@@ -41,6 +41,8 @@ class WebsocketClientApp {
   private analyser: AnalyserNode | null = null;
   private microphone: MediaStreamAudioSourceNode | null = null;
   private volumeUpdateInterval: number | null = null;
+  private currentBotMessageElement: HTMLDivElement | null = null;
+  private currentBotMessage: string = '';
 
   // Server configurations
   private readonly serverConfigs = {
@@ -110,6 +112,19 @@
     console.log(message);
   }
 
+  /**
+   * Create a bot message element and add it to the debug log
+   */
+  private createBotMessageElement(initialText: string): HTMLDivElement | null {
+    if (!this.debugLog) return null;
+    const entry = document.createElement('div');
+    entry.style.color = '#4CAF50';
+    entry.textContent = `${new Date().toISOString()} - ${initialText}`;
+    this.debugLog.appendChild(entry);
+    this.debugLog.scrollTop = this.debugLog.scrollHeight;
+    return entry;
+  }
+
   /**
    * Update the connection status display
    */
@@ -240,7 +255,34 @@
          this.log(`User: ${data.text}`);
        }
      },
-      onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
+      onBotTranscript: (data) => {
+        // If no current element exists, create one (fallback in case BOT_LLM_STARTED didn't fire)
+        if (!this.currentBotMessageElement) {
+          this.currentBotMessage = '';
+          this.currentBotMessageElement = this.createBotMessageElement('Bot: ');
+        }
+
+        // Accumulate the text
+        this.currentBotMessage += data.text;
+
+        // Update the current element
+        if (this.currentBotMessageElement) {
+          const timestamp = new Date().toISOString();
+          this.currentBotMessageElement.textContent = `${timestamp} - Bot: ${this.currentBotMessage}`;
+          this.debugLog?.scrollTo({ top: this.debugLog.scrollHeight, behavior: 'smooth' });
+        }
+      },
+      onBotLlmStarted: () => {
+        // Only create a new bot message element if the current one has content
+        if (this.currentBotMessage !== '') {
+          this.currentBotMessage = '';
+          this.currentBotMessageElement = this.createBotMessageElement('Bot: ');
+        } else if (!this.currentBotMessageElement) {
+          // Create element if it doesn't exist at all
+          this.currentBotMessage = '';
+          this.currentBotMessageElement = this.createBotMessageElement('Bot: ');
+        }
+      },
      onMessageError: (error) => console.error('Message error:', error),
      onError: (error) => console.error('Error:', error),
    },
@@ -313,6 +355,10 @@
    // Stop volume monitoring
    this.stopVolumeMonitoring();
 
+    // Clean up bot message state
+    this.currentBotMessage = '';
+    this.currentBotMessageElement = null;
+
    // Reset mute state
    this.isMuted = false;
 
@@ -357,6 +403,10 @@
    // Stop volume monitoring
    this.stopVolumeMonitoring();
 
+    // Clean up bot message state
+    this.currentBotMessage = '';
+    this.currentBotMessageElement = null;
+
    // Reset mute state
    this.isMuted = false;
 
```

examples/voice_agent/server/bot_websocket_server.py

Lines changed: 14 additions & 11 deletions
```diff
@@ -26,21 +26,21 @@
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIProcessor
+from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserverParams, RTVIProcessor
 from pipecat.serializers.protobuf import ProtobufFrameSerializer
 
 from nemo.agents.voice_agent.pipecat.processors.frameworks.rtvi import RTVIObserver
 from nemo.agents.voice_agent.pipecat.services.nemo.diar import NemoDiarService
 from nemo.agents.voice_agent.pipecat.services.nemo.llm import get_llm_service_from_config
-from nemo.agents.voice_agent.pipecat.services.nemo.stt import NemoSTTService
+from nemo.agents.voice_agent.pipecat.services.nemo.stt import ASR_EOU_MODELS, NemoSTTService
 from nemo.agents.voice_agent.pipecat.services.nemo.tts import KokoroTTSService, NeMoFastPitchHiFiGANTTSService
 from nemo.agents.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService
 from nemo.agents.voice_agent.pipecat.transports.network.websocket_server import (
     WebsocketServerParams,
     WebsocketServerTransport,
 )
 from nemo.agents.voice_agent.pipecat.utils.text.simple_text_aggregator import SimpleSegmentedTextAggregator
-from nemo.agents.voice_agent.pipecat.utils.tool_calling.basic_tools import get_city_weather
+from nemo.agents.voice_agent.pipecat.utils.tool_calling.basic_tools import tool_get_city_weather
 from nemo.agents.voice_agent.pipecat.utils.tool_calling.mixins import register_direct_tools_to_llm
 from nemo.agents.voice_agent.utils.config_manager import ConfigManager
 
@@ -83,7 +83,7 @@ def setup_logging():
 vad_params = config_manager.get_vad_params()
 
 # STT configuration
-STT_MODEL_PATH = config_manager.STT_MODEL_PATH
+STT_MODEL = config_manager.STT_MODEL
 STT_DEVICE = config_manager.STT_DEVICE
 stt_params = config_manager.get_stt_params()
 
@@ -137,6 +137,9 @@ async def run_bot_websocket_server(host: str = "0.0.0.0", port: int = 8765):
     )
     logger.info("VAD analyzer initialized")
 
+    has_turn_taking = True if STT_MODEL in ASR_EOU_MODELS else False
+    logger.info(f"Setting STT service has_turn_taking to `{has_turn_taking}` based on model name: `{STT_MODEL}`")
+
     ws_transport = WebsocketServerTransport(
         params=WebsocketServerParams(
             serializer=ProtobufFrameSerializer(),
@@ -146,8 +149,8 @@
             vad_analyzer=vad_analyzer,
             session_timeout=None,  # Disable session timeout
             audio_in_sample_rate=SAMPLE_RATE,
-            can_create_user_frames=TURN_TAKING_BACKCHANNEL_PHRASES_PATH
-            is None,  # if backchannel phrases are disabled, we can use VAD to interrupt the bot immediately
+            can_create_user_frames=TURN_TAKING_BACKCHANNEL_PHRASES_PATH is None
+            or not has_turn_taking,  # if backchannel phrases are disabled, we can use VAD to interrupt the bot immediately
             audio_out_10ms_chunks=TRANSPORT_AUDIO_OUT_10MS_CHUNKS,
         ),
         host=host,
@@ -157,12 +160,12 @@
     logger.info("Initializing STT service...")
 
     stt = NemoSTTService(
-        model=STT_MODEL_PATH,
+        model=STT_MODEL,
         device=STT_DEVICE,
         params=stt_params,
         sample_rate=SAMPLE_RATE,
         audio_passthrough=True,
-        has_turn_taking=True,
+        has_turn_taking=has_turn_taking,
         backend="legacy",
         decoder_type="rnnt",
     )
@@ -229,7 +232,7 @@
 
     if server_config.llm.get("enable_tool_calling", False):
         logger.info("Tools calling for LLM is enabled by config, registering tools...")
-        register_direct_tools_to_llm(llm=llm, context=context, tool_mixins=[tts], tools=[get_city_weather])
+        register_direct_tools_to_llm(llm=llm, context=context, tool_mixins=[tts], tools=[tool_get_city_weather])
     else:
         logger.info("Tools calling for LLM is disabled by config, skipping tool registration.")
 
@@ -288,7 +291,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
 
     pipeline = Pipeline(pipeline)
 
-    rtvi_text_aggregator = SimpleSegmentedTextAggregator(punctuation_marks=".!?\n")
+    rtvi_params = RTVIObserverParams(bot_llm_enabled=False)
     task = PipelineTask(
         pipeline,
         params=PipelineParams(
@@ -299,7 +302,7 @@
             report_only_initial_ttfb=True,
             idle_timeout=None,  # Disable idle timeout
         ),
-        observers=[RTVIObserver(rtvi, text_aggregator=rtvi_text_aggregator)],
+        observers=[RTVIObserver(rtvi, params=rtvi_params)],
         idle_timeout_secs=None,
         cancel_on_idle_timeout=False,
     )
```
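A condensed sketch of the gating this file now performs, assuming `ASR_EOU_MODELS` is a collection of EOU-capable model names exported by the `stt` module (the stand-in value below is illustrative, not the real list):

```python
from typing import Optional, Tuple

# Illustrative stand-in; the real collection is exported by
# nemo.agents.voice_agent.pipecat.services.nemo.stt as ASR_EOU_MODELS.
ASR_EOU_MODELS = {"nvidia/parakeet_realtime_eou_120m-v1"}


def resolve_turn_taking(stt_model: str, backchannel_phrases_path: Optional[str]) -> Tuple[bool, bool]:
    """Mirror the logic this commit adds to run_bot_websocket_server."""
    # EOU-capable models let the STT service drive turn taking itself.
    has_turn_taking = stt_model in ASR_EOU_MODELS
    # Without backchannel phrases, or without EOU-based turn taking, the
    # transport lets VAD create user frames and interrupt the bot immediately.
    can_create_user_frames = backchannel_phrases_path is None or not has_turn_taking
    return has_turn_taking, can_create_user_frames


# e.g. resolve_turn_taking("nvidia/nemotron-speech-streaming-en-0.6b", "phrases.txt")
# -> (False, True): a non-EOU model falls back to VAD-driven interruption.
```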

examples/voice_agent/server/server_configs/default.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -46,7 +46,7 @@ llm:
   enable_reasoning: false # it's best to turn off reasoning for the lowest latency; setting it to True will use the same config ending with `_think.yaml` instead
   # `system_prompt` is used as the system prompt to the LLM; please refer to each LLM's webpage for special functions like enabling/disabling thinking
   # system_prompt: /path/to/prompt.txt # or use the path to a txt file that contains a long prompt, for example `../example_prompts/fast_bite.txt`
-  system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. Do not include any emoji in response."
+  system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. Avoid using emoji in your response."
 
 tts:
   type: kokoro # choices in ['nemo', 'kokoro']
```

examples/voice_agent/server/server_configs/llm_configs/nemotron_nano_v2.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@ type: vllm # Overwrite to vllm to enable tool calling, the HF backend currently
 dtype: bfloat16 # torch.dtype for LLM
 device: "cuda"
 system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt
-system_prompt_suffix: "Before responding to the user, check if the user request requires using external tools, and use the tools if they match with the user's intention. Otherwise, use your internal knowledge to answer the user's question. Do not use tools for casual conversation or when the tools don't fit the use cases. You should still try to address the user's request when it's not related to the provided tools. /no_think" # a string that would be appended to the system prompt, `/think` and `/no_think` are used to enable/disable thinking
+system_prompt_suffix: "Before responding to the user, check if the user request requires using external tools, and use the tools if they match with the user's intention. Otherwise, use your internal knowledge to answer the user's question. Do not use tools for casual conversation or when the tools don't fit the use cases. You should still try to address the user's request when it's not related to the provided tools. If you are provided with a set of tools, use them only when needed, do not limit your capabilities to the scope of the tools. If the purpose of a tool matches well with a user's request, always try to call the tool first. Conversation history should not limit your behavior on whether you can use tools. You must answer questions not related to the tools. /no_think" # a string that would be appended to the system prompt, `/think` and `/no_think` are used to enable/disable thinking
 enable_tool_calling: True # set to True since the vllm config below supports tool calling
 
 ##############################
```
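A small sketch of how these pieces plausibly combine, based only on the config comment "a string that would be appended to the system prompt" (the actual ConfigManager/server code may assemble the prompt differently):

```python
# Assumed assembly of the final system prompt; the trailing `/no_think` in the
# suffix is what disables thinking for models that honor that toggle.
def build_system_prompt(system_prompt: str, system_prompt_suffix: str) -> str:
    """Append the configured suffix (e.g. tool-usage rules + /no_think) to the prompt."""
    return f"{system_prompt.rstrip()} {system_prompt_suffix}".strip()


print(build_system_prompt(
    "You are a helpful AI agent named Lisa.",
    "Use tools only when needed. /no_think",
))
```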

examples/voice_agent/server/server_configs/tts_configs/kokoro_82M.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -12,5 +12,4 @@ extra_separator: # a list of additional punctuations to chunk LLM response into
   - "?"
   - "!"
   - ";"
-  - ":"
 think_tokens: ["<think>", "</think>"] # specify them to avoid TTS for thinking process, set to `null` to allow thinking out loud
```

examples/voice_agent/server/server_configs/tts_configs/nemo_fastpitch-hifigan.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -11,5 +11,4 @@ extra_separator: # a list of additional punctuations to chunk LLM response into
   - "?"
   - "!"
   - ";"
-  - ":"
 think_tokens: ["<think>", "</think>"] # specify them to avoid TTS for thinking process, set to `null` to allow thinking out loud
```
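Both TTS configs drop ":" from `extra_separator`, which controls where streamed LLM text is cut into TTS segments. A toy chunker below shows the effect of that separator list; it is illustrative only (the server uses `SimpleSegmentedTextAggregator`, whose implementation may differ), and the colon example is an assumed rationale for the change:

```python
# Illustrative only -- demonstrates how a separator list chunks streamed LLM
# text into TTS-sized segments, and why ":" is an awkward separator.
def chunk_for_tts(text: str, separators: str = ".?!;") -> list:
    """Split text into segments at separator characters."""
    chunks, current = [], []
    for ch in text:
        current.append(ch)
        if ch in separators:
            chunks.append("".join(current).strip())
            current = []
    if current:  # trailing partial segment (would stay buffered while streaming)
        chunks.append("".join(current).strip())
    return chunks


# With ":" removed from the separator list, "The time is 3:30." is spoken as
# one segment instead of being split after "3:".
print(chunk_for_tts("The time is 3:30. See you then!"))
# -> ['The time is 3:30.', 'See you then!']
```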

examples/voice_agent/tests/test_config_manager.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -84,14 +84,17 @@ def test_configure_stt_nemo_model(self, voice_agent_server_base_path):
         # Create necessary files
         config_manager = ConfigManager(voice_agent_server_base_path)
 
-        assert "stt_en_fastconformer" in config_manager.STT_MODEL_PATH
+        # STT_MODEL can be either a fastconformer model or an EOU model (e.g., parakeet_realtime_eou)
+        assert (
+            "stt_en_fastconformer" in config_manager.STT_MODEL or "parakeet_realtime_eou" in config_manager.STT_MODEL
+        )
         assert isinstance(config_manager.stt_params, NeMoSTTInputParams)
 
     @pytest.mark.unit
     def test_configure_stt_with_model_config(self, voice_agent_server_base_path):
         """Test STT configuration with custom model config."""
         config_manager = ConfigManager(voice_agent_server_base_path)
-        assert hasattr(config_manager, "STT_MODEL_PATH")
+        assert hasattr(config_manager, "STT_MODEL")
 
     @pytest.mark.unit
     def test_configure_diarization(self, voice_agent_server_base_path):
@@ -203,8 +206,8 @@ def test_get_vad_params(self, voice_agent_server_base_path):
 
         assert isinstance(vad_params, VADParams)
         assert isinstance(vad_params.confidence, float) and 0.0 <= vad_params.confidence <= 1.0
-        assert isinstance(vad_params.start_secs, float) and 0.0 <= vad_params.start_secs <= 1.0
-        assert isinstance(vad_params.stop_secs, float) and 0.0 <= vad_params.stop_secs <= 1.0
+        assert isinstance(vad_params.start_secs, float) and vad_params.start_secs >= 0.0
+        assert isinstance(vad_params.stop_secs, float) and vad_params.stop_secs >= 0.0
         assert isinstance(vad_params.min_volume, float) and 0.0 <= vad_params.min_volume <= 1.0
 
     @pytest.mark.unit
```
