[voice agent] Fix missing arguments in the NemoStreamingASRService call in NemoSTTService (#15233)

SangwonSUH · stevehuang52 · web-flow · commit 412ab81dd5b1 · 2025-12-30T20:30:51.000-05:00
* Fix the missing arguments when `NemoSTTService` constructs `NemoStreamingASRService`

Signed-off-by: SangwonSUH <suhsw1210@gmail.com>

* Remove redundant attribute assignments in `NemoSTTService` and use `self._params` directly instead.

Signed-off-by: SangwonSUH <suhsw1210@gmail.com>

* Add docstrings to `NemoSTTService` and related classes

Signed-off-by: SangwonSUH <suhsw1210@gmail.com>

* Fix incorrect model type check in process_frame method

Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>

---------

Signed-off-by: SangwonSUH <suhsw1210@gmail.com>
Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>
Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>
diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/stt.py b/nemo/agents/voice_agent/pipecat/services/nemo/stt.py
@@ -34,7 +34,6 @@
 from pydantic import BaseModel
 
 from nemo.agents.voice_agent.pipecat.services.nemo.streaming_asr import NemoStreamingASRService
-from nemo.agents.voice_agent.pipecat.services.nemo.streaming_diar import NeMoStreamingDiarService
 
 try:
     # disable nemo logging
@@ -51,6 +50,8 @@
 
 
 class NeMoSTTInputParams(BaseModel):
+    """Input parameters for NeMo STT service."""
+
     language: Optional[Language] = Language.EN_US
     att_context_size: Optional[List] = [70, 1]
     frame_len_in_secs: Optional[float] = 0.08  # 80ms for FastConformer model
@@ -60,6 +61,8 @@ class NeMoSTTInputParams(BaseModel):
 
 
 class NemoSTTService(STTService):
+    """NeMo Speech-to-Text service for Pipecat integration."""
+
     def __init__(
         self,
         *,
@@ -94,7 +97,11 @@ def __init__(
     def _load_model(self):
         if self._backend == "legacy":
             self._model = NemoStreamingASRService(
-                self._model_name, device=self._device, decoder_type=self._decoder_type
+                self._model_name,
+                self._params.att_context_size,
+                device=self._device,
+                decoder_type=self._decoder_type,
+                frame_len_in_secs=self._params.frame_len_in_secs,
             )
         else:
             raise ValueError(f"Invalid ASR backend: {self._backend}")
@@ -258,7 +265,8 @@ async def set_model(self, model: str):
         self._load_model()
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        if isinstance(frame, VADUserStoppedSpeakingFrame) and isinstance(self._model, NeMoStreamingDiarService):
+        """Process incoming frames and handle VAD events."""
+        if isinstance(frame, VADUserStoppedSpeakingFrame) and isinstance(self._model, NemoStreamingASRService):
             # manualy reset the state of the model when end of utterance is detected by VAD
             logger.debug("Resetting state of the model due to VADUserStoppedSpeakingFrame")
             self._model.reset_state()