
Commit 9e50cdd: large cleanup
1 parent 5f001e0
File tree: 4 files changed (+33, -250 lines)
Lines changed: 13 additions & 184 deletions
@@ -1,15 +1,10 @@
 import abc
 import logging
-import time
 import uuid
-from typing import Optional, Dict, Any, Tuple, List, Union
+from typing import Optional, Dict, Any, Union
 from getstream.video.rtc.track_util import PcmData
 
 from ..edge.types import Participant
-from vision_agents.core.events import (
-    PluginInitializedEvent,
-    PluginClosedEvent,
-)
 from vision_agents.core.events.manager import EventManager
 from . import events
 
@@ -20,84 +15,25 @@ class STT(abc.ABC):
     """
     Abstract base class for Speech-to-Text implementations.
 
-    This class provides a standardized interface for STT implementations with consistent
-    event emission patterns and error handling.
+    Subclasses implement this and have to call
+    - _emit_partial_transcript_event
+    - _emit_transcript_event
+    - _emit_error_event for temporary errors
 
-    Events:
-        - transcript: Emitted when a complete transcript is available.
-          Args: text (str), user_metadata (dict), metadata (dict)
-        - partial_transcript: Emitted when a partial transcript is available.
-          Args: text (str), user_metadata (dict), metadata (dict)
-        - error: Emitted when an error occurs during transcription.
-          Args: error (Exception)
-
-    Standard Error Handling:
-        - All implementations should catch exceptions in _process_audio_impl and emit error events
-        - Use _emit_error_event() helper for consistent error emission
-        - Log errors with appropriate context using the logger
-
-    Standard Event Emission:
-        - Use _emit_transcript_event() and _emit_partial_transcript_event() helpers
-        - Include processing time and audio duration in metadata when available
-        - Maintain consistent metadata structure across implementations
+    process_audio is currently called every 20ms. The integration with turn keeping could be improved
     """
+    closed: bool = False
 
     def __init__(
         self,
-        sample_rate: int = 16000,
-        *,
         provider_name: Optional[str] = None,
     ):
-        """
-        Initialize the STT service.
-
-        Args:
-            sample_rate: The sample rate of the audio to process, in Hz.
-            provider_name: Name of the STT provider (e.g., "deepgram", "moonshine")
-        """
-
-        self._track = None
-        self.sample_rate = sample_rate
-        self._is_closed = False
         self.session_id = str(uuid.uuid4())
         self.provider_name = provider_name or self.__class__.__name__
+
         self.events = EventManager()
         self.events.register_events_from_module(events, ignore_not_compatible=True)
 
-        self.events.send(PluginInitializedEvent(
-            session_id=self.session_id,
-            plugin_name=self.provider_name,
-            plugin_type="STT",
-            provider=self.provider_name,
-            configuration={"sample_rate": sample_rate},
-        ))
-
-    def _validate_pcm_data(self, pcm_data: PcmData) -> bool:
-        """
-        Validate PCM data input for processing.
-
-        Args:
-            pcm_data: The PCM audio data to validate.
-
-        Returns:
-            True if the data is valid, False otherwise.
-        """
-
-        if not hasattr(pcm_data, "samples") or pcm_data.samples is None:
-            logger.warning("PCM data has no samples")
-            return False
-
-        if not hasattr(pcm_data, "sample_rate") or pcm_data.sample_rate <= 0:
-            logger.warning("PCM data has invalid sample rate")
-            return False
-
-        # Check if samples are empty
-        if hasattr(pcm_data.samples, "__len__") and len(pcm_data.samples) == 0:
-            logger.debug("Received empty audio samples")
-            return False
-
-        return True
-
     def _emit_transcript_event(
         self,
         text: str,
@@ -159,12 +95,8 @@ def _emit_error_event(
         user_metadata: Optional[Union[Dict[str, Any], Participant]] = None,
     ):
         """
-        Emit an error event with structured data.
-
-        Args:
-            error: The exception that occurred.
-            context: Additional context about where the error occurred.
-            user_metadata: User-specific metadata.
+        Emit an error event. Note this should only be emitted for temporary errors.
+        Permanent errors due to config etc should be directly raised
         """
         self.events.send(events.STTErrorEvent(
             session_id=self.session_id,
@@ -176,114 +108,11 @@ def _emit_error_event(
             is_recoverable=not isinstance(error, (SystemExit, KeyboardInterrupt)),
         ))
 
+    @abc.abstractmethod
     async def process_audio(
-        self, pcm_data: PcmData, participant: Optional[Participant] = None
+        self, pcm_data: PcmData, participant: Optional[Participant] = None,
     ):
-        """
-        Process audio data for transcription and emit appropriate events.
-
-        Args:
-            pcm_data: The PCM audio data to process.
-            user_metadata: Additional metadata about the user or session.
-        """
-        if self._is_closed:
-            logger.debug("Ignoring audio processing request - STT is closed")
-            return
-
-        # Validate input data
-        if not self._validate_pcm_data(pcm_data):
-            logger.warning("Invalid PCM data received, skipping processing")
-            return
-
-        try:
-            # Process the audio data using the implementation-specific method
-            audio_duration_ms = (
-                pcm_data.duration * 1000 if hasattr(pcm_data, "duration") else None
-            )
-            logger.debug(
-                "Processing audio chunk",
-                extra={
-                    "duration_ms": audio_duration_ms,
-                    "has_user_metadata": participant is not None,
-                },
-            )
-
-            start_time = time.time()
-            results = await self._process_audio_impl(pcm_data, participant)
-            processing_time = time.time() - start_time
-
-            # If no results were returned, just return
-            if not results:
-                logger.debug(
-                    "No speech detected in audio",
-                    extra={
-                        "processing_time_ms": processing_time * 1000,
-                        "audio_duration_ms": audio_duration_ms,
-                    },
-                )
-                return
-
-            # Process each result and emit the appropriate event
-            for is_final, text, metadata in results:
-                # Ensure metadata includes processing time if not already present
-                if "processing_time_ms" not in metadata:
-                    metadata["processing_time_ms"] = processing_time * 1000
-
-                if is_final:
-                    self._emit_transcript_event(text, participant, metadata)
-                else:
-                    self._emit_partial_transcript_event(text, participant, metadata)
-
-        except Exception as e:
-            # Emit any errors that occur during processing
-            self._emit_error_event(e, "audio processing", participant)
-
-    @abc.abstractmethod
-    async def _process_audio_impl(
-        self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], Participant]] = None
-    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
-        """
-        Implementation-specific method to process audio data.
-
-        This method must be implemented by all STT providers and should handle the core
-        transcription logic. The base class handles event emission and error handling.
-
-        Args:
-            pcm_data: The PCM audio data to process. Guaranteed to be valid by base class.
-            user_metadata: Additional metadata about the user or session.
-
-        Returns:
-            optional list[tuple[bool, str, dict]] | None
-            • synchronous providers: a list of results.
-            • asynchronous providers: None (they emit events themselves).
-
-        Notes:
-            Implementations must not both emit events and return non-empty results,
-            or duplicate events will be produced.
-            Exceptions should bubble up; process_audio() will catch them
-            and emit a single "error" event.
-        """
         pass
 
-    @abc.abstractmethod
     async def close(self):
-        """
-        Close the STT service and release any resources.
-
-        Implementations should:
-        - Set self._is_closed = True
-        - Clean up any background tasks or connections
-        - Release any allocated resources
-        - Log the closure appropriately
-        """
-        if not self._is_closed:
-            self._is_closed = True
-
-            # Emit closure event
-            self.events.send(PluginClosedEvent(
-                session_id=self.session_id,
-                plugin_name=self.provider_name,
-                plugin_type="STT",
-                provider=self.provider_name,
-                cleanup_successful=True,
-            ))
+        self.closed = True
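The net effect in this file: the base class drops validation, timing, and plugin lifecycle events, leaving a thin contract where subclasses own process_audio and emit results through the remaining helpers. Below is a minimal sketch of a conforming subclass, assuming the helper signatures visible in the removed code (_emit_transcript_event(text, participant, metadata) and _emit_error_event(error, context, participant)); the import path for STT and the EchoSTT provider itself are hypothetical, purely illustrative:

from typing import Optional

from getstream.video.rtc.track_util import PcmData
from vision_agents.core.edge.types import Participant
from vision_agents.core.stt.stt import STT  # import path assumed


class EchoSTT(STT):
    """Hypothetical provider, only to illustrate the new contract."""

    def __init__(self):
        super().__init__(provider_name="echo")

    async def process_audio(
        self, pcm_data: PcmData, participant: Optional[Participant] = None,
    ):
        # The base class no longer validates or times anything; providers do it themselves.
        if self.closed:
            return
        try:
            # A real provider would forward pcm_data.samples to its backend here.
            text = f"heard {len(pcm_data.samples)} samples"
            # Partial first, then final: both helpers come from the base class.
            self._emit_partial_transcript_event(text, participant, {})
            self._emit_transcript_event(text, participant, {})
        except ConnectionError as e:
            # Temporary failure: emit an error event. Permanent (config) errors
            # should be raised directly, per the new docstring.
            self._emit_error_event(e, "audio processing", participant)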
File renamed without changes.

plugins/deepgram/vision_agents/plugins/deepgram/stt.py

Lines changed: 13 additions & 35 deletions
@@ -3,7 +3,7 @@
 import logging
 import os
 import time
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import numpy as np
 import websockets
@@ -23,8 +23,7 @@
 
 from .utils import generate_silence
 
-if TYPE_CHECKING:
-    from vision_agents.core.edge.types import Participant
+from vision_agents.core.edge.types import Participant
 
 logger = logging.getLogger(__name__)
 
@@ -50,7 +49,6 @@ def __init__(
         self,
         api_key: Optional[str] = None,
         options: Optional[dict] = None,
-        sample_rate: int = 48000,
         language: str = "en-US",
         interim_results: bool = True,
         client: Optional[AsyncDeepgramClient] = None,
@@ -70,7 +68,7 @@ def __init__(
             connection_timeout: Time to wait for the Deepgram connection to be established.
 
         """
-        super().__init__(sample_rate=sample_rate)
+        super().__init__(provider_name="deepgram")
 
         # If no API key was provided, check for DEEPGRAM_API_KEY in environment
         if api_key is None:
@@ -86,12 +84,13 @@ def __init__(
             client if client is not None else AsyncDeepgramClient(api_key=api_key)
         )
         self.dg_connection: Optional[AsyncV1SocketClient] = None
+        self.sample_rate = 48000
 
         self.options = options or {
             "model": "nova-2",
             "language": language,
             "encoding": "linear16",
-            "sample_rate": sample_rate,
+            "sample_rate": self.sample_rate,
             "channels": 1,
             "interim_results": interim_results,
         }
@@ -101,7 +100,7 @@ def __init__(
 
         # Generate a silence audio to use as keep-alive message
         self._keep_alive_data = generate_silence(
-            sample_rate=sample_rate, duration_ms=10
+            sample_rate=self.sample_rate, duration_ms=10
         )
         self._keep_alive_interval = keep_alive_interval
 
@@ -121,7 +120,7 @@ async def start(self):
         """
         Start the main task establishing the Deepgram connection and processing the events.
         """
-        if self._is_closed:
+        if self.closed:
             logger.warning("Cannot setup connection - Deepgram instance is closed")
             return None
 
@@ -178,15 +177,8 @@ async def started(self):
         )
 
     async def close(self):
+        await super().close()
         """Close the Deepgram connection and clean up resources."""
-        if self._is_closed:
-            logger.debug("Deepgram STT service already closed")
-            return
-
-        logger.info("Closing Deepgram STT service")
-        self._is_closed = True
-
-        # Close the Deepgram connection if it exists
         if self.dg_connection:
             logger.debug("Closing Deepgram connection")
             try:
@@ -261,29 +253,15 @@ async def _on_connection_close(self, message: Any):
         logger.warning(f"Deepgram connection closed. message={message}")
         await self.close()
 
-    async def _process_audio_impl(
+    async def process_audio(
         self,
         pcm_data: PcmData,
-        user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None,
-    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
-        """
-        Process audio data through Deepgram for transcription.
-
-        Args:
-            pcm_data: The PCM audio data to process.
-            user_metadata: Additional metadata about the user or session.
-
-        Returns:
-            None - Deepgram operates in asynchronous mode and emits events directly
-            when transcripts arrive from the streaming service.
-        """
-        if self._is_closed:
+        participant: Optional[Participant] = None,
+    ):
+        if self.closed:
             logger.warning("Deepgram connection is closed, ignoring audio")
             return None
 
-        # Store the current user context for transcript events
-        self._current_user = user_metadata  # type: ignore[assignment]
-
         # Check if the input sample rate matches the expected sample rate
         if pcm_data.sample_rate != self.sample_rate:
             logger.warning(
@@ -334,7 +312,7 @@ async def _keepalive_loop(self):
         Send the silence audio every `interval` seconds
         to prevent Deepgram from closing the connection.
         """
-        while not self._is_closed and self.dg_connection is not None:
+        while not self.closed and self.dg_connection is not None:
             if self._last_sent_at + self._keep_alive_interval <= time.time():
                 logger.debug("Sending keepalive packet to Deepgram...")
                 # Send audio silence to keep the connection open
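With sample_rate removed from the constructor and pinned to 48 kHz internally, callers now have to deliver matching audio; the process_audio override above only warns on a mismatch. A hedged usage sketch follows. The plugin class name STT, the import style, and the PcmData constructor arguments are assumptions inferred from the file path and the diff, not confirmed by this commit:

import asyncio

import numpy as np
from getstream.video.rtc.track_util import PcmData
from vision_agents.plugins import deepgram  # import style assumed


async def main():
    # No sample_rate argument anymore; the API key falls back to DEEPGRAM_API_KEY.
    stt = deepgram.STT(interim_results=True)  # class name assumed
    await stt.start()

    # One 20 ms chunk at the fixed 48 kHz rate the plugin now expects:
    # 48000 samples/s * 0.020 s = 960 samples.
    samples = np.zeros(960, dtype=np.int16)
    pcm = PcmData(samples=samples, sample_rate=48000)  # constructor assumed
    await stt.process_audio(pcm)

    await stt.close()


asyncio.run(main())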
