11import abc
22import logging
3- import time
43import uuid
5- from typing import Optional , Dict , Any , Tuple , List , Union
4+ from typing import Optional , Dict , Any , Union
65from getstream .video .rtc .track_util import PcmData
76
87from ..edge .types import Participant
9- from vision_agents .core .events import (
10- PluginInitializedEvent ,
11- PluginClosedEvent ,
12- )
138from vision_agents .core .events .manager import EventManager
149from . import events
1510
@@ -20,84 +15,25 @@ class STT(abc.ABC):
2015 """
2116 Abstract base class for Speech-to-Text implementations.
2217
23- This class provides a standardized interface for STT implementations with consistent
24- event emission patterns and error handling.
18+ Subclasses must implement process_audio and are expected to call:
19+ - _emit_partial_transcript_event
20+ - _emit_transcript_event
21+ - _emit_error_event for temporary errors
2522
26- Events:
27- - transcript: Emitted when a complete transcript is available.
28- Args: text (str), user_metadata (dict), metadata (dict)
29- - partial_transcript: Emitted when a partial transcript is available.
30- Args: text (str), user_metadata (dict), metadata (dict)
31- - error: Emitted when an error occurs during transcription.
32- Args: error (Exception)
33-
34- Standard Error Handling:
35- - All implementations should catch exceptions in _process_audio_impl and emit error events
36- - Use _emit_error_event() helper for consistent error emission
37- - Log errors with appropriate context using the logger
38-
39- Standard Event Emission:
40- - Use _emit_transcript_event() and _emit_partial_transcript_event() helpers
41- - Include processing time and audio duration in metadata when available
42- - Maintain consistent metadata structure across implementations
23+ process_audio is currently called every 20ms. The integration with turn keeping could still be improved.
4324 """
25+ closed : bool = False
4426
4527 def __init__ (
4628 self ,
47- sample_rate : int = 16000 ,
48- * ,
4929 provider_name : Optional [str ] = None ,
5030 ):
51- """
52- Initialize the STT service.
53-
54- Args:
55- sample_rate: The sample rate of the audio to process, in Hz.
56- provider_name: Name of the STT provider (e.g., "deepgram", "moonshine")
57- """
58-
59- self ._track = None
60- self .sample_rate = sample_rate
61- self ._is_closed = False
6231 self .session_id = str (uuid .uuid4 ())
6332 self .provider_name = provider_name or self .__class__ .__name__
33+
6434 self .events = EventManager ()
6535 self .events .register_events_from_module (events , ignore_not_compatible = True )
6636
67- self .events .send (PluginInitializedEvent (
68- session_id = self .session_id ,
69- plugin_name = self .provider_name ,
70- plugin_type = "STT" ,
71- provider = self .provider_name ,
72- configuration = {"sample_rate" : sample_rate },
73- ))
74-
75- def _validate_pcm_data (self , pcm_data : PcmData ) -> bool :
76- """
77- Validate PCM data input for processing.
78-
79- Args:
80- pcm_data: The PCM audio data to validate.
81-
82- Returns:
83- True if the data is valid, False otherwise.
84- """
85-
86- if not hasattr (pcm_data , "samples" ) or pcm_data .samples is None :
87- logger .warning ("PCM data has no samples" )
88- return False
89-
90- if not hasattr (pcm_data , "sample_rate" ) or pcm_data .sample_rate <= 0 :
91- logger .warning ("PCM data has invalid sample rate" )
92- return False
93-
94- # Check if samples are empty
95- if hasattr (pcm_data .samples , "__len__" ) and len (pcm_data .samples ) == 0 :
96- logger .debug ("Received empty audio samples" )
97- return False
98-
99- return True
100-
10137 def _emit_transcript_event (
10238 self ,
10339 text : str ,
@@ -159,12 +95,8 @@ def _emit_error_event(
15995 user_metadata : Optional [Union [Dict [str , Any ], Participant ]] = None ,
16096 ):
16197 """
162- Emit an error event with structured data.
163-
164- Args:
165- error: The exception that occurred.
166- context: Additional context about where the error occurred.
167- user_metadata: User-specific metadata.
98+ Emit an error event. Only emit this for temporary errors.
99+ Permanent errors (e.g. due to configuration) should be raised directly.
168100 """
169101 self .events .send (events .STTErrorEvent (
170102 session_id = self .session_id ,
@@ -176,114 +108,11 @@ def _emit_error_event(
176108 is_recoverable = not isinstance (error , (SystemExit , KeyboardInterrupt )),
177109 ))
178110
111+ @abc .abstractmethod
179112 async def process_audio (
180- self , pcm_data : PcmData , participant : Optional [Participant ] = None
113+ self , pcm_data : PcmData , participant : Optional [Participant ] = None ,
181114 ):
182- """
183- Process audio data for transcription and emit appropriate events.
184-
185- Args:
186- pcm_data: The PCM audio data to process.
187- user_metadata: Additional metadata about the user or session.
188- """
189- if self ._is_closed :
190- logger .debug ("Ignoring audio processing request - STT is closed" )
191- return
192-
193- # Validate input data
194- if not self ._validate_pcm_data (pcm_data ):
195- logger .warning ("Invalid PCM data received, skipping processing" )
196- return
197-
198- try :
199- # Process the audio data using the implementation-specific method
200- audio_duration_ms = (
201- pcm_data .duration * 1000 if hasattr (pcm_data , "duration" ) else None
202- )
203- logger .debug (
204- "Processing audio chunk" ,
205- extra = {
206- "duration_ms" : audio_duration_ms ,
207- "has_user_metadata" : participant is not None ,
208- },
209- )
210-
211- start_time = time .time ()
212- results = await self ._process_audio_impl (pcm_data , participant )
213- processing_time = time .time () - start_time
214-
215- # If no results were returned, just return
216- if not results :
217- logger .debug (
218- "No speech detected in audio" ,
219- extra = {
220- "processing_time_ms" : processing_time * 1000 ,
221- "audio_duration_ms" : audio_duration_ms ,
222- },
223- )
224- return
225-
226- # Process each result and emit the appropriate event
227- for is_final , text , metadata in results :
228- # Ensure metadata includes processing time if not already present
229- if "processing_time_ms" not in metadata :
230- metadata ["processing_time_ms" ] = processing_time * 1000
231-
232- if is_final :
233- self ._emit_transcript_event (text , participant , metadata )
234- else :
235- self ._emit_partial_transcript_event (text , participant , metadata )
236-
237- except Exception as e :
238- # Emit any errors that occur during processing
239- self ._emit_error_event (e , "audio processing" , participant )
240-
241- @abc .abstractmethod
242- async def _process_audio_impl (
243- self , pcm_data : PcmData , user_metadata : Optional [Union [Dict [str , Any ], Participant ]] = None
244- ) -> Optional [List [Tuple [bool , str , Dict [str , Any ]]]]:
245- """
246- Implementation-specific method to process audio data.
247-
248- This method must be implemented by all STT providers and should handle the core
249- transcription logic. The base class handles event emission and error handling.
250-
251- Args:
252- pcm_data: The PCM audio data to process. Guaranteed to be valid by base class.
253- user_metadata: Additional metadata about the user or session.
254-
255- Returns:
256- optional list[tuple[bool, str, dict]] | None
257- • synchronous providers: a list of results.
258- • asynchronous providers: None (they emit events themselves).
259-
260- Notes:
261- Implementations must not both emit events and return non-empty results,
262- or duplicate events will be produced.
263- Exceptions should bubble up; process_audio() will catch them
264- and emit a single "error" event.
265- """
266115 pass
267116
268- @abc .abstractmethod
269117 async def close (self ):
270- """
271- Close the STT service and release any resources.
272-
273- Implementations should:
274- - Set self._is_closed = True
275- - Clean up any background tasks or connections
276- - Release any allocated resources
277- - Log the closure appropriately
278- """
279- if not self ._is_closed :
280- self ._is_closed = True
281-
282- # Emit closure event
283- self .events .send (PluginClosedEvent (
284- session_id = self .session_id ,
285- plugin_name = self .provider_name ,
286- plugin_type = "STT" ,
287- provider = self .provider_name ,
288- cleanup_successful = True ,
289- ))
118+ self .closed = True
0 commit comments