More cleanup: remove pdb breakpoints, pass TranscriptResponse as STT metadata

tschellenbach · tschellenbach · commit f18512036a5c · 2025-10-23T15:08:42.000-06:00
diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py
@@ -309,7 +309,6 @@ async def on_realtime_agent_speech_transcription(
 
         @self.events.subscribe
         async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
-            import pdb; pdb.set_trace()
             if self.realtime_mode or not self.llm:
                 # when running in realtime mode, there is no need to send the response to the LLM
                 return
diff --git a/agents-core/vision_agents/core/stt/events.py b/agents-core/vision_agents/core/stt/events.py
@@ -9,6 +9,7 @@ class TranscriptResponse:
     language: Optional[str] = None
     processing_time_ms: Optional[float] = None
     audio_duration_ms: Optional[float] = None
+    model_name: Optional[str] = None
     other: Optional[dict] = None
 
 @dataclass
@@ -44,10 +45,6 @@ def audio_duration_ms(self) -> Optional[float]:
     @property
     def model_name(self) -> Optional[str]:
         return self.response.model_name
-    
-    @property
-    def words(self) -> Optional[list[dict[str, Any]]]:
-        return self.response.words
 
 
 @dataclass
@@ -79,10 +76,6 @@ def audio_duration_ms(self) -> Optional[float]:
     @property
     def model_name(self) -> Optional[str]:
         return self.response.model_name
-    
-    @property
-    def words(self) -> Optional[list[dict[str, Any]]]:
-        return self.response.words
 
 
 @dataclass
diff --git a/plugins/deepgram/vision_agents/plugins/deepgram/stt.py b/plugins/deepgram/vision_agents/plugins/deepgram/stt.py
@@ -20,6 +20,7 @@
 from getstream.video.rtc.track_util import PcmData
 
 from vision_agents.core import stt
+from vision_agents.core.stt import TranscriptResponse
 
 from .utils import generate_silence
 
@@ -217,28 +218,25 @@ async def _on_message(
         # Check if this is a final result
         is_final = transcript.get("is_final", False)
 
-        # Create metadata with useful information
-        metadata = {
-            "confidence": alternatives[0].get("confidence", 0),
-            "words": alternatives[0].get("words", []),
-            "is_final": is_final,
-            "channel_index": transcript.get("channel_index", 0),
-        }
+        # Create response metadata
+        response_metadata = TranscriptResponse(
+            confidence=alternatives[0].get("confidence", 0),
+        )
 
         # Emit immediately for real-time responsiveness
         if is_final:
-            self._emit_transcript_event(transcript_text, self._current_user, metadata)
+            self._emit_transcript_event(transcript_text, self._current_user, response_metadata)
         else:
             self._emit_partial_transcript_event(
-                transcript_text, self._current_user, metadata
+                transcript_text, self._current_user, response_metadata
             )
 
         logger.debug(
             "Received transcript",
             extra={
                 "is_final": is_final,
                 "text_length": len(transcript_text),
-                "confidence": metadata["confidence"],
+                "confidence": response_metadata.confidence,
             },
         )
 
diff --git a/plugins/fish/vision_agents/plugins/fish/stt.py b/plugins/fish/vision_agents/plugins/fish/stt.py
@@ -9,9 +9,9 @@
 from getstream.video.rtc.track_util import PcmData
 
 from vision_agents.core import stt
+from vision_agents.core.stt import TranscriptResponse
 
-if TYPE_CHECKING:
-    from vision_agents.core.edge.types import Participant
+from vision_agents.core.edge.types import Participant
 
 logger = logging.getLogger(__name__)
 
@@ -126,23 +126,12 @@ async def process_audio(
                 logger.error("No transcript returned from Fish Audio %s", pcm_data.duration)
                 return None
 
-            # Build metadata from response
-            metadata: Dict[str, Any] = {
-                "audio_duration_ms": response.duration,
-                "language": self.language or "auto",
-                "model_name": "fish-audio-asr",
-            }
-
-            # Include segments if timestamps were requested
-            if not self.ignore_timestamps and response.segments:
-                metadata["segments"] = [
-                    {
-                        "text": segment.text,
-                        "start": segment.start,
-                        "end": segment.end,
-                    }
-                    for segment in response.segments
-                ]
+            # Build response metadata
+            response_metadata = TranscriptResponse(
+                audio_duration_ms=response.duration,
+                language=self.language or "auto",
+                model_name="fish-audio-asr",
+            )
 
             logger.debug(
                 "Received transcript from Fish Audio",
@@ -152,7 +141,7 @@ async def process_audio(
                 },
             )
 
-            self._emit_transcript_event(transcript_text, participant, metadata)
+            self._emit_transcript_event(transcript_text, participant, response_metadata)
 
         except Exception as e:
             logger.error(
@@ -162,12 +151,3 @@ async def process_audio(
             # Let the base class handle error emission
             raise
 
-    async def close(self):
-        """Close the Fish Audio STT service and clean up resources."""
-        if self._is_closed:
-            logger.debug("Fish Audio STT service already closed")
-            return
-
-        logger.info("Closing Fish Audio STT service")
-        await super().close()
-
diff --git a/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py b/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py
@@ -96,15 +96,10 @@ async def recv(self) -> av.frame.Frame:
         pts, time_base = await self.next_timestamp()
 
         # Create av.VideoFrame from PIL Image
-        try:
-            av_frame = self.last_frame
-
-            av_frame.pts = pts
-            av_frame.time_base = time_base
-        except Exception:
-            import pdb
+        av_frame = self.last_frame
 
-            pdb.set_trace()
+        av_frame.pts = pts
+        av_frame.time_base = time_base
 
         # if frame_received:
         #    logger.info(f"Returning NEW video frame: {av_frame.width}x{av_frame.height}")
diff --git a/plugins/wizper/vision_agents/plugins/wizper/stt.py b/plugins/wizper/vision_agents/plugins/wizper/stt.py
@@ -38,6 +38,7 @@ async def on_error(error: str):
 import fal_client
 from getstream.video.rtc.track_util import PcmData
 from vision_agents.core import stt
+from vision_agents.core.stt import TranscriptResponse
 
 logger = logging.getLogger(__name__)
 
@@ -154,8 +155,9 @@ async def _process_audio_impl(
                 if "text" in result:
                     text = result["text"].strip()
                     if text:
+                        response_metadata = TranscriptResponse()
                         self._emit_transcript_event(
-                            text, user_metadata, {"chunks": result.get("chunks", [])}
+                            text, user_metadata, response_metadata
                         )
             finally:
                 # Clean up temporary file