Skip to content

Commit f185120

Browse files
committed
more cleanup
1 parent 05ccbfd commit f185120

File tree

6 files changed

+24
-57
lines changed

6 files changed

+24
-57
lines changed

agents-core/vision_agents/core/agents/agents.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -309,7 +309,6 @@ async def on_realtime_agent_speech_transcription(
309309

310310
@self.events.subscribe
311311
async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
312-
import pdb; pdb.set_trace()
313312
if self.realtime_mode or not self.llm:
314313
# when running in realtime mode, there is no need to send the response to the LLM
315314
return

agents-core/vision_agents/core/stt/events.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ class TranscriptResponse:
99
language: Optional[str] = None
1010
processing_time_ms: Optional[float] = None
1111
audio_duration_ms: Optional[float] = None
12+
model_name: Optional[str] = None
1213
other: Optional[dict] = None
1314

1415
@dataclass
@@ -44,10 +45,6 @@ def audio_duration_ms(self) -> Optional[float]:
4445
@property
4546
def model_name(self) -> Optional[str]:
4647
return self.response.model_name
47-
48-
@property
49-
def words(self) -> Optional[list[dict[str, Any]]]:
50-
return self.response.words
5148

5249

5350
@dataclass
@@ -79,10 +76,6 @@ def audio_duration_ms(self) -> Optional[float]:
7976
@property
8077
def model_name(self) -> Optional[str]:
8178
return self.response.model_name
82-
83-
@property
84-
def words(self) -> Optional[list[dict[str, Any]]]:
85-
return self.response.words
8679

8780

8881
@dataclass

plugins/deepgram/vision_agents/plugins/deepgram/stt.py

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from getstream.video.rtc.track_util import PcmData
2121

2222
from vision_agents.core import stt
23+
from vision_agents.core.stt import TranscriptResponse
2324

2425
from .utils import generate_silence
2526

@@ -217,28 +218,25 @@ async def _on_message(
217218
# Check if this is a final result
218219
is_final = transcript.get("is_final", False)
219220

220-
# Create metadata with useful information
221-
metadata = {
222-
"confidence": alternatives[0].get("confidence", 0),
223-
"words": alternatives[0].get("words", []),
224-
"is_final": is_final,
225-
"channel_index": transcript.get("channel_index", 0),
226-
}
221+
# Create response metadata
222+
response_metadata = TranscriptResponse(
223+
confidence=alternatives[0].get("confidence", 0),
224+
)
227225

228226
# Emit immediately for real-time responsiveness
229227
if is_final:
230-
self._emit_transcript_event(transcript_text, self._current_user, metadata)
228+
self._emit_transcript_event(transcript_text, self._current_user, response_metadata)
231229
else:
232230
self._emit_partial_transcript_event(
233-
transcript_text, self._current_user, metadata
231+
transcript_text, self._current_user, response_metadata
234232
)
235233

236234
logger.debug(
237235
"Received transcript",
238236
extra={
239237
"is_final": is_final,
240238
"text_length": len(transcript_text),
241-
"confidence": metadata["confidence"],
239+
"confidence": response_metadata.confidence,
242240
},
243241
)
244242

plugins/fish/vision_agents/plugins/fish/stt.py

Lines changed: 9 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,9 @@
99
from getstream.video.rtc.track_util import PcmData
1010

1111
from vision_agents.core import stt
12+
from vision_agents.core.stt import TranscriptResponse
1213

13-
if TYPE_CHECKING:
14-
from vision_agents.core.edge.types import Participant
14+
from vision_agents.core.edge.types import Participant
1515

1616
logger = logging.getLogger(__name__)
1717

@@ -126,23 +126,12 @@ async def process_audio(
126126
logger.error("No transcript returned from Fish Audio %s", pcm_data.duration)
127127
return None
128128

129-
# Build metadata from response
130-
metadata: Dict[str, Any] = {
131-
"audio_duration_ms": response.duration,
132-
"language": self.language or "auto",
133-
"model_name": "fish-audio-asr",
134-
}
135-
136-
# Include segments if timestamps were requested
137-
if not self.ignore_timestamps and response.segments:
138-
metadata["segments"] = [
139-
{
140-
"text": segment.text,
141-
"start": segment.start,
142-
"end": segment.end,
143-
}
144-
for segment in response.segments
145-
]
129+
# Build response metadata
130+
response_metadata = TranscriptResponse(
131+
audio_duration_ms=response.duration,
132+
language=self.language or "auto",
133+
model_name="fish-audio-asr",
134+
)
146135

147136
logger.debug(
148137
"Received transcript from Fish Audio",
@@ -152,7 +141,7 @@ async def process_audio(
152141
},
153142
)
154143

155-
self._emit_transcript_event(transcript_text, participant, metadata)
144+
self._emit_transcript_event(transcript_text, participant, response_metadata)
156145

157146
except Exception as e:
158147
logger.error(
@@ -162,12 +151,3 @@ async def process_audio(
162151
# Let the base class handle error emission
163152
raise
164153

165-
async def close(self):
166-
"""Close the Fish Audio STT service and clean up resources."""
167-
if self._is_closed:
168-
logger.debug("Fish Audio STT service already closed")
169-
return
170-
171-
logger.info("Closing Fish Audio STT service")
172-
await super().close()
173-

plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -96,15 +96,10 @@ async def recv(self) -> av.frame.Frame:
9696
pts, time_base = await self.next_timestamp()
9797

9898
# Create av.VideoFrame from PIL Image
99-
try:
100-
av_frame = self.last_frame
101-
102-
av_frame.pts = pts
103-
av_frame.time_base = time_base
104-
except Exception:
105-
import pdb
99+
av_frame = self.last_frame
106100

107-
pdb.set_trace()
101+
av_frame.pts = pts
102+
av_frame.time_base = time_base
108103

109104
# if frame_received:
110105
# logger.info(f"Returning NEW video frame: {av_frame.width}x{av_frame.height}")

plugins/wizper/vision_agents/plugins/wizper/stt.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ async def on_error(error: str):
3838
import fal_client
3939
from getstream.video.rtc.track_util import PcmData
4040
from vision_agents.core import stt
41+
from vision_agents.core.stt import TranscriptResponse
4142

4243
logger = logging.getLogger(__name__)
4344

@@ -154,8 +155,9 @@ async def _process_audio_impl(
154155
if "text" in result:
155156
text = result["text"].strip()
156157
if text:
158+
response_metadata = TranscriptResponse()
157159
self._emit_transcript_event(
158-
text, user_metadata, {"chunks": result.get("chunks", [])}
160+
text, user_metadata, response_metadata
159161
)
160162
finally:
161163
# Clean up temporary file

0 commit comments

Comments
 (0)