Skip to content

Commit 05ccbfd

Browse files
committed
cleanup
1 parent bb834ca commit 05ccbfd

File tree

4 files changed

+87
-39
lines changed

4 files changed

+87
-39
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .stt import STT
2+
from .events import TranscriptResponse
23

3-
__all__ = ["STT"]
4+
__all__ = ["STT", "TranscriptResponse"]

agents-core/vision_agents/core/stt/events.py

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,51 @@
33
from typing import Optional, Any
44

55

6+
@dataclass
class TranscriptResponse:
    """Metadata bundle carried by STT transcript events.

    Every field is optional and defaults to ``None``, so a bare
    ``TranscriptResponse()`` is a valid "no metadata" value.

    ``model_name`` and ``words`` are declared here because the transcript
    events expose them through read-only properties that delegate to this
    object; without the fields those property reads raise ``AttributeError``.
    They are placed after ``other`` so that the positional order of the
    pre-existing fields is unchanged.
    """

    # Recognition confidence reported by the provider (scale is
    # provider-defined — TODO confirm whether it is normalised to 0..1).
    confidence: Optional[float] = None
    # Language of the transcript (presumably a language code such as 'en').
    language: Optional[str] = None
    # Time spent producing the transcript, in milliseconds.
    processing_time_ms: Optional[float] = None
    # Duration of the transcribed audio, in milliseconds.
    audio_duration_ms: Optional[float] = None
    # Free-form extras that do not fit the typed fields above.
    other: Optional[dict] = None
    # Name of the model that produced the transcript.
    model_name: Optional[str] = None
    # Word-level entries; the per-word dict schema is provider-specific.
    words: Optional[list[dict[str, Any]]] = None
13+
614
@dataclass
715
class STTTranscriptEvent(PluginBaseEvent):
816
"""Event emitted when a complete transcript is available."""
917

1018
type: str = field(default='plugin.stt_transcript', init=False)
1119
text: str = ""
12-
confidence: Optional[float] = None
13-
language: Optional[str] = None
14-
processing_time_ms: Optional[float] = None
15-
audio_duration_ms: Optional[float] = None
16-
model_name: Optional[str] = None
17-
words: Optional[list[dict[str, Any]]] = None
20+
response: TranscriptResponse = field(default_factory=TranscriptResponse)
1821
is_final: bool = True
1922

2023
def __post_init__(self):
2124
if not self.text:
2225
raise ValueError("Transcript text cannot be empty")
26+
27+
# Convenience properties for backward compatibility
28+
@property
29+
def confidence(self) -> Optional[float]:
30+
return self.response.confidence
31+
32+
@property
33+
def language(self) -> Optional[str]:
34+
return self.response.language
35+
36+
@property
37+
def processing_time_ms(self) -> Optional[float]:
38+
return self.response.processing_time_ms
39+
40+
@property
41+
def audio_duration_ms(self) -> Optional[float]:
42+
return self.response.audio_duration_ms
43+
44+
@property
45+
def model_name(self) -> Optional[str]:
46+
return self.response.model_name
47+
48+
@property
49+
def words(self) -> Optional[list[dict[str, Any]]]:
50+
return self.response.words
2351

2452

2553
@dataclass
@@ -28,13 +56,33 @@ class STTPartialTranscriptEvent(PluginBaseEvent):
2856

2957
type: str = field(default='plugin.stt_partial_transcript', init=False)
3058
text: str = ""
31-
confidence: Optional[float] = None
32-
language: Optional[str] = None
33-
processing_time_ms: Optional[float] = None
34-
audio_duration_ms: Optional[float] = None
35-
model_name: Optional[str] = None
36-
words: Optional[list[dict[str, Any]]] = None
59+
response: TranscriptResponse = field(default_factory=TranscriptResponse)
3760
is_final: bool = False
61+
62+
# Convenience properties for backward compatibility
63+
@property
64+
def confidence(self) -> Optional[float]:
65+
return self.response.confidence
66+
67+
@property
68+
def language(self) -> Optional[str]:
69+
return self.response.language
70+
71+
@property
72+
def processing_time_ms(self) -> Optional[float]:
73+
return self.response.processing_time_ms
74+
75+
@property
76+
def audio_duration_ms(self) -> Optional[float]:
77+
return self.response.audio_duration_ms
78+
79+
@property
80+
def model_name(self) -> Optional[str]:
81+
return self.response.model_name
82+
83+
@property
84+
def words(self) -> Optional[list[dict[str, Any]]]:
85+
return self.response.words
3886

3987

4088
@dataclass

agents-core/vision_agents/core/stt/stt.py

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from ..edge.types import Participant
88
from vision_agents.core.events.manager import EventManager
99
from . import events
10+
from .events import TranscriptResponse
1011

1112
logger = logging.getLogger(__name__)
1213

@@ -37,62 +38,52 @@ def __init__(
3738
def _emit_transcript_event(
3839
self,
3940
text: str,
40-
user_metadata: Optional[Union[Dict[str, Any], Participant]],
41-
metadata: Dict[str, Any],
41+
participant: Optional[Union[Dict[str, Any], Participant]],
42+
response: TranscriptResponse,
4243
):
4344
"""
4445
Emit a final transcript event with structured data.
4546
4647
Args:
4748
text: The transcribed text.
48-
user_metadata: User-specific metadata.
49-
metadata: Transcription metadata (processing time, confidence, etc.).
49+
participant: Participant metadata.
50+
response: Transcription response metadata.
5051
"""
5152
self.events.send(events.STTTranscriptEvent(
5253
session_id=self.session_id,
5354
plugin_name=self.provider_name,
5455
text=text,
55-
user_metadata=user_metadata,
56-
confidence=metadata.get("confidence"),
57-
language=metadata.get("language"),
58-
processing_time_ms=metadata.get("processing_time_ms"),
59-
audio_duration_ms=metadata.get("audio_duration_ms"),
60-
model_name=metadata.get("model_name"),
61-
words=metadata.get("words"),
56+
user_metadata=participant,
57+
response=response,
6258
))
6359

6460
def _emit_partial_transcript_event(
6561
self,
6662
text: str,
67-
user_metadata: Optional[Union[Dict[str, Any], Participant]],
68-
metadata: Dict[str, Any],
63+
participant: Optional[Union[Dict[str, Any], Participant]],
64+
response: TranscriptResponse,
6965
):
7066
"""
7167
Emit a partial transcript event with structured data.
7268
7369
Args:
7470
text: The partial transcribed text.
75-
user_metadata: User-specific metadata.
76-
metadata: Transcription metadata (processing time, confidence, etc.).
71+
participant: Participant metadata.
72+
response: Transcription response metadata.
7773
"""
7874
self.events.send(events.STTPartialTranscriptEvent(
7975
session_id=self.session_id,
8076
plugin_name=self.provider_name,
8177
text=text,
82-
user_metadata=user_metadata,
83-
confidence=metadata.get("confidence"),
84-
language=metadata.get("language"),
85-
processing_time_ms=metadata.get("processing_time_ms"),
86-
audio_duration_ms=metadata.get("audio_duration_ms"),
87-
model_name=metadata.get("model_name"),
88-
words=metadata.get("words"),
78+
user_metadata=participant,
79+
response=response,
8980
))
9081

9182
def _emit_error_event(
9283
self,
9384
error: Exception,
9485
context: str = "",
95-
user_metadata: Optional[Union[Dict[str, Any], Participant]] = None,
86+
participant: Optional[Union[Dict[str, Any], Participant]] = None,
9687
):
9788
"""
9889
Emit an error event. Note this should only be emitted for temporary errors.
@@ -103,7 +94,7 @@ def _emit_error_event(
10394
plugin_name=self.provider_name,
10495
error=error,
10596
context=context,
106-
user_metadata=user_metadata,
97+
user_metadata=participant,
10798
error_code=getattr(error, "error_code", None),
10899
is_recoverable=not isinstance(error, (SystemExit, KeyboardInterrupt)),
109100
))

docs/ai/instructions/ai-stt.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
```python
44
from vision_agents.core import stt
5+
from vision_agents.core.stt.events import TranscriptResponse
56

67
class MySTT(stt.STT):
78

@@ -24,12 +25,19 @@ class MySTT(stt.STT):
2425
parts = self.client.stt(pcm_data, stream=True)
2526
full_text = ""
2627
for part in parts:
28+
response = TranscriptResponse(
29+
confidence=0.9,
30+
language='en',
31+
processing_time_ms=300,
32+
audio_duration_ms=2000,
33+
other={}
34+
)
2735
# parts that aren't finished
28-
self._emit_partial_transcript_event(part, participant, metadata)
36+
self._emit_partial_transcript_event(part, participant, response)
2937
full_text += part
3038

3139
# the full text
32-
self._emit_transcript_event(full_text, participant, metadata)
40+
self._emit_transcript_event(full_text, participant, response)
3341

3442
```
3543

0 commit comments

Comments
 (0)