Skip to content

Commit 9ef8d75

Browse files
authored
implement end of utterance detection for AssemblyAI (vocodedev#512)
* implement end of utterance detection
* replace threshold_msg with end_utterance_silence_threshold_msg
* implement end of utterance detection with endpointing config
* standardise end_utterance_silence_threshold to be in milliseconds
* fix error
* ensure type safety for endpointing_config attribute access
* add imports to fix errors
1 parent c4eca7f commit 9ef8d75

File tree

2 files changed

+15
-5
lines changed

2 files changed

+15
-5
lines changed

vocode/streaming/models/transcriber.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,7 @@ class AssemblyAITranscriberConfig(
130130
):
131131
buffer_size_seconds: float = 0.1
132132
word_boost: Optional[List[str]] = None
133+
end_utterance_silence_threshold_milliseconds: Optional[int] = None
133134

134135

135136
class WhisperCPPTranscriberConfig(

vocode/streaming/transcriber/assembly_ai_transcriber.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from urllib.parse import urlencode
99
from vocode import getenv
1010

11-
from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig
11+
from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig, TimeEndpointingConfig, PunctuationEndpointingConfig
1212
from vocode.streaming.models.websocket import AudioMessage
1313
from vocode.streaming.transcriber.base_transcriber import (
1414
BaseAsyncTranscriber,
@@ -54,12 +54,18 @@ def __init__(
5454
)
5555
self._ended = False
5656
self.logger = logger or logging.getLogger(__name__)
57-
if self.transcriber_config.endpointing_config:
58-
raise Exception("Assembly AI endpointing config not supported yet")
59-
6057
self.buffer = bytearray()
6158
self.audio_cursor = 0
62-
self.terminate_msg = str.encode(json.dumps({"terminate_session": True}))
59+
60+
if isinstance(self.transcriber_config.endpointing_config, (TimeEndpointingConfig, PunctuationEndpointingConfig)):
61+
self.transcriber_config.end_utterance_silence_threshold_milliseconds = int(self.transcriber_config.endpointing_config.time_cutoff_seconds * 1000)
62+
self.terminate_msg = json.dumps({"terminate_session": True})
63+
self.end_utterance_silence_threshold_msg = (
64+
None if self.transcriber_config.end_utterance_silence_threshold_milliseconds is None
65+
else json.dumps(
66+
{"end_utterance_silence_threshold": self.transcriber_config.end_utterance_silence_threshold_milliseconds}
67+
)
68+
)
6369

6470
async def ready(self):
6571
return True
@@ -107,6 +113,9 @@ async def process(self):
107113
) as ws:
108114
await asyncio.sleep(0.1)
109115

116+
if self.end_utterance_silence_threshold_msg:
117+
await ws.send(self.end_utterance_silence_threshold_msg)
118+
110119
async def sender(ws): # sends audio to websocket
111120
while not self._ended:
112121
try:

0 commit comments

Comments
 (0)