Commit 3316908

Simplify TTS plugin and audio utils (#123)

- Simplified TTS plugin
- AWS Polly TTS plugin
- OpenAI TTS plugin
- Improved audio utils

1 parent 6a725b0 · commit 3316908

File tree

34 files changed: +2190 −1323 lines

DEVELOPMENT.md

Lines changed: 67 additions & 0 deletions
@@ -109,6 +109,73 @@ To see how the agent works open up agents.py
 * The LLM uses the VideoForwarder to write the video to a websocket or webrtc connection
 * The STS writes the reply on agent.llm.audio_track and the RealtimeTranscriptEvent / RealtimePartialTranscriptEvent
 
+## Audio management
+
+Some important things about audio inside the library:
+
+1. WebRTC uses Opus at 48kHz stereo, but inside the library audio is always in PCM format
+2. Plugins / AI models work with different PCM formats, usually 16kHz mono
+3. PCM data is always passed around as a `PcmData` object, which carries the sample rate, channel count and sample format
+4. Text-to-speech plugins automatically return PCM in the format needed by WebRTC. This is exposed via the `set_output_format` method
+5. Audio can be resampled with the `PcmData.resample` method
+6. When resampling audio in chunks, it is important to re-use the same `av.AudioResampler` instance (see `PcmData.resample` and `core.tts.TTS`)
+7. Converting from stereo to mono and vice versa is also done with the `PcmData.resample` method
+
+Some ground rules:
+
+1. Do not write new code to resample / adjust audio unless `PcmData` does not already cover it
+2. Do not pass PCM around as plain bytes or write code that assumes a specific sample rate or format. Use `PcmData` instead
+
+## Example
+
+```python
+import asyncio
+
+from openai import AsyncOpenAI
+
+from vision_agents.core.edge.types import PcmData
+
+
+async def example():
+    client = AsyncOpenAI(api_key="sk-42")
+
+    resp = await client.audio.speech.create(
+        model="gpt-4o-mini-tts",
+        voice="alloy",
+        input="pcm is cool, give me some of that please",
+        response_format="pcm",
+    )
+
+    # load the response into PcmData; note that you need to specify
+    # sample_rate, channels and format yourself
+    pcm_data = PcmData.from_bytes(
+        resp.content, sample_rate=24_000, channels=1, format="s16"
+    )
+
+    # check whether pcm_data is stereo (it's not, in this case)
+    print(pcm_data.stereo)
+
+    # write the pcm to a wav file
+    with open("test.wav", "wb") as f:
+        f.write(pcm_data.to_wav_bytes())
+
+    # resample the pcm to 48kHz stereo
+    resampled_pcm = pcm_data.resample(48_000, 2)
+
+    # play the pcm out loud using ffplay
+    from vision_agents.core.edge.types import play_pcm_with_ffplay
+
+    await play_pcm_with_ffplay(resampled_pcm)
+
+
+if __name__ == "__main__":
+    asyncio.run(example())
+```
+
+### Testing audio manually
+
+Sometimes you need to test audio manually; here are some tips:
+
+1. Do not use earplugs when testing PCM playback ;)
+2. You can use the `PcmData.to_wav_bytes` method to convert PCM into WAV bytes (see `manual_tts_to_wav` for an example)
+3. If you have `ffplay` installed, you can play back PCM directly to check whether the audio is correct
+
 ## Dev / Contributor Guidelines
 
 ### Light wrapping
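
A note on point 6 of the audio-management list above: `PcmData.resample` is the library API to use, but the rule about reusing one resampler is easier to see in raw PyAV. Below is a minimal sketch assuming 16kHz mono s16 input; it is illustrative, not the library's actual implementation.

```python
import av
import numpy as np

# Create ONE resampler and reuse it for every chunk: it keeps internal
# filter state between calls, so chunk boundaries stay free of clicks.
# Recreating it per chunk resets that state and degrades the audio.
resampler = av.AudioResampler(format="s16", layout="stereo", rate=48_000)


def resample_chunk(chunk: bytes, in_rate: int = 16_000) -> bytes:
    """Resample one chunk of s16 mono PCM to 48kHz stereo."""
    samples = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
    frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono")
    frame.sample_rate = in_rate
    # recent PyAV versions return a list of frames per resample() call
    return b"".join(f.to_ndarray().tobytes() for f in resampler.resample(frame))
```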
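For tip 3 in "Testing audio manually", the `play_pcm_with_ffplay` helper used in the example already covers this; a hand-rolled equivalent would pipe raw PCM into ffplay roughly as below. The flags are standard ffplay options, but the helper's real implementation may differ.

```python
import asyncio


async def play_raw_pcm(pcm: bytes, sample_rate: int = 48_000, channels: int = 2) -> None:
    """Pipe raw s16le PCM into ffplay for a quick manual listen."""
    proc = await asyncio.create_subprocess_exec(
        "ffplay",
        "-f", "s16le",            # raw signed 16-bit little-endian samples
        "-ar", str(sample_rate),  # sample rate of the data
        "-ac", str(channels),     # channel count (newer ffplay builds may use -ch_layout)
        "-autoexit", "-nodisp",   # exit at end of input, no video window
        "-i", "pipe:0",           # read from stdin
        stdin=asyncio.subprocess.PIPE,
    )
    await proc.communicate(pcm)
```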

agents-core/vision_agents/core/agents/agents.py

Lines changed: 17 additions & 12 deletions
@@ -5,7 +5,6 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 from uuid import uuid4
 
-import aiortc
 import getstream.models
 from aiortc import VideoStreamTrack
 from getstream.video.rtc import Call
@@ -15,7 +14,7 @@
 from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
 from ..edge import sfu_events
 from ..edge.events import AudioReceivedEvent, TrackAddedEvent, CallEndedEvent
-from ..edge.types import Connection, Participant, PcmData, User
+from ..edge.types import Connection, Participant, PcmData, User, OutputAudioTrack
 from ..events.manager import EventManager
 from ..llm import events as llm_events
 from ..llm.events import (
@@ -32,6 +31,7 @@
 from ..stt.events import STTTranscriptEvent, STTErrorEvent
 from ..stt.stt import STT
 from ..tts.tts import TTS
+from ..tts.events import TTSAudioEvent
 from ..turn_detection import TurnDetector, TurnStartedEvent, TurnEndedEvent
 from ..vad import VAD
 from ..vad.events import VADAudioEvent
@@ -160,7 +160,7 @@ def __init__(
         self._callback_executed = False
         self._track_tasks: Dict[str, asyncio.Task] = {}
         self._connection: Optional[Connection] = None
-        self._audio_track: Optional[aiortc.AudioStreamTrack] = None
+        self._audio_track: Optional[OutputAudioTrack] = None
         self._video_track: Optional[VideoStreamTrack] = None
         self._realtime_connection = None
         self._pc_track_handler_attached: bool = False
@@ -307,6 +307,11 @@ async def on_realtime_agent_speech_transcription(
                 original=event,
             )
 
+        @self.events.subscribe
+        async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
+            if self._audio_track and event and event.audio_data is not None:
+                await self._audio_track.write(event.audio_data)
+
         @self.events.subscribe
         async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
             if self.realtime_mode or not self.llm:
@@ -1021,19 +1026,19 @@ def _prepare_rtc(self):
             self._audio_track = self.llm.output_track
             self.logger.info("🎵 Using Realtime provider output track for audio")
         else:
-            # TODO: what if we want to transform audio...
-            # Get the required framerate and stereo setting from TTS plugin, default to 48000 for WebRTC
-            if self.tts:
-                framerate = self.tts.get_required_framerate()
-                stereo = self.tts.get_required_stereo()
-            else:
-                framerate = 48000
-                stereo = True  # Default to stereo for WebRTC
+            # Default to WebRTC-friendly format unless configured differently
+            framerate = 48000
+            stereo = True
             self._audio_track = self.edge.create_audio_track(
                 framerate=framerate, stereo=stereo
             )
+        # Inform TTS of desired output format so it can resample accordingly
         if self.tts:
-            self.tts.set_output_track(self._audio_track)
+            channels = 2 if stereo else 1
+            self.tts.set_output_format(
+                sample_rate=framerate,
+                channels=channels,
+            )
 
         # Set up video track if video publishers are available
         if self.publish_video:
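
The net effect of the agents.py changes: the old pull model (ask the TTS for its required framerate, then hand it the output track via `set_output_track`) becomes a push model. The agent picks the WebRTC format, announces it via `set_output_format`, and the TTS emits `TTSAudioEvent`s that the new `_on_tts_audio_write_to_output` subscriber forwards to the track. A minimal sketch of the plugin side of that contract; only the method and event names come from this diff, the body is assumed:

```python
from vision_agents.core.edge.types import PcmData


class SketchTTS:
    """Illustrative only; not the actual core.tts.TTS implementation."""

    def __init__(self) -> None:
        # format requested by the agent; 48kHz stereo is the WebRTC default
        self._sample_rate = 48_000
        self._channels = 2

    def set_output_format(self, sample_rate: int, channels: int) -> None:
        # called from Agent._prepare_rtc() after the audio track is created
        self._sample_rate = sample_rate
        self._channels = channels

    def _to_output_format(self, pcm: PcmData) -> PcmData:
        # normalize provider-native PCM (e.g. 24kHz mono) so that the
        # TTSAudioEvent subscriber can write it straight to the output track
        return pcm.resample(self._sample_rate, self._channels)
```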

agents-core/vision_agents/core/edge/edge_transport.py

Lines changed: 6 additions & 5 deletions
@@ -1,17 +1,17 @@
 """
 Abstraction for stream vs other services here
 """
+
 import abc
 
 from typing import TYPE_CHECKING, Any, Optional
 
 import aiortc
 from pyee.asyncio import AsyncIOEventEmitter
 
-from vision_agents.core.edge.types import User
+from vision_agents.core.edge.types import User, OutputAudioTrack
 
 if TYPE_CHECKING:
-
     pass
 
 
@@ -31,7 +31,7 @@ async def create_user(self, user: User):
         pass
 
     @abc.abstractmethod
-    def create_audio_track(self):
+    def create_audio_track(self) -> OutputAudioTrack:
         pass
 
     @abc.abstractmethod
@@ -55,6 +55,7 @@ async def create_conversation(self, call: Any, user: User, instructions):
         pass
 
     @abc.abstractmethod
-    def add_track_subscriber(self, track_id: str) -> Optional[aiortc.mediastreams.MediaStreamTrack]:
+    def add_track_subscriber(
+        self, track_id: str
+    ) -> Optional[aiortc.mediastreams.MediaStreamTrack]:
         pass
-
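
The `OutputAudioTrack` type itself is not shown in this commit. From its call sites (`edge.create_audio_track(framerate=..., stereo=...)` returning one, and `await self._audio_track.write(event.audio_data)` in agents.py) its shape is presumably close to the Protocol below; a guess for orientation, not the actual definition in `edge/types.py`:

```python
from typing import Protocol

from vision_agents.core.edge.types import PcmData


class OutputAudioTrackSketch(Protocol):
    """Hypothetical shape of OutputAudioTrack, inferred from call sites."""

    async def write(self, pcm: PcmData) -> None:
        """Queue PCM for playback on the outbound WebRTC audio track."""
        ...
```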
