livekit
diff --git a/‎.changeset/big-cars-join.md‎
Lines changed: 13 additions & 0 deletions b/‎.changeset/big-cars-join.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎livekit-agents/livekit/agents/tts/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎livekit-agents/livekit/agents/tts/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎livekit-agents/livekit/agents/tts/tts.py‎
Lines changed: 45 additions & 0 deletions b/‎livekit-agents/livekit/agents/tts/tts.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎livekit-agents/livekit/agents/utils/codecs/__init__.py‎
Lines changed: 1 addition & 2 deletions b/‎livekit-agents/livekit/agents/utils/codecs/__init__.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎livekit-agents/livekit/agents/utils/codecs/mp3.py‎
Lines changed: 0 additions & 85 deletions b/‎livekit-agents/livekit/agents/utils/codecs/mp3.py‎
Lines changed: 0 additions & 85 deletions
diff --git a/‎livekit-agents/livekit/agents/utils/connection_pool.py‎
Lines changed: 5 additions & 0 deletions b/‎livekit-agents/livekit/agents/utils/connection_pool.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎livekit-plugins/livekit-plugins-aws/livekit/plugins/aws/log.py‎
Lines changed: 2 additions & 0 deletions b/‎livekit-plugins/livekit-plugins-aws/livekit/plugins/aws/log.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎livekit-plugins/livekit-plugins-aws/livekit/plugins/aws/models.py‎
Lines changed: 1 addition & 1 deletion b/‎livekit-plugins/livekit-plugins-aws/livekit/plugins/aws/models.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎livekit-plugins/livekit-plugins-aws/livekit/plugins/aws/tts.py‎
Lines changed: 32 additions & 34 deletions b/‎livekit-plugins/livekit-plugins-aws/livekit/plugins/aws/tts.py‎
Lines changed: 32 additions & 34 deletions
@@ -0,0 +1,13 @@
+---
+"livekit-plugins-elevenlabs": minor
+"livekit-plugins-deepgram": minor
+"livekit-plugins-cartesia": patch
+"livekit-plugins-google": patch
+"livekit-plugins-openai": patch
+"livekit-plugins-playai": patch
+"livekit-plugins-rime": patch
+"livekit-plugins-aws": patch
+"livekit-agents": patch
+---
+
+Use the streaming AudioStreamDecoder to handle compressed audio encodings
@@ -9,6 +9,7 @@
     TTS,
     ChunkedStream,
     SynthesizedAudio,
+    SynthesizedAudioEmitter,
     SynthesizeStream,
     TTSCapabilities,
 )
@@ -25,4 +26,5 @@
     "FallbackAdapter",
     "FallbackChunkedStream",
     "FallbackSynthesizeStream",
+    "SynthesizedAudioEmitter",
 ]
@@ -430,3 +430,48 @@ async def __aexit__(
         exc_tb: TracebackType | None,
     ) -> None:
         await self.aclose()
+
+
+class SynthesizedAudioEmitter:
+    """Utility for buffering and emitting audio frames with metadata to a channel.
+
+    This class helps TTS implementers to correctly handle is_final logic when streaming responses.
+    """
+
+    def __init__(
+        self,
+        *,
+        event_ch: aio.Chan[SynthesizedAudio],
+        request_id: str,
+        segment_id: str = "",
+    ) -> None:
+        self._event_ch = event_ch
+        self._frame: rtc.AudioFrame | None = None  # single-frame lookahead buffer; None when empty
+        self._request_id = request_id
+        self._segment_id = segment_id
+
+    def push(self, frame: Optional[rtc.AudioFrame]):
+        """Emits any buffered frame and stores the new frame for later emission.
+
+        The buffered frame is emitted as not final.
+        """
+        self._emit_frame(is_final=False)  # release the previously held frame (if any) as non-final
+        self._frame = frame  # hold the newest frame back: it may turn out to be the last one
+
+    def _emit_frame(self, is_final: bool = False):
+        """Sends the buffered frame to the event channel if one exists."""
+        if self._frame is None:  # nothing buffered, so nothing to emit
+            return
+        self._event_ch.send_nowait(
+            SynthesizedAudio(
+                frame=self._frame,
+                request_id=self._request_id,
+                segment_id=self._segment_id,
+                is_final=is_final,
+            )
+        )
+        self._frame = None  # clear the buffer so the same frame is never sent twice
+
+    def flush(self):
+        """Emits any buffered frame as final."""
+        self._emit_frame(is_final=True)  # no-op when the buffer is empty (see _emit_frame)
@@ -13,6 +13,5 @@
 # limitations under the License.
 
 from .decoder import AudioStreamDecoder, StreamBuffer
-from .mp3 import Mp3StreamDecoder
 
-__all__ = ["Mp3StreamDecoder", "AudioStreamDecoder", "StreamBuffer"]
+__all__ = ["AudioStreamDecoder", "StreamBuffer"]
@@ -24,17 +24,20 @@ def __init__(
         self,
         *,
         max_session_duration: Optional[float] = None,
+        mark_refreshed_on_get: bool = False,
         connect_cb: Optional[Callable[[], Awaitable[T]]] = None,
         close_cb: Optional[Callable[[T], Awaitable[None]]] = None,
     ) -> None:
         """Initialize the connection wrapper.
 
         Args:
             max_session_duration: Maximum duration in seconds before forcing reconnection
+            mark_refreshed_on_get: If True, the session will be marked as fresh when get() is called. Only has an effect when max_session_duration is set.
             connect_cb: Optional async callback to create new connections
             close_cb: Optional async callback to close connections
         """
         self._max_session_duration = max_session_duration
+        self._mark_refreshed_on_get = mark_refreshed_on_get
         self._connect_cb = connect_cb
         self._close_cb = close_cb
         self._connections: dict[T, float] = {}  # conn -> connected_at timestamp
@@ -95,6 +98,8 @@ async def get(self) -> T:
                 self._max_session_duration is None
                 or now - self._connections[conn] <= self._max_session_duration
             ):
+                if self._mark_refreshed_on_get:
+                    self._connections[conn] = now
                 return conn
             # connection expired; mark it for resetting.
             self.remove(conn)
 
@@ -1,3 +1,5 @@
 import logging
 
 logger = logging.getLogger("livekit.plugins.aws")
+for logger_name in ["botocore", "aiobotocore"]:  # AWS SDK loggers; presumably pinned to INFO to drop their DEBUG output
+    logging.getLogger(logger_name).setLevel(logging.INFO)
@@ -45,4 +45,4 @@
     "de-CH",
 ]
 
-TTS_OUTPUT_FORMAT = Literal["pcm", "mp3"]
+TTS_OUTPUT_FORMAT = Literal["mp3"]
@@ -18,7 +18,6 @@
 
 import aiohttp
 from aiobotocore.session import AioSession, get_session
-from livekit import rtc
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -29,10 +28,9 @@
 )
 
 from ._utils import _get_aws_credentials
-from .models import TTS_LANGUAGE, TTS_OUTPUT_FORMAT, TTS_SPEECH_ENGINE
+from .models import TTS_LANGUAGE, TTS_SPEECH_ENGINE
 
 TTS_NUM_CHANNELS: int = 1
-DEFAULT_OUTPUT_FORMAT: TTS_OUTPUT_FORMAT = "pcm"
 DEFAULT_SPEECH_ENGINE: TTS_SPEECH_ENGINE = "generative"
 DEFAULT_SPEECH_REGION = "us-east-1"
 DEFAULT_VOICE = "Ruth"
@@ -43,7 +41,6 @@
 class _TTSOptions:
     # https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html
     voice: str | None
-    output_format: TTS_OUTPUT_FORMAT
     speech_engine: TTS_SPEECH_ENGINE
     speech_region: str
     sample_rate: int
@@ -56,7 +53,6 @@ def __init__(
         *,
         voice: str | None = DEFAULT_VOICE,
         language: TTS_LANGUAGE | str | None = None,
-        output_format: TTS_OUTPUT_FORMAT = DEFAULT_OUTPUT_FORMAT,
         speech_engine: TTS_SPEECH_ENGINE = DEFAULT_SPEECH_ENGINE,
         sample_rate: int = DEFAULT_SAMPLE_RATE,
         speech_region: str = DEFAULT_SPEECH_REGION,
@@ -75,7 +71,6 @@ def __init__(
         Args:
             Voice (TTSModels, optional): Voice ID to use for the synthesis. Defaults to "Ruth".
             language (TTS_LANGUAGE, optional): language code for the Synthesize Speech request. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).
-            output_format(TTS_OUTPUT_FORMAT, optional): The format in which the returned output will be encoded. Defaults to "pcm".
             sample_rate(int, optional): The audio frequency specified in Hz. Defaults to 16000.
             speech_engine(TTS_SPEECH_ENGINE, optional): The engine to use for the synthesis. Defaults to "generative".
             speech_region(str, optional): The region to use for the synthesis. Defaults to "us-east-1".
@@ -96,7 +91,6 @@ def __init__(
 
         self._opts = _TTSOptions(
             voice=voice,
-            output_format=output_format,
             speech_engine=speech_engine,
             speech_region=speech_region,
             language=language,
@@ -149,7 +143,7 @@ async def _run(self):
             async with self._get_client() as client:
                 params = {
                     "Text": self._input_text,
-                    "OutputFormat": self._opts.output_format,
+                    "OutputFormat": "mp3",
                     "Engine": self._opts.speech_engine,
                     "VoiceId": self._opts.voice,
                     "TextType": "text",
@@ -158,32 +152,36 @@ async def _run(self):
                 }
                 response = await client.synthesize_speech(**_strip_nones(params))
                 if "AudioStream" in response:
-                    decoder = utils.codecs.Mp3StreamDecoder()
-                    async with response["AudioStream"] as resp:
-                        async for data, _ in resp.content.iter_chunks():
-                            if self._opts.output_format == "mp3":
-                                frames = decoder.decode_chunk(data)
-                                for frame in frames:
-                                    self._event_ch.send_nowait(
-                                        tts.SynthesizedAudio(
-                                            request_id=request_id,
-                                            segment_id=self._segment_id,
-                                            frame=frame,
-                                        )
-                                    )
-                            else:
-                                self._event_ch.send_nowait(
-                                    tts.SynthesizedAudio(
-                                        request_id=request_id,
-                                        segment_id=self._segment_id,
-                                        frame=rtc.AudioFrame(
-                                            data=data,
-                                            sample_rate=self._opts.sample_rate,
-                                            num_channels=1,
-                                            samples_per_channel=len(data) // 2,
-                                        ),
-                                    )
-                                )
+                    decoder = utils.codecs.AudioStreamDecoder(
+                        sample_rate=self._opts.sample_rate,
+                        num_channels=1,
+                    )
+
+                    # Create a task to push data to the decoder
+                    async def push_data():
+                        try:
+                            async with response["AudioStream"] as resp:
+                                async for data, _ in resp.content.iter_chunks():
+                                    decoder.push(data)  # feed the raw MP3 bytes to the decoder as they arrive
+                        finally:
+                            decoder.end_input()  # runs even on error/cancel; presumably ends the `async for frame in decoder` loop - confirm
+
+                    # Start pushing data to the decoder
+                    push_task = asyncio.create_task(push_data())
+
+                    try:
+                        # Create emitter and process decoded frames
+                        emitter = tts.SynthesizedAudioEmitter(
+                            event_ch=self._event_ch,
+                            request_id=request_id,
+                            segment_id=self._segment_id,
+                        )
+                        async for frame in decoder:
+                            emitter.push(frame)
+                        emitter.flush()
+                        await push_task
+                    finally:
+                        await utils.aio.gracefully_cancel(push_task)
 
         except asyncio.TimeoutError as e:
             raise APITimeoutError() from e
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`TTS,`
`10`	`10`	`ChunkedStream,`
`11`	`11`	`SynthesizedAudio,`
	`12`	`+ SynthesizedAudioEmitter,`
`12`	`13`	`SynthesizeStream,`
`13`	`14`	`TTSCapabilities,`
`14`	`15`	`)`
`@@ -25,4 +26,5 @@`
`25`	`26`	`"FallbackAdapter",`
`26`	`27`	`"FallbackChunkedStream",`
`27`	`28`	`"FallbackSynthesizeStream",`
	`29`	`+ "SynthesizedAudioEmitter",`
`28`	`30`	`]`
Original file line number	Diff line number	Diff line change
`@@ -45,4 +45,4 @@`
`45`	`45`	`"de-CH",`
`46`	`46`	`]`
`47`	`47`
`48`		`-TTS_OUTPUT_FORMAT = Literal["pcm", "mp3"]`
	`48`	`+TTS_OUTPUT_FORMAT = Literal["mp3"]`