Skip to content

Commit b7562a6

Browse files
authored
handle decoder errors and tts prewarm (#1587)
- added a `tts.prewarm` method to start the connection pool early; we should do this automatically in v1 — currently just the stubs
- in `agent_output`, combined the streaming and non-streaming synthesis paths
- fixed a bug in `AudioStreamDecoder` where it could fail on close
- deprecated ElevenLabs' `optimize_stream_latency` option
1 parent b92a506 commit b7562a6

File tree

19 files changed

+164
-99
lines changed

19 files changed

+164
-99
lines changed

.changeset/blue-ants-heal.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
"livekit-plugins-elevenlabs": patch
3+
"livekit-plugins-cartesia": patch
4+
"livekit-plugins-deepgram": patch
5+
"livekit-agents": patch
6+
---
7+
8+
added a tts.prewarm method to start the connection pool early.

.changeset/eleven-brooms-watch.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-agents": patch
3+
---
4+
5+
fixed a bug in AudioStreamDecoder where it could fail on close
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-plugins-elevenlabs": patch
3+
---
4+
5+
deprecated ElevenLabs' `optimize_stream_latency` option

examples/voice-pipeline-agent/function_calling_weather.py

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1+
import asyncio
12
import logging
2-
import random
33
from typing import Annotated
44

55
import aiohttp
@@ -11,8 +11,9 @@
1111
WorkerOptions,
1212
cli,
1313
llm,
14+
metrics,
1415
)
15-
from livekit.agents.pipeline import AgentCallContext, VoicePipelineAgent
16+
from livekit.agents.pipeline import VoicePipelineAgent
1617
from livekit.plugins import deepgram, openai, silero
1718

1819
load_dotenv()
@@ -50,25 +51,21 @@ async def get_weather(
5051
# that it might take awhile:
5152
# Option 1: you can use .say filler message immediately after the call is triggered
5253
# Option 2: you can prompt the agent to return a text response when it's making a function call
53-
agent = AgentCallContext.get_current().agent
54-
55-
if (
56-
not agent.chat_ctx.messages
57-
or agent.chat_ctx.messages[-1].role != "assistant"
58-
):
59-
# skip if assistant already said something
60-
filler_messages = [
61-
"Let me check the weather in {location} for you.",
62-
"Let me see what the weather is like in {location} right now.",
63-
# LLM will complete this sentence if it is added to the end of the chat context
64-
"The current weather in {location} is ",
65-
]
66-
message = random.choice(filler_messages).format(location=location)
67-
logger.info(f"saying filler message: {message}")
68-
69-
# NOTE: set add_to_chat_ctx=True will add the message to the end
70-
# of the chat context of the function call for answer synthesis
71-
speech_handle = await agent.say(message, add_to_chat_ctx=True) # noqa: F841
54+
55+
# uncomment for option 1
56+
# agent = AgentCallContext.get_current().agent
57+
# filler_messages = [
58+
# "Let me check the weather in {location} for you.",
59+
# "Let me see what the weather is like in {location} right now.",
60+
# # LLM will complete this sentence if it is added to the end of the chat context
61+
# "The current weather in {location} is ",
62+
# ]
63+
# message = random.choice(filler_messages).format(location=location)
64+
# logger.info(f"saying filler message: {message}")
65+
66+
# NOTE: set add_to_chat_ctx=True will add the message to the end
67+
# of the chat context of the function call for answer synthesis
68+
# speech_handle = await agent.say(message, add_to_chat_ctx=True) # noqa: F841
7269

7370
logger.info(f"getting weather for {latitude}, {longitude}")
7471
url = f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m"
@@ -82,13 +79,17 @@ async def get_weather(
8279
"temperature": data["current"]["temperature_2m"],
8380
"temperature_unit": "Celsius",
8481
}
85-
logger.info(f"weather data: {weather_data}")
8682
else:
8783
raise Exception(
8884
f"Failed to get weather data, status code: {response.status}"
8985
)
9086

87+
# artificially delay the function call for testing
88+
await asyncio.sleep(2)
89+
logger.info(f"weather data: {weather_data}")
90+
9191
# (optional) To wait for the speech to finish before giving results of the function call
92+
# without waiting, the new speech result will be queued and played after current speech is finished
9293
# await speech_handle.join()
9394
return weather_data
9495

@@ -106,26 +107,37 @@ async def entrypoint(ctx: JobContext):
106107
"You are a weather assistant created by LiveKit. Your interface with users will be voice. "
107108
"You will provide weather information for a given location. "
108109
# when using option 1, you can suppress from the agent with prompt
109-
"do not return any text while calling the function."
110-
# uncomment this to use option 2
111-
# "when performing function calls, let user know that you are checking the weather."
110+
# "do not return any text while calling the function."
111+
# option 2 - using LLM to generate text for the function call
112+
"when performing function calls, let user know that you are checking the weather."
112113
),
113114
role="system",
114115
)
115116
participant = await ctx.wait_for_participant()
116117
agent = VoicePipelineAgent(
117118
vad=ctx.proc.userdata["vad"],
118119
stt=deepgram.STT(),
119-
llm=openai.LLM(model="gpt-4o-mini"),
120+
llm=openai.LLM(model="gpt-4o"),
120121
tts=openai.TTS(),
121122
fnc_ctx=fnc_ctx,
122123
chat_ctx=initial_chat_ctx,
123124
)
124125

126+
usage_collector = metrics.UsageCollector()
127+
128+
@agent.on("metrics_collected")
129+
def _on_metrics_collected(mtrcs: metrics.AgentMetrics):
130+
metrics.log_metrics(mtrcs)
131+
usage_collector.collect(mtrcs)
132+
133+
async def log_usage():
134+
summary = usage_collector.get_summary()
135+
logger.info(f"Usage: ${summary}")
136+
125137
# Start the assistant. This will automatically publish a microphone track and listen to the participant.
126138
agent.start(ctx.room, participant)
127139
await agent.say(
128-
"Hello from the weather station. Would you like to know the weather? If so, tell me your location."
140+
"Hello from the weather station. Tell me your location to check the weather."
129141
)
130142

131143

livekit-agents/livekit/agents/cli/log.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
"watchfiles",
2020
"anthropic",
2121
"websockets.client",
22+
"botocore",
23+
"aiobotocore",
2224
]
2325

2426

livekit-agents/livekit/agents/pipeline/agent_output.py

Lines changed: 8 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,16 @@ async def _synthesize_task(self, handle: SynthesisHandle) -> None:
173173
if isinstance(transcript_source, Awaitable):
174174
transcript_source = await transcript_source
175175

176+
tts_stream: AsyncIterable[str] | None = None
176177
if isinstance(tts_source, str):
177-
co = self._str_synthesis_task(tts_source, transcript_source, handle)
178+
# wrap in async iterator
179+
async def string_to_stream(text: str):
180+
yield text
181+
182+
tts_stream = string_to_stream(tts_source)
178183
else:
179-
co = self._stream_synthesis_task(tts_source, transcript_source, handle)
184+
tts_stream = tts_source
185+
co = self._stream_synthesis_task(tts_stream, transcript_source, handle)
180186

181187
synth = asyncio.create_task(co)
182188
synth.add_done_callback(lambda _: handle._buf_ch.close())
@@ -205,41 +211,6 @@ async def _read_transcript_task(
205211
if inspect.isasyncgen(transcript_source):
206212
await transcript_source.aclose()
207213

208-
@utils.log_exceptions(logger=logger)
209-
async def _str_synthesis_task(
210-
self,
211-
tts_text: str,
212-
transcript_source: AsyncIterable[str] | str,
213-
handle: SynthesisHandle,
214-
) -> None:
215-
"""synthesize speech from a string"""
216-
read_transcript_atask: asyncio.Task | None = None
217-
218-
first_frame = True
219-
tts_stream = handle._tts.synthesize(tts_text)
220-
try:
221-
async for audio in tts_stream:
222-
if first_frame:
223-
first_frame = False
224-
read_transcript_atask = asyncio.create_task(
225-
self._read_transcript_task(transcript_source, handle)
226-
)
227-
228-
handle._buf_ch.send_nowait(audio.frame)
229-
if not handle.tts_forwarder.closed:
230-
handle.tts_forwarder.push_audio(audio.frame)
231-
232-
if not handle.tts_forwarder.closed:
233-
handle.tts_forwarder.mark_audio_segment_end()
234-
235-
if read_transcript_atask is not None:
236-
await read_transcript_atask
237-
finally:
238-
await tts_stream.aclose()
239-
240-
if read_transcript_atask is not None:
241-
await utils.aio.gracefully_cancel(read_transcript_atask)
242-
243214
@utils.log_exceptions(logger=logger)
244215
async def _stream_synthesis_task(
245216
self,

livekit-agents/livekit/agents/tts/fallback_adapter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ def stream(
138138
conn_options=conn_options or DEFAULT_FALLBACK_API_CONNECT_OPTIONS,
139139
)
140140

141+
def prewarm(self) -> None:
142+
if self._tts_instances:
143+
self._tts_instances[0].prewarm()
144+
141145
async def aclose(self) -> None:
142146
for tts_status in self._status:
143147
if tts_status.recovering_task is not None:

livekit-agents/livekit/agents/tts/stream_adapter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ def stream(
5555
sentence_tokenizer=self._sentence_tokenizer,
5656
)
5757

58+
def prewarm(self) -> None:
59+
self._tts.prewarm()
60+
5861

5962
class StreamAdapterWrapper(SynthesizeStream):
6063
def __init__(

livekit-agents/livekit/agents/tts/tts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ def stream(
9898
"streaming is not supported by this TTS, please use a different TTS or use a StreamAdapter"
9999
)
100100

101+
def prewarm(self) -> None:
102+
"""Pre-warm connection to the TTS service"""
103+
pass
104+
101105
async def aclose(self) -> None: ...
102106

103107
async def __aenter__(self) -> TTS:

livekit-agents/livekit/agents/utils/codecs/decoder.py

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from concurrent.futures import ThreadPoolExecutor
1818
from typing import AsyncIterator, Optional
1919

20+
from livekit.agents.log import logger
2021
from livekit.agents.utils import aio
2122

2223
try:
@@ -39,14 +40,14 @@ def __init__(self):
3940
self._buffer = io.BytesIO()
4041
self._lock = threading.Lock()
4142
self._data_available = threading.Condition(self._lock)
42-
self._eof = False # EOF flag to signal no more writes
43+
self._eof = False
4344

4445
def write(self, data: bytes):
4546
"""Write data to the buffer from a writer thread."""
46-
with self._data_available: # Lock and notify readers
47-
self._buffer.seek(0, io.SEEK_END) # Move to the end
47+
with self._data_available:
48+
self._buffer.seek(0, io.SEEK_END)
4849
self._buffer.write(data)
49-
self._data_available.notify_all() # Notify waiting readers
50+
self._data_available.notify_all()
5051

5152
def read(self, size: int = -1) -> bytes:
5253
"""Read data from the buffer in a reader thread."""
@@ -56,21 +57,21 @@ def read(self, size: int = -1) -> bytes:
5657

5758
with self._data_available:
5859
while True:
59-
self._buffer.seek(0) # Rewind for reading
60+
if self._buffer.closed:
61+
return b""
62+
# always read from beginning
63+
self._buffer.seek(0)
6064
data = self._buffer.read(size)
6165

62-
# If data is available, return it
6366
if data:
64-
# Shrink the buffer to remove already-read data
67+
# shrink the buffer to remove already-read data
6568
remaining = self._buffer.read()
6669
self._buffer = io.BytesIO(remaining)
6770
return data
6871

69-
# If EOF is signaled and no data remains, return EOF
7072
if self._eof:
7173
return b""
7274

73-
# Wait for more data
7475
self._data_available.wait()
7576

7677
def end_input(self):
@@ -129,15 +130,15 @@ def end_input(self):
129130
self._input_buf.end_input()
130131

131132
def _decode_loop(self):
132-
container = av.open(self._input_buf)
133-
audio_stream = next(s for s in container.streams if s.type == "audio")
134-
resampler = av.AudioResampler(
135-
# convert to signed 16-bit little endian
136-
format="s16",
137-
layout=self._layout,
138-
rate=self._sample_rate,
139-
)
140133
try:
134+
container = av.open(self._input_buf)
135+
audio_stream = next(s for s in container.streams if s.type == "audio")
136+
resampler = av.AudioResampler(
137+
# convert to signed 16-bit little endian
138+
format="s16",
139+
layout=self._layout,
140+
rate=self._sample_rate,
141+
)
141142
# TODO: handle error where audio stream isn't found
142143
if not audio_stream:
143144
return
@@ -157,6 +158,8 @@ def _decode_loop(self):
157158
),
158159
)
159160
)
161+
except Exception:
162+
logger.exception("error decoding audio")
160163
finally:
161164
self._output_ch.close()
162165

@@ -175,8 +178,9 @@ async def aclose(self):
175178
self._closed = True
176179
self.end_input()
177180
self._input_buf.close()
178-
# wait for decode loop to finish
181+
# wait for decode loop to finish, only if anything's been pushed
179182
try:
180-
await self._output_ch.recv()
183+
if self._started:
184+
await self._output_ch.recv()
181185
except aio.ChanClosed:
182186
pass

0 commit comments

Comments
 (0)