add back on_playback_started

chenghao-mou · chenghao-mou · commit 1e825a1e7e62 · 2026-01-08T15:00:25.000Z
diff --git a/livekit-agents/livekit/agents/voice/generation.py b/livekit-agents/livekit/agents/voice/generation.py
@@ -360,6 +360,12 @@ async def _audio_forwarding_task(
 
     try:
         audio_output.resume()
+
+        @audio_output.on("playback_started")
+        def _on_playback_started(ev: io.PlaybackStartedEvent) -> None:
+            if not out.first_frame_fut.done():
+                out.first_frame_fut.set_result(ev.created_at)
+
         async for frame in tts_output:
             out.audio.append(frame)
 
@@ -381,9 +387,6 @@ async def _audio_forwarding_task(
             else:
                 await audio_output.capture_frame(frame)
 
-            if not out.first_frame_fut.done():
-                out.first_frame_fut.set_result(time.time())
-
         if resampler:
             for frame in resampler.flush():
                 await audio_output.capture_frame(frame)
diff --git a/livekit-agents/livekit/agents/voice/io.py b/livekit-agents/livekit/agents/voice/io.py
@@ -126,12 +126,18 @@ class PlaybackFinishedEvent:
     When None, the transcript is not synchronized with the playback"""
 
 
+@dataclass
+class PlaybackStartedEvent:
+    created_at: float
+    """The timestamp (time.time()) when the playback started"""
+
+
 @dataclass
 class AudioOutputCapabilities:
     pause: bool
 
 
-class AudioOutput(ABC, rtc.EventEmitter[Literal["playback_finished"]]):
+class AudioOutput(ABC, rtc.EventEmitter[Literal["playback_finished", "playback_started"]]):
     def __init__(
         self,
         *,
@@ -167,6 +173,9 @@ def __init__(
                     synchronized_transcript=ev.synchronized_transcript,
                 ),
             )
+            self.next_in_chain.on(
+                "playback_started", lambda ev: self.on_playback_started(created_at=ev.created_at)
+            )
 
     @property
     def label(self) -> str:
@@ -176,6 +185,9 @@ def label(self) -> str:
     def next_in_chain(self) -> AudioOutput | None:
         return self.__next_in_chain
 
+    def on_playback_started(self, *, created_at: float) -> None:
+        self.emit("playback_started", PlaybackStartedEvent(created_at=created_at))
+
     def on_playback_finished(
         self,
         *,
diff --git a/livekit-agents/livekit/agents/voice/room_io/_output.py b/livekit-agents/livekit/agents/voice/room_io/_output.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import time
 
 from livekit import rtc
 
@@ -17,10 +18,6 @@
 from ..transcription import find_micro_track_id
 
 
-class _InterruptedError(Exception):
-    pass
-
-
 class _ParticipantAudioOutput(io.AudioOutput):
     def __init__(
         self,
@@ -60,8 +57,7 @@ def __init__(
 
         self._playback_enabled = asyncio.Event()
         self._playback_enabled.set()
-
-        self._first_frame_fut: asyncio.Future[None] | None = None
+        self._first_frame_event = asyncio.Event()
 
     async def _publish_track(self) -> None:
         async with self._lock:
@@ -103,21 +99,9 @@ async def capture_frame(self, frame: rtc.AudioFrame) -> None:
             await self._flush_task
 
         for f in self._audio_bstream.push(frame.data):
-            if self._pushed_duration == 0:
-                self._first_frame_fut = asyncio.Future[None]()
-
             await self._audio_buf.send(f)
             self._pushed_duration += f.duration
 
-            # wait for the first frame to be captured
-            if self._first_frame_fut and not self._first_frame_fut.done():
-                try:
-                    await self._first_frame_fut
-                except _InterruptedError:
-                    continue
-                finally:
-                    self._first_frame_fut = None
-
     def flush(self) -> None:
         super().flush()
 
@@ -150,6 +134,7 @@ def pause(self) -> None:
     def resume(self) -> None:
         super().resume()
         self._playback_enabled.set()
+        self._first_frame_event.clear()
 
     async def _wait_for_playout(self) -> None:
         wait_for_interruption = asyncio.create_task(self._interrupted_event.wait())
@@ -185,9 +170,7 @@ async def _wait_buffered_audio() -> None:
 
         self._pushed_duration = 0
         self._interrupted_event.clear()
-        if self._first_frame_fut and not self._first_frame_fut.done():
-            self._first_frame_fut.set_exception(_InterruptedError())
-        self._first_frame_fut = None
+        self._first_frame_event.clear()
         self.on_playback_finished(playback_position=pushed_duration, interrupted=interrupted)
 
     async def _forward_audio(self) -> None:
@@ -198,18 +181,16 @@ async def _forward_audio(self) -> None:
                 # TODO(long): save the frames in the queue and play them later
                 # TODO(long): ignore frames from previous syllable
 
-            if self._interrupted_event.is_set() or (
-                self._pushed_duration == 0 and not self._first_frame_fut
-            ):
+            if self._interrupted_event.is_set() or self._pushed_duration == 0:
                 if self._interrupted_event.is_set() and self._flush_task:
                     await self._flush_task
 
                 # ignore frames if interrupted
                 continue
 
-            if self._first_frame_fut and not self._first_frame_fut.done():
-                self._first_frame_fut.set_result(None)
-                self._first_frame_fut = None
+            if not self._first_frame_event.is_set():
+                self._first_frame_event.set()
+                self.on_playback_started(created_at=time.time())
             await self._audio_source.capture_frame(frame)
 
     def _on_reconnected(self) -> None: