livekit · chenghao-mou · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025
diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -1207,7 +1207,10 @@ def _interrupt_by_audio_activity(self) -> None:
     # region recognition hooks
 
     def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
-        self._session._update_user_state("speaking")
+        speech_start_time = time.time()
+        if ev:
+            speech_start_time = speech_start_time - ev.speech_duration
+        self._session._update_user_state("speaking", last_speaking_time=speech_start_time)
         self._user_silence_event.clear()
 
         if self._false_interruption_timer:
@@ -1649,10 +1652,19 @@ async def _read_text() -> AsyncIterable[str]:
         started_speaking_at: float | None = None
         stopped_speaking_at: float | None = None
 
-        def _on_first_frame(_: asyncio.Future[None]) -> None:
+        def _on_first_frame(fut: asyncio.Future[float] | asyncio.Future[None]) -> None:
+            """
+            Callback that moves the agent to "speaking"; ``fut`` is one of these futures:
+            1. _AudioOutput.first_frame_fut (float timestamp, used as the start time)
+            2. _TextOutput.first_text_fut (None, so time.time() is used instead)
+            """
             nonlocal started_speaking_at
-            started_speaking_at = time.time()
-            self._session._update_agent_state("speaking")
+            try:
+                started_speaking_at = fut.result() or time.time()
+            except BaseException:
+                started_speaking_at = time.time()
+
+            self._session._update_agent_state("speaking", start_time=started_speaking_at)
 
         audio_out: _AudioOutput | None = None
         tts_gen_data: _TTSGenerationData | None = None
@@ -1920,10 +1932,18 @@ async def _read_text(
         started_speaking_at: float | None = None
         stopped_speaking_at: float | None = None
 
-        def _on_first_frame(_: asyncio.Future[None]) -> None:
+        def _on_first_frame(fut: asyncio.Future[float] | asyncio.Future[None]) -> None:
+            """
+            Callback that moves the agent to "speaking"; ``fut`` is one of these futures:
+            1. _AudioOutput.first_frame_fut (float timestamp, used as the start time)
+            2. _TextOutput.first_text_fut (None, so time.time() is used instead)
+            """
             nonlocal started_speaking_at
-            started_speaking_at = time.time()
-            self._session._update_agent_state("speaking")
+            try:
+                started_speaking_at = fut.result() or time.time()
+            except BaseException:
+                started_speaking_at = time.time()
+            self._session._update_agent_state("speaking", start_time=started_speaking_at)
 
         audio_out: _AudioOutput | None = None
         if audio_output is not None:
@@ -2272,10 +2292,18 @@ async def _realtime_generation_task_impl(
         started_speaking_at: float | None = None
         stopped_speaking_at: float | None = None
 
-        def _on_first_frame(_: asyncio.Future[None]) -> None:
+        def _on_first_frame(fut: asyncio.Future[float] | asyncio.Future[None]) -> None:
+            """
+            Callback that moves the agent to "speaking"; ``fut`` is one of these futures:
+            1. _AudioOutput.first_frame_fut (float timestamp, used as the start time)
+            2. _TextOutput.first_text_fut (None, so time.time() is used instead)
+            """
             nonlocal started_speaking_at
-            started_speaking_at = time.time()
-            self._session._update_agent_state("speaking")
+            try:
+                started_speaking_at = fut.result() or time.time()
+            except BaseException:
+                started_speaking_at = time.time()
+            self._session._update_agent_state("speaking", start_time=started_speaking_at)
 
         tasks: list[asyncio.Task[Any]] = []
         tees: list[utils.aio.itertools.Tee[Any]] = []
@@ -2675,7 +2703,8 @@ def _on_false_interruption() -> None:
                 and not self._paused_speech.done()
             ):
                 self._session._update_agent_state(
-                    "speaking", otel_context=self._paused_speech._agent_turn_context
+                    "speaking",
+                    otel_context=self._paused_speech._agent_turn_context,
                 )
                 audio_output.resume()
                 resumed = True

diff --git a/livekit-agents/livekit/agents/voice/agent_session.py b/livekit-agents/livekit/agents/voice/agent_session.py
@@ -1159,18 +1159,24 @@ def _cancel_user_away_timer(self) -> None:
             self._user_away_timer = None
 
     def _update_agent_state(
-        self, state: AgentState, *, otel_context: otel_context.Context | None = None
+        self,
+        state: AgentState,
+        *,
+        otel_context: otel_context.Context | None = None,
+        start_time: float | None = None,
     ) -> None:
         if self._agent_state == state:
             return
 
+        start_time_ns = int(start_time * 1_000_000_000) if start_time else None
+
         if state == "speaking":
             self._llm_error_counts = 0
             self._tts_error_counts = 0
 
             if self._agent_speaking_span is None:
                 self._agent_speaking_span = tracer.start_span(
-                    "agent_speaking", context=otel_context
+                    "agent_speaking", context=otel_context, start_time=start_time_ns
                 )
 
                 if self._room_io:
@@ -1201,8 +1207,14 @@ def _update_user_state(
         if self._user_state == state:
             return
 
+        last_speaking_time_ns = (
+            int(last_speaking_time * 1_000_000_000) if last_speaking_time else None
+        )
+
         if state == "speaking" and self._user_speaking_span is None:
-            self._user_speaking_span = tracer.start_span("user_speaking")
+            self._user_speaking_span = tracer.start_span(
+                "user_speaking", start_time=last_speaking_time_ns
+            )
 
             if self._room_io and self._room_io.linked_participant:
                 _set_participant_attributes(
@@ -1213,7 +1225,7 @@ def _update_user_state(
         elif self._user_speaking_span is not None:
             # end_time = last_speaking_time or time.time()
             # self._user_speaking_span.set_attribute(trace_types.ATTR_END_TIME, end_time)
-            self._user_speaking_span.end()
+            self._user_speaking_span.end(end_time=last_speaking_time_ns)
             self._user_speaking_span = None
 
         if state == "listening" and self._agent_state == "listening":

diff --git a/livekit-agents/livekit/agents/voice/audio_recognition.py b/livekit-agents/livekit/agents/voice/audio_recognition.py
@@ -466,9 +466,12 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             if self._end_of_turn_task is not None:
                 self._end_of_turn_task.cancel()
 
+    @utils.log_exceptions(logger=logger)
     async def _on_vad_event(self, ev: vad.VADEvent) -> None:
         if ev.type == vad.VADEventType.START_OF_SPEECH:
-            with trace.use_span(self._ensure_user_turn_span()):
+            with trace.use_span(
+                self._ensure_user_turn_span(start_time=time.time() - ev.speech_duration)
+            ):
                 self._hooks.on_start_of_speech(ev)
 
             self._speaking = True
@@ -688,11 +691,13 @@ async def _forward() -> None:
             await aio.cancel_and_wait(forward_task)
             await stream.aclose()
 
-    def _ensure_user_turn_span(self) -> trace.Span:
+    @utils.log_exceptions(logger=logger)
+    def _ensure_user_turn_span(self, start_time: float | None = None) -> trace.Span:
         if self._user_turn_span and self._user_turn_span.is_recording():
             return self._user_turn_span
 
-        self._user_turn_span = tracer.start_span("user_turn")
+        start_time_ns = int(start_time * 1_000_000_000) if start_time else None
+        self._user_turn_span = tracer.start_span("user_turn", start_time=start_time_ns)
 
         if (room_io := self._session._room_io) and room_io.linked_participant:
             _set_participant_attributes(self._user_turn_span, room_io.linked_participant)

diff --git a/livekit-agents/livekit/agents/voice/generation.py b/livekit-agents/livekit/agents/voice/generation.py
@@ -336,7 +336,8 @@ async def _text_forwarding_task(
 @dataclass
 class _AudioOutput:
     audio: list[rtc.AudioFrame]
-    first_frame_fut: asyncio.Future[None]
+    first_frame_fut: asyncio.Future[float]
+    """Future that will be set with the timestamp at which playback of the audio started"""
 
 
 def perform_audio_forwarding(
@@ -356,8 +357,15 @@ async def _audio_forwarding_task(
     out: _AudioOutput,
 ) -> None:
     resampler: rtc.AudioResampler | None = None
+
     try:
         audio_output.resume()
+
+        @audio_output.on("playback_started")
+        def _on_playback_started(ev: io.PlaybackStartedEvent) -> None:
+            if not out.first_frame_fut.done():
+                out.first_frame_fut.set_result(ev.created_at)
+
         async for frame in tts_output:
             out.audio.append(frame)
 
@@ -379,11 +387,6 @@ async def _audio_forwarding_task(
             else:
                 await audio_output.capture_frame(frame)
 
-            # set the first frame future if not already set
-            # (after completing the first frame)
-            if not out.first_frame_fut.done():
-                out.first_frame_fut.set_result(None)
-
         if resampler:
             for frame in resampler.flush():
                 await audio_output.capture_frame(frame)

diff --git a/livekit-agents/livekit/agents/voice/io.py b/livekit-agents/livekit/agents/voice/io.py
@@ -126,12 +126,18 @@ class PlaybackFinishedEvent:
     When None, the transcript is not synchronized with the playback"""
 
 
+@dataclass
+class PlaybackStartedEvent:
+    created_at: float
+    """The timestamp (time.time()) when the playback started"""
+
+
 @dataclass
 class AudioOutputCapabilities:
     pause: bool
 
 
-class AudioOutput(ABC, rtc.EventEmitter[Literal["playback_finished"]]):
+class AudioOutput(ABC, rtc.EventEmitter[Literal["playback_finished", "playback_started"]]):
     def __init__(
         self,
         *,
@@ -167,6 +173,9 @@ def __init__(
                     synchronized_transcript=ev.synchronized_transcript,
                 ),
             )
+            self.next_in_chain.on(
+                "playback_started", lambda ev: self.on_playback_started(created_at=ev.created_at)
+            )
 
     @property
     def label(self) -> str:
@@ -176,6 +185,9 @@ def label(self) -> str:
     def next_in_chain(self) -> AudioOutput | None:
         return self.__next_in_chain
 
+    def on_playback_started(self, *, created_at: float) -> None:
+        self.emit("playback_started", PlaybackStartedEvent(created_at=created_at))
+
     def on_playback_finished(
         self,
         *,