Add reset for AV sync

longcw · longcw · commit 51b51e001114 · 2024-12-30T19:08:00.000+08:00
diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
@@ -75,7 +75,7 @@ async def stream_video(self) -> AsyncIterable[tuple[rtc.VideoFrame, float]]:
 
     async def stream_audio(self) -> AsyncIterable[tuple[rtc.AudioFrame, float]]:
         """Streams audio frames from the media file in an endless loop."""
-        for i, av_frame in enumerate(self._audio_container.decode(audio=0)):
+        for av_frame in self._audio_container.decode(audio=0):
             # Convert audio frame to raw int16 samples
             frame = av_frame.to_ndarray().T  # Transpose to (samples, channels)
             frame = (frame * 32768).astype(np.int16)
@@ -129,7 +129,7 @@ async def main(room: rtc.Room, room_name: str, media_path: str):
     media_info = streamer.info
 
     # Create video and audio sources/tracks
-    queue_size_ms = 1000  # TODO: testing with different sizes
+    queue_size_ms = 1000
     video_source = rtc.VideoSource(
         width=media_info.video_width,
         height=media_info.video_height,
@@ -172,19 +172,45 @@ async def _push_frames(
             await av_sync.push(frame, timestamp)
             await asyncio.sleep(0)
 
+    async def _log_fps(av_sync: rtc.AVSynchronizer):
+        while True:
+            await asyncio.sleep(2)
+            diff = av_sync.last_video_time - av_sync.last_audio_time
+
+            logger.info(
+                f"fps: {av_sync.actual_fps:.2f}, video_time: {av_sync.last_video_time:.3f}s, "
+                f"audio_time: {av_sync.last_audio_time:.3f}s, diff: {diff:.3f}s"
+            )
+
     try:
         while True:
             streamer.reset()
-            video_task = asyncio.create_task(
-                _push_frames(streamer.stream_video(), av_sync)
-            )
-            audio_task = asyncio.create_task(
-                _push_frames(streamer.stream_audio(), av_sync)
+
+            video_stream = streamer.stream_video()
+            audio_stream = streamer.stream_audio()
+
+            # read the first frame of each stream and push them together, before the tasks start
+            first_video_frame, video_timestamp = await video_stream.__anext__()
+            first_audio_frame, audio_timestamp = await audio_stream.__anext__()
+            logger.info(
+                f"first video duration: {1/media_info.video_fps:.3f}s, "
+                f"first audio duration: {first_audio_frame.duration:.3f}s"
             )
+            await av_sync.push(first_video_frame, video_timestamp)
+            await av_sync.push(first_audio_frame, audio_timestamp)
+
+            video_task = asyncio.create_task(_push_frames(video_stream, av_sync))
+            audio_task = asyncio.create_task(_push_frames(audio_stream, av_sync))
+
+            log_fps_task = asyncio.create_task(_log_fps(av_sync))
 
             # wait for both tasks to complete
             await asyncio.gather(video_task, audio_task)
             await av_sync.wait_for_playout()
+
+            # clean up
+            av_sync.reset()
+            log_fps_task.cancel()
             logger.info("playout finished")
     finally:
         await streamer.aclose()
diff --git a/livekit-rtc/livekit/rtc/synchronizer.py b/livekit-rtc/livekit/rtc/synchronizer.py
@@ -45,6 +45,7 @@ def __init__(
         self._max_delay_tolerance_ms = _max_delay_tolerance_ms
 
         self._stopped = False
+        # the time of the last video/audio frame captured
         self._last_video_time: float = 0
         self._last_audio_time: float = 0
 
@@ -55,7 +56,7 @@ def __init__(
             # ensure queue is bounded if queue size is specified
             self._video_queue_max_size = max(1, self._video_queue_max_size)
 
-        self._video_queue = asyncio.Queue[tuple[VideoFrame, float]](
+        self._video_queue = asyncio.Queue[tuple[VideoFrame, Optional[float]]](
             maxsize=self._video_queue_max_size
         )
         self._fps_controller = _FPSController(
@@ -67,6 +68,13 @@ def __init__(
     async def push(
         self, frame: Union[VideoFrame, AudioFrame], timestamp: Optional[float] = None
     ) -> None:
+        """Push a frame to the synchronizer
+
+        Args:
+            frame: The video or audio frame to push.
+            timestamp: Optional timestamp of the frame; currently used only for logging.
+                For an AudioFrame, this should be the end time of the frame.
+        """
         if isinstance(frame, AudioFrame):
             await self._audio_source.capture_frame(frame)
             if timestamp is not None:
@@ -79,53 +87,25 @@ async def clear_queue(self) -> None:
         self._audio_source.clear_queue()
         while not self._video_queue.empty():
             await self._video_queue.get()
+            self._video_queue.task_done()
 
     async def wait_for_playout(self) -> None:
         """Wait until all video and audio frames are played out."""
-        await self._audio_source.wait_for_playout()
-        await self._video_queue.join()
+        await asyncio.gather(
+            self._audio_source.wait_for_playout(),
+            self._video_queue.join(),
+        )
+
+    def reset(self) -> None:
+        self._fps_controller.reset()
 
     async def _capture_video(self) -> None:
-        count = 0
         while not self._stopped:
             frame, timestamp = await self._video_queue.get()
-
             async with self._fps_controller:
-                # debug
-                frame_rgba = np.frombuffer(frame.data, dtype=np.uint8).reshape(
-                    frame.height, frame.width, 4
-                )
-                frame_bgr = cv2.cvtColor(frame_rgba[:, :, :3], cv2.COLOR_RGBA2BGR)
-                frame_bgr = cv2.putText(
-                    frame_bgr,
-                    f"{self.actual_fps:.2f}fps, video time: {timestamp:.3f}s, "
-                    f"audio time: {self.last_audio_time:.3f}s, diff: {timestamp - self.last_audio_time:.3f}s",
-                    (10, 100),
-                    cv2.FONT_HERSHEY_SIMPLEX,
-                    1,
-                    (0, 0, 255),
-                    2,
-                )
-                frame_rgba = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA)
-                frame = VideoFrame(
-                    width=frame.width,
-                    height=frame.height,
-                    type=frame.type,
-                    data=frame_rgba.tobytes(),
-                )
-                count += 1
-                # end debug
-
                 self._video_source.capture_frame(frame)
                 if timestamp is not None:
                     self._last_video_time = timestamp
-
-                if count % 30 == 0:
-                    diff = self.last_video_time - self.last_audio_time
-                    print(
-                        f"{self.actual_fps:.2f}fps, last video time: {self.last_video_time:.3f}s, "
-                        f"last audio time: {self.last_audio_time:.3f}s, diff: {diff:.3f}s"
-                    )
             self._video_queue.task_done()
 
     async def aclose(self) -> None:
@@ -139,10 +119,12 @@ def actual_fps(self) -> float:
 
     @property
     def last_video_time(self) -> float:
+        """The time of the last video frame captured"""
         return self._last_video_time
 
     @property
     def last_audio_time(self) -> float:
+        """The time of the last audio frame played out"""
         return self._last_audio_time - self._audio_source.queued_duration
 
 
@@ -175,6 +157,10 @@ async def __aenter__(self) -> None:
     async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
         self.after_process()
 
+    def reset(self) -> None:
+        self._next_frame_time = None
+        self._send_timestamps.clear()
+
     async def wait_next_process(self) -> None:
         """Wait until it's time for the next frame.