Add support for audio frame processor and update livekit-rtc (#4145)

lukasIO · ladvoc · web-flow · commit 0bfa6906a514 · 2025-12-19T11:43:38.000+01:00
Co-authored-by: Jacob Gelman <3182119+ladvoc@users.noreply.github.com>
diff --git a/livekit-agents/livekit/agents/voice/room_io/_input.py b/livekit-agents/livekit/agents/voice/room_io/_input.py
@@ -30,6 +30,7 @@ def __init__(
         room: rtc.Room,
         *,
         track_source: rtc.TrackSource.ValueType | list[rtc.TrackSource.ValueType],
+        processor: rtc.FrameProcessor[T] | None = None,
     ) -> None:
         self._room = room
         self._accepted_sources = (
@@ -49,6 +50,9 @@ def __init__(
 
         self._room.on("track_subscribed", self._on_track_available)
         self._room.on("track_unpublished", self._on_track_unavailable)
+        self._room.on("token_refreshed", self._on_token_refreshed)
+
+        self._processor = processor
 
     async def __anext__(self) -> T:
         return await self._data_ch.__anext__()
@@ -122,6 +126,7 @@ async def aclose(self) -> None:
             await aio.cancel_and_wait(self._forward_atask)
 
         self._room.off("track_subscribed", self._on_track_available)
+        self._room.off("token_refreshed", self._on_token_refreshed)
         self._data_ch.close()
 
     @log_exceptions(logger=logger)
@@ -160,6 +165,8 @@ def _close_stream(self) -> None:
             self._tasks.add(task)
             self._stream = None
             self._publication = None
+        if self._processor:
+            self._processor._close()
 
     def _on_track_available(
         self,
@@ -177,6 +184,16 @@ def _on_track_available(
         self._close_stream()
         self._stream = self._create_stream(track, participant)
         self._publication = publication
+        if self._processor:
+            self._processor._on_stream_info_updated(
+                room_name=self._room.name,
+                participant_identity=participant.identity,
+                publication_sid=publication.sid,
+            )
+            if self._room._token is not None and self._room._server_url is not None:
+                self._processor._on_credentials_updated(
+                    token=self._room._token, url=self._room._server_url
+                )
         self._forward_atask = asyncio.create_task(
             self._forward_task(self._forward_atask, self._stream, publication, participant)
         )
@@ -201,6 +218,16 @@ def _on_track_unavailable(
             if self._on_track_available(publication.track, publication, participant):
                 return
 
+    def _on_token_refreshed(self) -> None:  # handler registered on the room's "token_refreshed" event
+        if (
+            self._processor is not None  # nothing to forward when no processor is attached
+            and self._room._token is not None  # both credentials must be present before forwarding
+            and self._room._server_url is not None
+        ):
+            self._processor._on_credentials_updated(  # pass the room's current token and URL to the processor
+                token=self._room._token, url=self._room._server_url
+            )
+
 
 class _ParticipantAudioInputStream(_ParticipantInputStream[rtc.AudioFrame], AudioInput):
     def __init__(
@@ -209,12 +236,22 @@ def __init__(
         *,
         sample_rate: int,
         num_channels: int,
-        noise_cancellation: rtc.NoiseCancellationOptions | NoiseCancellationSelector | None,
+        noise_cancellation: rtc.NoiseCancellationOptions
+        | NoiseCancellationSelector
+        | rtc.FrameProcessor[rtc.AudioFrame]
+        | None,
         pre_connect_audio_handler: PreConnectAudioHandler | None,
         frame_size_ms: int = 50,
     ) -> None:
+        audio_processor: rtc.FrameProcessor[rtc.AudioFrame] | None = None
+        if isinstance(noise_cancellation, rtc.FrameProcessor):
+            audio_processor = noise_cancellation
+
         _ParticipantInputStream.__init__(
-            self, room=room, track_source=rtc.TrackSource.SOURCE_MICROPHONE
+            self,
+            room=room,
+            track_source=rtc.TrackSource.SOURCE_MICROPHONE,
+            processor=audio_processor,
         )
         AudioInput.__init__(self, label="RoomIO")
         if frame_size_ms <= 0:
@@ -265,7 +302,7 @@ async def _forward_task(
             try:
                 duration: float = 0
                 frames = await self._pre_connect_audio_handler.wait_for_data(publication.track.sid)
-                for frame in self._resample_frames(frames):
+                for frame in self._resample_frames(self._apply_audio_processor(frames)):
                     if self._attached:
                         await self._data_ch.send(frame)
                         duration += frame.duration
@@ -319,6 +356,20 @@ def _resample_frames(self, frames: Iterable[rtc.AudioFrame]) -> Iterable[rtc.Aud
         if resampler:
             yield from resampler.flush()
 
+    def _apply_audio_processor(self, frames: Iterable[rtc.AudioFrame]) -> Iterable[rtc.AudioFrame]:
+        for frame in frames:  # generator: frames are handled lazily, one at a time
+            if self._processor is not None:
+                try:
+                    yield self._processor._process(frame)  # emit the processor's output for this frame
+                except Exception as e:  # processing failed: log it, then emit the frame unprocessed
+                    logger.warning(
+                        "error pre-processing audio frame",
+                        exc_info=e,
+                    )
+                    yield frame  # fallback: the original, unprocessed frame
+            else:
+                yield frame  # no processor configured: pass the frame through unchanged
+
 
 class _ParticipantVideoInputStream(_ParticipantInputStream[rtc.VideoFrame], VideoInput):
     def __init__(self, room: rtc.Room) -> None:
diff --git a/livekit-agents/livekit/agents/voice/room_io/types.py b/livekit-agents/livekit/agents/voice/room_io/types.py
@@ -2,7 +2,7 @@
 
 from collections.abc import Coroutine
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional, TypeAlias
 
 from livekit import rtc
 
@@ -44,8 +44,9 @@ class NoiseCancellationParams:
     track: rtc.Track
 
 
-NoiseCancellationSelector = Callable[
-    [NoiseCancellationParams], Optional[rtc.NoiseCancellationOptions]
+NoiseCancellationSelector: TypeAlias = Callable[
+    [NoiseCancellationParams],
+    rtc.NoiseCancellationOptions | rtc.FrameProcessor[rtc.AudioFrame] | None,
 ]
 
 
@@ -65,7 +66,12 @@ class AudioInputOptions:
     num_channels: int = 1
     frame_size_ms: int = 50
     """The frame size in milliseconds for the audio input."""
-    noise_cancellation: rtc.NoiseCancellationOptions | NoiseCancellationSelector | None = None
+    noise_cancellation: (
+        rtc.NoiseCancellationOptions
+        | NoiseCancellationSelector
+        | rtc.FrameProcessor[rtc.AudioFrame]
+        | None
+    ) = None
     pre_connect_audio: bool = True
     """Pre-connect audio enabled or not."""
     pre_connect_audio_timeout: float = 3.0
@@ -242,7 +248,9 @@ class RoomInputOptions:
     audio_num_channels: int = 1
     audio_frame_size_ms: int = 50
     """The frame size in milliseconds for the audio input."""
-    noise_cancellation: rtc.NoiseCancellationOptions | None = None
+    noise_cancellation: rtc.NoiseCancellationOptions | rtc.FrameProcessor[rtc.AudioFrame] | None = (
+        None
+    )
     text_input_cb: TextInputCallback = _default_text_input_cb
     participant_kinds: NotGivenOr[list[rtc.ParticipantKind.ValueType]] = NOT_GIVEN
     """Participant kinds accepted for auto subscription. If not provided,
diff --git a/livekit-agents/pyproject.toml b/livekit-agents/pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
     "typer>=0.15.1",
     "click~=8.1",
     "certifi>=2025.6.15",
-    "livekit>=1.0.19,<2",
+    "livekit>=1.0.23,<2",
     "livekit-api>=1.0.7,<2",
     "livekit-protocol>=1.1,<2",
     "livekit-blingfire~=1.0",
diff --git a/uv.lock b/uv.lock