livekit-examples
diff --git a/‎.cursor/livekit-egress.mdc‎
Lines changed: 1 addition & 0 deletions b/‎.cursor/livekit-egress.mdc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 40 additions & 0 deletions b/‎README.md‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/agent.py‎
Lines changed: 67 additions & 3 deletions b/‎src/agent.py‎
Lines changed: 67 additions & 3 deletions
diff --git a/‎src/egress_manager.py‎
Lines changed: 184 additions & 0 deletions b/‎src/egress_manager.py‎
Lines changed: 184 additions & 0 deletions
@@ -1,4 +1,5 @@
 ---
+description: recording audio and transcripts on livekit using Egress APIs
 alwaysApply: false
 ---
 # LiveKit Egress - Python Audio Recording Guide
 
@@ -11,6 +11,8 @@ The starter project includes:
 - A simple voice AI assistant, ready for extension and customization
 - A voice AI pipeline with [models](https://docs.livekit.io/agents/models) from OpenAI, Cartesia, and AssemblyAI served through LiveKit Cloud
   - Easily integrate your preferred [LLM](https://docs.livekit.io/agents/models/llm/), [STT](https://docs.livekit.io/agents/models/stt/), and [TTS](https://docs.livekit.io/agents/models/tts/) instead, or swap to a realtime model like the [OpenAI Realtime API](https://docs.livekit.io/agents/models/realtime/openai)
+- **Dual-channel audio recording** to S3 via [LiveKit Egress](https://docs.livekit.io/home/egress/overview/) (agent on one channel, user on the other)
+- **Real-time transcript capture** saved to S3 as JSON
 - Eval suite based on the LiveKit Agents [testing & evaluation framework](https://docs.livekit.io/agents/build/testing/)
 - [LiveKit Turn Detector](https://docs.livekit.io/agents/build/turns/turn-detector/) for contextually-aware speaker detection, with multilingual support
 - [Background voice cancellation](https://docs.livekit.io/home/cloud/noise-cancellation/)
@@ -19,6 +21,41 @@ The starter project includes:
 
 This starter app is compatible with any [custom web/mobile frontend](https://docs.livekit.io/agents/start/frontend/) or [SIP-based telephony](https://docs.livekit.io/agents/start/telephony/).
 
+## Recording & Transcription
+
+This project includes built-in support for:
+
+- **Dual-channel audio recording** via LiveKit Egress (agent on one channel, user on the other)
+- **Real-time transcript capture** from STT output, saved as JSON
+
+### S3 Output Structure
+
+Recordings and transcripts are saved to S3:
+
+```
+s3://audivi-audio-recordings/livekit-demos/
+  ├── audio/{room_name}-{time}.ogg           # Dual-channel OGG audio
+  ├── audio/{room_name}-{time}.ogg.json      # Egress manifest
+  └── transcripts/{room_name}-{timestamp}.json  # Conversation transcript
+```
+
+### AWS Configuration
+
+Add these environment variables to your `.env.local`:
+
+```bash
+AWS_ACCESS_KEY_ID=your_access_key
+AWS_SECRET_ACCESS_KEY=your_secret_key
+AWS_REGION=us-east-1
+```
+
+To change the S3 bucket or prefix, modify the constants in `src/agent.py`:
+
+```python
+S3_BUCKET = "audivi-audio-recordings"
+S3_PREFIX = "livekit-demos"
+```
+
 ## Coding agents and MCP
 
 This project is designed to work with coding agents like [Cursor](https://www.cursor.com/) and [Claude Code](https://www.anthropic.com/claude-code). 
@@ -61,6 +98,9 @@ Sign up for [LiveKit Cloud](https://cloud.livekit.io/) then set up the environme
 - `LIVEKIT_URL`
 - `LIVEKIT_API_KEY`
 - `LIVEKIT_API_SECRET`
+- `AWS_ACCESS_KEY_ID` (for recording/transcripts)
+- `AWS_SECRET_ACCESS_KEY` (for recording/transcripts)
+- `AWS_REGION` (for recording/transcripts, defaults to `us-east-1`)
 
 You can load the LiveKit environment automatically using the [LiveKit CLI](https://docs.livekit.io/home/cli/cli-setup):
 
 
@@ -11,6 +11,8 @@ requires-python = ">=3.9"
 dependencies = [
     "livekit-agents[silero,turn-detector]~=1.3",
     "livekit-plugins-noise-cancellation~=0.2",
+    "boto3~=1.35",
+    "loguru~=0.7",
     "python-dotenv",
 ]
 
 
@@ -1,11 +1,10 @@
-import logging
-
 from dotenv import load_dotenv
 from livekit import rtc
 from livekit.agents import (
     Agent,
     AgentServer,
     AgentSession,
+    ConversationItemAddedEvent,
     JobContext,
     JobProcess,
     cli,
@@ -14,11 +13,17 @@
 )
 from livekit.plugins import noise_cancellation, silero
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
+from loguru import logger
 
-logger = logging.getLogger("agent")
+from egress_manager import EgressConfig, EgressManager
+from transcript_handler import S3Uploader, TranscriptHandler
 
 load_dotenv(".env.local")
 
+# S3 bucket configuration for recordings and transcripts
+S3_BUCKET = "audivi-audio-recordings"
+S3_PREFIX = "livekit-demos"
+
 
 class Assistant(Agent):
     def __init__(self) -> None:
@@ -65,6 +70,25 @@ async def my_agent(ctx: JobContext):
         "room": ctx.room.name,
     }
 
+    room_name = ctx.room.name
+
+    # Initialize egress manager for dual-channel audio recording
+    egress_config = EgressConfig(
+        s3_bucket=S3_BUCKET,
+        s3_prefix=S3_PREFIX,
+    )
+    egress_manager = EgressManager(egress_config)
+
+    # Initialize transcript handler for saving STT output
+    s3_uploader = S3Uploader(
+        bucket=S3_BUCKET,
+        prefix=S3_PREFIX,
+    )
+    transcript_handler = TranscriptHandler(
+        room_name=room_name,
+        s3_uploader=s3_uploader,
+    )
+
     # Set up a voice AI pipeline using OpenAI, Cartesia, AssemblyAI, and the LiveKit turn detector
     session = AgentSession(
         # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
@@ -87,6 +111,36 @@ async def my_agent(ctx: JobContext):
         preemptive_generation=True,
     )
 
+    # Subscribe to conversation events to capture transcripts
+    @session.on("conversation_item_added")
+    def on_conversation_item_added(event: ConversationItemAddedEvent):
+        """Capture user and agent transcripts from conversation events."""
+        item = event.item
+        text = item.text_content
+        if not text:
+            return
+
+        if item.role == "user":
+            transcript_handler.add_user_transcript(text, is_final=True)
+        elif item.role == "assistant":
+            transcript_handler.add_agent_transcript(text, is_final=True)
+
+    # Handle session close to finalize and upload transcript
+    @session.on("close")
+    async def on_session_close(_event):
+        """Finalize transcript and clean up egress when session ends."""
+        logger.info(f"Session closing for room {room_name}, saving transcript...")
+
+        # Upload transcript to S3
+        success = await transcript_handler.finalize_and_upload()
+        if success:
+            logger.info(f"Transcript saved for room {room_name}")
+        else:
+            logger.error(f"Failed to save transcript for room {room_name}")
+
+        # Clean up egress manager resources
+        await egress_manager.close()
+
     # To use a realtime model instead of a voice pipeline, use the following session setup instead.
     # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
     # 1. Install livekit-agents[openai]
@@ -105,6 +159,16 @@ async def my_agent(ctx: JobContext):
     # # Start the avatar and wait for it to join
     # await avatar.start(session, room=ctx.room)
 
+    # Start dual-channel audio recording via egress
+    egress_id = await egress_manager.start_dual_channel_recording(room_name)
+    if egress_id:
+        logger.info(f"Started dual-channel recording for room {room_name}")
+    else:
+        logger.warning(
+            f"Failed to start egress recording for room {room_name}, "
+            "continuing without recording"
+        )
+
     # Start the session, which initializes the voice pipeline and warms up the models
     await session.start(
         agent=Assistant(),
 
@@ -0,0 +1,184 @@
+"""Egress manager for recording dual-channel audio to S3."""
+
+import os
+from typing import Optional
+
+from livekit import api
+from livekit.protocol import egress as egress_proto
+from loguru import logger
+
+
+class EgressConfig:
+    """Configuration for egress recordings."""
+
+    def __init__(
+        self,
+        s3_bucket: str,
+        s3_prefix: str = "",
+        aws_access_key: str | None = None,
+        aws_secret_key: str | None = None,
+        aws_region: str | None = None,
+        livekit_url: str | None = None,
+        livekit_api_key: str | None = None,
+        livekit_api_secret: str | None = None,
+    ):
+        """Initialize egress configuration.
+
+        Args:
+            s3_bucket: S3 bucket name for recordings
+            s3_prefix: Prefix/path within the bucket
+            aws_access_key: AWS access key (defaults to env var)
+            aws_secret_key: AWS secret key (defaults to env var)
+            aws_region: AWS region (defaults to env var or us-east-1)
+            livekit_url: LiveKit server URL (defaults to env var)
+            livekit_api_key: LiveKit API key (defaults to env var)
+            livekit_api_secret: LiveKit API secret (defaults to env var)
+        """
+        self.s3_bucket = s3_bucket
+        self.s3_prefix = s3_prefix.rstrip("/")
+
+        # AWS credentials
+        self.aws_access_key = aws_access_key or os.environ.get("AWS_ACCESS_KEY_ID", "")
+        self.aws_secret_key = aws_secret_key or os.environ.get(
+            "AWS_SECRET_ACCESS_KEY", ""
+        )
+        self.aws_region = aws_region or os.environ.get("AWS_REGION", "us-east-1")
+
+        # LiveKit credentials
+        self.livekit_url = livekit_url or os.environ.get("LIVEKIT_URL", "")
+        self.livekit_api_key = livekit_api_key or os.environ.get("LIVEKIT_API_KEY", "")
+        self.livekit_api_secret = livekit_api_secret or os.environ.get(
+            "LIVEKIT_API_SECRET", ""
+        )
+
+
+class EgressManager:
+    """Manages LiveKit egress for dual-channel audio recording to S3."""
+
+    def __init__(self, config: EgressConfig):
+        """Initialize the egress manager.
+
+        Args:
+            config: Egress configuration
+        """
+        self.config = config
+        self._api: api.LiveKitAPI | None = None
+        self._egress_id: str | None = None
+
+    @property
+    def livekit_api(self) -> api.LiveKitAPI:
+        """Lazily initialize LiveKit API client."""
+        if self._api is None:
+            self._api = api.LiveKitAPI(
+                url=self.config.livekit_url,
+                api_key=self.config.livekit_api_key,
+                api_secret=self.config.livekit_api_secret,
+            )
+        return self._api
+
+    @property
+    def egress_id(self) -> str | None:
+        """Get the current egress ID if recording is active."""
+        return self._egress_id
+
+    def _create_s3_upload(self) -> egress_proto.S3Upload:
+        """Create S3 upload configuration."""
+        return egress_proto.S3Upload(
+            access_key=self.config.aws_access_key,
+            secret=self.config.aws_secret_key,
+            bucket=self.config.s3_bucket,
+            region=self.config.aws_region,
+        )
+
+    async def start_dual_channel_recording(self, room_name: str) -> str | None:
+        """Start dual-channel audio recording for a room.
+
+        The agent's audio will be on one channel, and all other participants
+        (users) will be on the other channel.
+
+        Args:
+            room_name: Name of the LiveKit room to record
+
+        Returns:
+            Egress ID if started successfully, None on failure
+        """
+        if self._egress_id:
+            logger.warning(
+                f"Egress already active with ID {self._egress_id}, skipping start"
+            )
+            return self._egress_id
+
+        try:
+            s3_upload = self._create_s3_upload()
+
+            # Build the filepath with prefix
+            filepath_prefix = (
+                f"{self.config.s3_prefix}/audio" if self.config.s3_prefix else "audio"
+            )
+            filepath = f"{filepath_prefix}/{{room_name}}-{{time}}.ogg"
+
+            file_output = egress_proto.EncodedFileOutput(
+                filepath=filepath,
+                s3=s3_upload,
+            )
+
+            # Start room composite egress with dual-channel audio
+            # DUAL_CHANNEL_AGENT puts agent audio on one channel, all other participants on the other
+            info = await self.livekit_api.egress.start_room_composite_egress(
+                egress_proto.RoomCompositeEgressRequest(
+                    room_name=room_name,
+                    audio_only=True,
+                    audio_mixing=egress_proto.AudioMixing.DUAL_CHANNEL_AGENT,
+                    file_outputs=[file_output],
+                )
+            )
+
+            self._egress_id = info.egress_id
+            logger.info(
+                f"Started dual-channel egress recording for room {room_name}, "
+                f"egress_id={self._egress_id}"
+            )
+            return self._egress_id
+
+        except Exception as e:
+            logger.error(f"Failed to start egress recording: {e}")
+            return None
+
+    async def stop_recording(self) -> bool:
+        """Stop the active egress recording.
+
+        Returns:
+            True if stopped successfully or no active recording, False on error
+        """
+        if not self._egress_id:
+            logger.debug("No active egress to stop")
+            return True
+
+        try:
+            await self.livekit_api.egress.stop_egress(
+                egress_proto.StopEgressRequest(egress_id=self._egress_id)
+            )
+            logger.info(f"Stopped egress recording, egress_id={self._egress_id}")
+            self._egress_id = None
+            return True
+        except Exception as e:
+            logger.error(f"Failed to stop egress recording: {e}")
+            return False
+
+    async def close(self) -> None:
+        """Clean up resources."""
+        if self._api:
+            await self._api.aclose()
+            self._api = None
+
+
+def create_default_egress_manager() -> EgressManager:
+    """Create an egress manager with default configuration for the target S3 bucket.
+
+    Returns:
+        Configured EgressManager instance
+    """
+    config = EgressConfig(
+        s3_bucket="audivi-audio-recordings",
+        s3_prefix="livekit-demos",
+    )
+    return EgressManager(config)
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`---`
	`2`	`+description: recording audio and transcripts on livekit using Egress APIs`
`2`	`3`	`alwaysApply: false`
`3`	`4`	`---`
`4`	`5`	`# LiveKit Egress - Python Audio Recording Guide`
Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,8 @@ requires-python = ">=3.9"`
`11`	`11`	`dependencies = [`
`12`	`12`	`"livekit-agents[silero,turn-detector]~=1.3",`
`13`	`13`	`"livekit-plugins-noise-cancellation~=0.2",`
	`14`	`+ "boto3~=1.35",`
	`15`	`+ "loguru~=0.7",`
`14`	`16`	`"python-dotenv",`
`15`	`17`	`]`
`16`	`18`