ensure audio only and no connection to user camera

seanreed1111 · seanreed1111 · commit bd8dbcbc1488 · 2025-12-12T12:33:20.000-05:00
diff --git a/README.md b/README.md
@@ -25,20 +25,24 @@ This starter app is compatible with any [custom web/mobile frontend](https://doc
 
 This project includes built-in support for:
 
-- **Dual-channel audio recording** via LiveKit Egress (agent on one channel, user on the other)
+- **Audio recording** via LiveKit Egress (all participants mixed, or dual-channel with agent on one channel and user on the other)
 - **Real-time transcript capture** from STT output, saved as JSON
 
+> **Note:** Audio recording via Egress only works in `dev` or `start` mode (connected to LiveKit Cloud). The `console` mode uses a mock room for local testing and cannot record audio. Transcripts are saved in all modes.
+
 ### S3 Output Structure
 
-Recordings and transcripts are saved to S3:
+Recordings and transcripts are saved to S3 with matching session IDs for easy correlation:
 
 ```
 s3://audivi-audio-recordings/livekit-demos/
-  ├── audio/{room_name}-{time}.ogg           # Dual-channel OGG audio
-  ├── audio/{room_name}-{time}.ogg.json      # Egress manifest
-  └── transcripts/{room_name}-{timestamp}.json  # Conversation transcript
+  ├── audio/{room_name}-{session_id}.ogg           # Audio recording (OGG format)
+  ├── audio/{room_name}-{session_id}.ogg.json      # Egress manifest
+  └── transcripts/{room_name}-{session_id}.json    # Conversation transcript
 ```
 
+The `{session_id}` is a UTC timestamp (`YYYYMMDD-HHMMSS`) generated once when the session starts and shared by the audio recording and the transcript, making it easy to match each recording with its corresponding transcript.
+
 ### AWS Configuration
 
 Add these environment variables to your `.env.local`:
@@ -56,6 +60,21 @@ S3_BUCKET = "audivi-audio-recordings"
 S3_PREFIX = "livekit-demos"
 ```
 
+### Dual-Channel Audio
+
+To enable dual-channel recording (agent audio on left channel, user audio on right channel), edit `src/egress_manager.py` and add the `audio_mixing` parameter:
+
+```python
+info = await self.livekit_api.egress.start_room_composite_egress(
+    egress_proto.RoomCompositeEgressRequest(
+        room_name=room_name,
+        audio_only=True,
+        audio_mixing=egress_proto.AudioMixing.DUAL_CHANNEL_AGENT,  # Add this line
+        file_outputs=[file_output],
+    )
+)
+```
+
 ## Coding agents and MCP
 
 This project is designed to work with coding agents like [Cursor](https://www.cursor.com/) and [Claude Code](https://www.anthropic.com/claude-code). 
@@ -123,12 +142,16 @@ Next, run this command to speak to your agent directly in your terminal:
 uv run python src/agent.py console
 ```
 
+> **Note:** Console mode is for local testing only. It uses a mock room, so audio recording is skipped; transcripts are still saved.
+
 To run the agent for use with a frontend or telephony, use the `dev` command:
 
 ```console
 uv run python src/agent.py dev
 ```
 
+> **Note:** This mode connects to LiveKit Cloud, which enables audio recording via Egress with upload to S3.
+
 In production, use the `start` command:
 
 ```console
diff --git a/src/agent.py b/src/agent.py
@@ -1,11 +1,13 @@
 import asyncio
+from datetime import datetime, timezone
 
 from dotenv import load_dotenv
 from livekit import rtc
 from livekit.agents import (
     Agent,
     AgentServer,
     AgentSession,
+    AutoSubscribe,
     ConversationItemAddedEvent,
     JobContext,
     JobProcess,
@@ -73,9 +75,13 @@ async def my_agent(ctx: JobContext):
     }
 
     room_name = ctx.room.name
-    logger.info(f"=== Agent session handler called for room: {room_name} ===")
+    # Generate a unique session ID for matching audio and transcript files
+    session_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+    logger.info(
+        f"=== Agent session handler called for room: {room_name}, session_id: {session_id} ==="
+    )
 
-    # Initialize egress manager for dual-channel audio recording
+    # Initialize egress manager for audio recording
     egress_manager = None
     try:
         logger.info("Initializing egress manager...")
@@ -99,6 +105,7 @@ async def my_agent(ctx: JobContext):
         transcript_handler = TranscriptHandler(
             room_name=room_name,
             s3_uploader=s3_uploader,
+            session_id=session_id,
         )
         logger.info("Transcript handler initialized successfully")
     except Exception as e:
@@ -148,27 +155,47 @@ def on_conversation_item_added(event: ConversationItemAddedEvent):
 
     # Handle session close to finalize and upload transcript
     @session.on("close")
-    async def on_session_close(_event):
+    def on_session_close(_event):
         """Finalize transcript and clean up egress when session ends."""
         logger.info(f"Session closing for room {room_name}, saving transcript...")
 
-        # Upload transcript to S3
-        if transcript_handler is not None:
-            try:
-                success = await transcript_handler.finalize_and_upload()
-                if success:
-                    logger.info(f"Transcript saved for room {room_name}")
-                else:
-                    logger.error(f"Failed to save transcript for room {room_name}")
-            except Exception as e:
-                logger.error(f"Error saving transcript: {e}")
-
-        # Clean up egress manager resources
-        if egress_manager is not None:
-            try:
-                await egress_manager.close()
-            except Exception as e:
-                logger.error(f"Error closing egress manager: {e}")
+        async def cleanup():
+            # Stop egress recording - this triggers S3 upload of the audio file
+            if egress_manager is not None:
+                try:
+                    logger.info("Stopping egress recording...")
+                    stopped = await egress_manager.stop_recording()
+                    if stopped:
+                        logger.info(
+                            f"Egress recording stopped for room {room_name}, "
+                            f"audio uploaded to s3://{S3_BUCKET}/{S3_PREFIX}/"
+                        )
+                    else:
+                        logger.warning(
+                            f"Failed to stop egress recording for room {room_name}"
+                        )
+                except Exception as e:
+                    logger.error(f"Error stopping egress recording: {e}")
+
+            # Upload transcript to S3
+            if transcript_handler is not None:
+                try:
+                    success = await transcript_handler.finalize_and_upload()
+                    if success:
+                        logger.info(f"Transcript saved for room {room_name}")
+                    else:
+                        logger.error(f"Failed to save transcript for room {room_name}")
+                except Exception as e:
+                    logger.error(f"Error saving transcript: {e}")
+
+            # Clean up egress manager API client
+            if egress_manager is not None:
+                try:
+                    await egress_manager.close()
+                except Exception as e:
+                    logger.error(f"Error closing egress manager: {e}")
+
+        asyncio.create_task(cleanup())  # noqa: RUF006
 
     logger.info("Event handlers registered")
 
@@ -196,6 +223,8 @@ async def on_session_close(_event):
         agent=Assistant(),
         room=ctx.room,
         room_options=room_io.RoomOptions(
+            # Audio only - disable video input
+            video_input=False,
             audio_input=room_io.AudioInputOptions(
                 noise_cancellation=lambda params: noise_cancellation.BVCTelephony()
                 if params.participant.kind == rtc.ParticipantKind.PARTICIPANT_KIND_SIP
@@ -205,29 +234,46 @@ async def on_session_close(_event):
     )
     logger.info("Session started successfully")
 
-    # Join the room and connect to the user
+    # Join the room and connect to the user (audio only, no video)
     logger.info("Connecting to room...")
-    await ctx.connect()
+    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
     logger.info("Connected to room successfully")
 
     # Greet the user
     await session.say("Hello, how can I assist you?")
 
-    # Start dual-channel audio recording via egress (non-blocking, after room is active)
+    # Start audio recording via egress (non-blocking, after room is active)
+    # NOTE: Egress needs a real LiveKit server ('dev' or 'start' mode); it does not work in 'console' mode
     async def start_egress_background():
         """Start egress recording in background so it doesn't block the agent."""
         if egress_manager is None:
             logger.warning("Egress manager not initialized, skipping recording")
             return
+
+        # Check if this is a mock room (console mode)
+        if room_name == "mock_room" or room_name.startswith("FAKE_"):
+            logger.warning(
+                "Skipping egress recording - console mode uses a mock room. "
+                "Run with 'dev' mode to enable audio recording."
+            )
+            return
+
         try:
-            logger.info("Starting egress recording in background...")
-            egress_id = await egress_manager.start_dual_channel_recording(room_name)
+            logger.info(
+                f"Starting egress recording for room {room_name}, session_id={session_id}..."
+            )
+            egress_id = await egress_manager.start_dual_channel_recording(
+                room_name, session_id
+            )
             if egress_id:
-                logger.info(f"Started dual-channel recording for room {room_name}")
+                logger.info(
+                    f"Egress recording started for room {room_name}, "
+                    f"egress_id={egress_id}, session_id={session_id}"
+                )
             else:
                 logger.warning(
                     f"Failed to start egress recording for room {room_name}, "
-                    "continuing without recording"
+                    "continuing without recording. Check AWS credentials and LiveKit egress config."
                 )
         except Exception as e:
             logger.error(f"Error starting egress recording: {e}")
diff --git a/src/egress_manager.py b/src/egress_manager.py
@@ -89,14 +89,14 @@ def _create_s3_upload(self) -> egress_proto.S3Upload:
             region=self.config.aws_region,
         )
 
-    async def start_dual_channel_recording(self, room_name: str) -> str | None:
-        """Start dual-channel audio recording for a room.
-
-        The agent's audio will be on one channel, and all other participants
-        (users) will be on the other channel.
+    async def start_dual_channel_recording(
+        self, room_name: str, session_id: str | None = None
+    ) -> str | None:
+        """Start audio recording for a room.
 
         Args:
             room_name: Name of the LiveKit room to record
+            session_id: Unique session identifier for matching audio/transcript files
 
         Returns:
             Egress ID if started successfully, None on failure
@@ -111,10 +111,14 @@ async def start_dual_channel_recording(self, room_name: str) -> str | None:
             s3_upload = self._create_s3_upload()
 
             # Build the filepath with prefix
+            # Use session_id if provided for matching with transcript, otherwise use LiveKit's {time} placeholder
             filepath_prefix = (
                 f"{self.config.s3_prefix}/audio" if self.config.s3_prefix else "audio"
             )
-            filepath = f"{filepath_prefix}/{{room_name}}-{{time}}.ogg"
+            if session_id:
+                filepath = f"{filepath_prefix}/{room_name}-{session_id}.ogg"
+            else:
+                filepath = f"{filepath_prefix}/{{room_name}}-{{time}}.ogg"
 
             file_output = egress_proto.EncodedFileOutput(
                 filepath=filepath,
diff --git a/src/transcript_handler.py b/src/transcript_handler.py
@@ -117,18 +117,28 @@ def upload_transcript(self, transcript: TranscriptData, key: str) -> bool:
 class TranscriptHandler:
     """Handles capturing and storing conversation transcripts."""
 
-    def __init__(self, room_name: str, s3_uploader: S3UploaderProtocol | None = None):
+    def __init__(
+        self,
+        room_name: str,
+        s3_uploader: S3UploaderProtocol | None = None,
+        session_id: str | None = None,
+    ):
         """Initialize the transcript handler.
 
         Args:
             room_name: Name of the LiveKit room
             s3_uploader: S3 uploader instance for storing transcripts
+            session_id: Unique session identifier for matching audio/transcript files
         """
         self.transcript = TranscriptData(
             room_name=room_name,
             session_start=datetime.now(timezone.utc).isoformat(),
         )
         self.s3_uploader = s3_uploader
+        # Use provided session_id or generate one
+        self.session_id = session_id or datetime.now(timezone.utc).strftime(
+            "%Y%m%d-%H%M%S"
+        )
 
     def add_user_transcript(self, text: str, is_final: bool = True) -> None:
         """Add a user speech transcript entry.
@@ -180,9 +190,8 @@ async def finalize_and_upload(self) -> bool:
             logger.warning("No S3 uploader configured, transcript not saved")
             return True
 
-        # Generate filename based on room name and timestamp
-        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
-        key = f"transcripts/{self.transcript.room_name}-{timestamp}.json"
+        # Use session_id for filename to match audio recording
+        key = f"transcripts/{self.transcript.room_name}-{self.session_id}.json"
 
         return self.s3_uploader.upload_transcript(self.transcript, key)