Skip to content

Commit bd8dbcb

Browse files
committed
ensure audio only and no connection to user camera
1 parent f99468c commit bd8dbcb

File tree

4 files changed

+124
-42
lines changed

4 files changed

+124
-42
lines changed

README.md

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,24 @@ This starter app is compatible with any [custom web/mobile frontend](https://doc
2525

2626
This project includes built-in support for:
2727

28-
- **Dual-channel audio recording** via LiveKit Egress (agent on one channel, user on the other)
28+
- **Audio recording** via LiveKit Egress (all participants mixed, or dual-channel with agent on one channel and user on the other)
2929
- **Real-time transcript capture** from STT output, saved as JSON
3030

31+
> **Note:** Audio recording via Egress only works in `dev` or `start` mode (connected to LiveKit Cloud). The `console` mode uses a mock room for local testing and cannot record audio. Transcripts are saved in all modes.
32+
3133
### S3 Output Structure
3234

33-
Recordings and transcripts are saved to S3:
35+
Recordings and transcripts are saved to S3 with matching session IDs for easy correlation:
3436

3537
```
3638
s3://audivi-audio-recordings/livekit-demos/
37-
├── audio/{room_name}-{time}.ogg # Dual-channel OGG audio
38-
├── audio/{room_name}-{time}.ogg.json # Egress manifest
39-
└── transcripts/{room_name}-{timestamp}.json # Conversation transcript
39+
├── audio/{room_name}-{session_id}.ogg # Audio recording (OGG format)
40+
├── audio/{room_name}-{session_id}.ogg.json # Egress manifest
41+
└── transcripts/{room_name}-{session_id}.json # Conversation transcript
4042
```
4143

44+
The `{session_id}` is a timestamp (`YYYYMMDD-HHMMSS`) generated when the session starts, making it easy to match audio recordings with their corresponding transcripts.
45+
4246
### AWS Configuration
4347

4448
Add these environment variables to your `.env.local`:
@@ -56,6 +60,21 @@ S3_BUCKET = "audivi-audio-recordings"
5660
S3_PREFIX = "livekit-demos"
5761
```
5862

63+
### Dual-Channel Audio
64+
65+
To enable dual-channel recording (agent audio on left channel, user audio on right channel), edit `src/egress_manager.py` and add the `audio_mixing` parameter:
66+
67+
```python
68+
info = await self.livekit_api.egress.start_room_composite_egress(
69+
egress_proto.RoomCompositeEgressRequest(
70+
room_name=room_name,
71+
audio_only=True,
72+
audio_mixing=egress_proto.AudioMixing.DUAL_CHANNEL_AGENT, # Add this line
73+
file_outputs=[file_output],
74+
)
75+
)
76+
```
77+
5978
## Coding agents and MCP
6079

6180
This project is designed to work with coding agents like [Cursor](https://www.cursor.com/) and [Claude Code](https://www.anthropic.com/claude-code).
@@ -123,12 +142,16 @@ Next, run this command to speak to your agent directly in your terminal:
123142
uv run python src/agent.py console
124143
```
125144

145+
> **Note:** Console mode is for local testing only. Audio recording is disabled (transcripts still work).
146+
126147
To run the agent for use with a frontend or telephony, use the `dev` command:
127148

128149
```console
129150
uv run python src/agent.py dev
130151
```
131152

153+
> This mode connects to LiveKit Cloud and enables full audio recording to S3.
154+
132155
In production, use the `start` command:
133156

134157
```console

src/agent.py

Lines changed: 73 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import asyncio
2+
from datetime import datetime, timezone
23

34
from dotenv import load_dotenv
45
from livekit import rtc
56
from livekit.agents import (
67
Agent,
78
AgentServer,
89
AgentSession,
10+
AutoSubscribe,
911
ConversationItemAddedEvent,
1012
JobContext,
1113
JobProcess,
@@ -73,9 +75,13 @@ async def my_agent(ctx: JobContext):
7375
}
7476

7577
room_name = ctx.room.name
76-
logger.info(f"=== Agent session handler called for room: {room_name} ===")
78+
# Generate a unique session ID for matching audio and transcript files
79+
session_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
80+
logger.info(
81+
f"=== Agent session handler called for room: {room_name}, session_id: {session_id} ==="
82+
)
7783

78-
# Initialize egress manager for dual-channel audio recording
84+
# Initialize egress manager for audio recording
7985
egress_manager = None
8086
try:
8187
logger.info("Initializing egress manager...")
@@ -99,6 +105,7 @@ async def my_agent(ctx: JobContext):
99105
transcript_handler = TranscriptHandler(
100106
room_name=room_name,
101107
s3_uploader=s3_uploader,
108+
session_id=session_id,
102109
)
103110
logger.info("Transcript handler initialized successfully")
104111
except Exception as e:
@@ -148,27 +155,47 @@ def on_conversation_item_added(event: ConversationItemAddedEvent):
148155

149156
# Handle session close to finalize and upload transcript
150157
@session.on("close")
151-
async def on_session_close(_event):
158+
def on_session_close(_event):
152159
"""Finalize transcript and clean up egress when session ends."""
153160
logger.info(f"Session closing for room {room_name}, saving transcript...")
154161

155-
# Upload transcript to S3
156-
if transcript_handler is not None:
157-
try:
158-
success = await transcript_handler.finalize_and_upload()
159-
if success:
160-
logger.info(f"Transcript saved for room {room_name}")
161-
else:
162-
logger.error(f"Failed to save transcript for room {room_name}")
163-
except Exception as e:
164-
logger.error(f"Error saving transcript: {e}")
165-
166-
# Clean up egress manager resources
167-
if egress_manager is not None:
168-
try:
169-
await egress_manager.close()
170-
except Exception as e:
171-
logger.error(f"Error closing egress manager: {e}")
162+
async def cleanup():
163+
# Stop egress recording - this triggers S3 upload of the audio file
164+
if egress_manager is not None:
165+
try:
166+
logger.info("Stopping egress recording...")
167+
stopped = await egress_manager.stop_recording()
168+
if stopped:
169+
logger.info(
170+
f"Egress recording stopped for room {room_name}, "
171+
f"audio uploaded to s3://{S3_BUCKET}/{S3_PREFIX}/"
172+
)
173+
else:
174+
logger.warning(
175+
f"Failed to stop egress recording for room {room_name}"
176+
)
177+
except Exception as e:
178+
logger.error(f"Error stopping egress recording: {e}")
179+
180+
# Upload transcript to S3
181+
if transcript_handler is not None:
182+
try:
183+
success = await transcript_handler.finalize_and_upload()
184+
if success:
185+
logger.info(f"Transcript saved for room {room_name}")
186+
else:
187+
logger.error(f"Failed to save transcript for room {room_name}")
188+
except Exception as e:
189+
logger.error(f"Error saving transcript: {e}")
190+
191+
# Clean up egress manager API client
192+
if egress_manager is not None:
193+
try:
194+
await egress_manager.close()
195+
except Exception as e:
196+
logger.error(f"Error closing egress manager: {e}")
197+
198+
asyncio.create_task(cleanup()) # noqa: RUF006
172199

173200
logger.info("Event handlers registered")
174201

@@ -196,6 +223,8 @@ async def on_session_close(_event):
196223
agent=Assistant(),
197224
room=ctx.room,
198225
room_options=room_io.RoomOptions(
226+
# Audio only - disable video input
227+
video_input=False,
199228
audio_input=room_io.AudioInputOptions(
200229
noise_cancellation=lambda params: noise_cancellation.BVCTelephony()
201230
if params.participant.kind == rtc.ParticipantKind.PARTICIPANT_KIND_SIP
@@ -205,29 +234,46 @@ async def on_session_close(_event):
205234
)
206235
logger.info("Session started successfully")
207236

208-
# Join the room and connect to the user
237+
# Join the room and connect to the user (audio only, no video)
209238
logger.info("Connecting to room...")
210-
await ctx.connect()
239+
await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
211240
logger.info("Connected to room successfully")
212241

213242
# Greet the user
214243
await session.say("Hello, how can I assist you?")
215244

216-
# Start dual-channel audio recording via egress (non-blocking, after room is active)
245+
# Start audio recording via egress (non-blocking, after room is active)
246+
# NOTE: Egress only works in 'dev' mode with a real LiveKit server, not in 'console' mode
217247
async def start_egress_background():
218248
"""Start egress recording in background so it doesn't block the agent."""
219249
if egress_manager is None:
220250
logger.warning("Egress manager not initialized, skipping recording")
221251
return
252+
253+
# Check if this is a mock room (console mode)
254+
if room_name == "mock_room" or room_name.startswith("FAKE_"):
255+
logger.warning(
256+
"Skipping egress recording - console mode uses a mock room. "
257+
"Run with 'dev' mode to enable audio recording."
258+
)
259+
return
260+
222261
try:
223-
logger.info("Starting egress recording in background...")
224-
egress_id = await egress_manager.start_dual_channel_recording(room_name)
262+
logger.info(
263+
f"Starting egress recording for room {room_name}, session_id={session_id}..."
264+
)
265+
egress_id = await egress_manager.start_dual_channel_recording(
266+
room_name, session_id
267+
)
225268
if egress_id:
226-
logger.info(f"Started dual-channel recording for room {room_name}")
269+
logger.info(
270+
f"Egress recording started for room {room_name}, "
271+
f"egress_id={egress_id}, session_id={session_id}"
272+
)
227273
else:
228274
logger.warning(
229275
f"Failed to start egress recording for room {room_name}, "
230-
"continuing without recording"
276+
"continuing without recording. Check AWS credentials and LiveKit egress config."
231277
)
232278
except Exception as e:
233279
logger.error(f"Error starting egress recording: {e}")

src/egress_manager.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,14 @@ def _create_s3_upload(self) -> egress_proto.S3Upload:
8989
region=self.config.aws_region,
9090
)
9191

92-
async def start_dual_channel_recording(self, room_name: str) -> str | None:
93-
"""Start dual-channel audio recording for a room.
94-
95-
The agent's audio will be on one channel, and all other participants
96-
(users) will be on the other channel.
92+
async def start_dual_channel_recording(
93+
self, room_name: str, session_id: str | None = None
94+
) -> str | None:
95+
"""Start audio recording for a room.
9796
9897
Args:
9998
room_name: Name of the LiveKit room to record
99+
session_id: Unique session identifier for matching audio/transcript files
100100
101101
Returns:
102102
Egress ID if started successfully, None on failure
@@ -111,10 +111,14 @@ async def start_dual_channel_recording(self, room_name: str) -> str | None:
111111
s3_upload = self._create_s3_upload()
112112

113113
# Build the filepath with prefix
114+
# Use session_id if provided for matching with transcript, otherwise use LiveKit's {time} placeholder
114115
filepath_prefix = (
115116
f"{self.config.s3_prefix}/audio" if self.config.s3_prefix else "audio"
116117
)
117-
filepath = f"{filepath_prefix}/{{room_name}}-{{time}}.ogg"
118+
if session_id:
119+
filepath = f"{filepath_prefix}/{room_name}-{session_id}.ogg"
120+
else:
121+
filepath = f"{filepath_prefix}/{{room_name}}-{{time}}.ogg"
118122

119123
file_output = egress_proto.EncodedFileOutput(
120124
filepath=filepath,

src/transcript_handler.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,28 @@ def upload_transcript(self, transcript: TranscriptData, key: str) -> bool:
117117
class TranscriptHandler:
118118
"""Handles capturing and storing conversation transcripts."""
119119

120-
def __init__(self, room_name: str, s3_uploader: S3UploaderProtocol | None = None):
120+
def __init__(
121+
self,
122+
room_name: str,
123+
s3_uploader: S3UploaderProtocol | None = None,
124+
session_id: str | None = None,
125+
):
121126
"""Initialize the transcript handler.
122127
123128
Args:
124129
room_name: Name of the LiveKit room
125130
s3_uploader: S3 uploader instance for storing transcripts
131+
session_id: Unique session identifier for matching audio/transcript files
126132
"""
127133
self.transcript = TranscriptData(
128134
room_name=room_name,
129135
session_start=datetime.now(timezone.utc).isoformat(),
130136
)
131137
self.s3_uploader = s3_uploader
138+
# Use provided session_id or generate one
139+
self.session_id = session_id or datetime.now(timezone.utc).strftime(
140+
"%Y%m%d-%H%M%S"
141+
)
132142

133143
def add_user_transcript(self, text: str, is_final: bool = True) -> None:
134144
"""Add a user speech transcript entry.
@@ -180,9 +190,8 @@ async def finalize_and_upload(self) -> bool:
180190
logger.warning("No S3 uploader configured, transcript not saved")
181191
return True
182192

183-
# Generate filename based on room name and timestamp
184-
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
185-
key = f"transcripts/{self.transcript.room_name}-{timestamp}.json"
193+
# Use session_id for filename to match audio recording
194+
key = f"transcripts/{self.transcript.room_name}-{self.session_id}.json"
186195

187196
return self.s3_uploader.upload_transcript(self.transcript, key)
188197

0 commit comments

Comments
 (0)