Skip to content

Commit e3e38c1

Browse files
committed
Add initial refactored version of livekit_voice_agent.py -- "coffee_barista_v2.py"
1 parent 5b7c394 commit e3e38c1

File tree

13 files changed

+968
-3
lines changed

13 files changed

+968
-3
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Agents for Coffee Voice Agent"""
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""Simple coffee agent with emotion-aware TTS processing"""
2+
3+
import logging
4+
from livekit.agents import Agent, function_tool
5+
from services.emotion_service import EmotionStateManager
6+
7+
from config.instructions import BARISTA_INSTRUCTIONS
8+
from tools.coffee_tools import (
9+
get_current_time_impl, get_current_date_impl, get_coffee_menu_impl,
10+
get_ordering_instructions_impl, recommend_drink_impl
11+
)
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
class SimpleCoffeeAgent(Agent):
17+
"""Coffee barista agent with emotion-aware TTS processing
18+
19+
This agent follows the proven pattern from the original implementation:
20+
- Standard LiveKit Agent with instructions and function tools (programmatic registration)
21+
- Override tts_node to handle emotion:text format processing
22+
- Clean separation of concerns with emotion service
23+
"""
24+
25+
def __init__(self, emotion_manager=None):
26+
# Use provided emotion manager or create a new one
27+
self.emotion_manager = emotion_manager or EmotionStateManager()
28+
29+
# Initialize with instructions and programmatically registered tools
30+
super().__init__(
31+
instructions=BARISTA_INSTRUCTIONS,
32+
tools=[
33+
function_tool(
34+
get_current_time_impl,
35+
name="get_current_time",
36+
description="Get the current time."
37+
),
38+
function_tool(
39+
get_current_date_impl,
40+
name="get_current_date",
41+
description="Get today's date."
42+
),
43+
function_tool(
44+
get_coffee_menu_impl,
45+
name="get_coffee_menu",
46+
description="Get the Sui Hub coffee menu."
47+
),
48+
function_tool(
49+
get_ordering_instructions_impl,
50+
name="get_ordering_instructions",
51+
description="Get instructions on how to order coffee through the Slush wallet and Coffee Hub website."
52+
),
53+
function_tool(
54+
recommend_drink_impl,
55+
name="recommend_drink",
56+
description="Recommend a drink based on user preference."
57+
),
58+
]
59+
)
60+
61+
logger.info("SimpleCoffeeAgent initialized with emotion-aware TTS processing and 5 programmatically registered tools")
62+
63+
async def tts_node(self, text, model_settings=None):
64+
"""Override TTS node to process emotion:text format (same pattern as original)"""
65+
66+
# Process text stream with minimal buffering for emotion extraction
67+
async def process_text_stream():
68+
first_chunk_buffer = ""
69+
emotion_extracted = False
70+
emotion_check_limit = 50 # Only check first 50 characters for emotion delimiter
71+
chunks_processed = 0
72+
73+
async for text_chunk in text:
74+
if not text_chunk:
75+
continue
76+
77+
chunks_processed += 1
78+
79+
# Only buffer and check for emotion in the very first chunk(s)
80+
if not emotion_extracted and len(first_chunk_buffer) < emotion_check_limit:
81+
first_chunk_buffer += text_chunk
82+
83+
# Check if we have delimiter in the buffered portion
84+
if ":" in first_chunk_buffer:
85+
logger.info("🔍 Found delimiter in first chunk(s)! Extracting emotion...")
86+
87+
# Process emotion using our emotion service
88+
emotion, text_after_delimiter = self.emotion_manager.process_emotional_response(first_chunk_buffer)
89+
90+
logger.info(f"🎭 Agent speaking with emotion: {emotion}")
91+
92+
# Mark emotion as extracted
93+
emotion_extracted = True
94+
95+
# Immediately yield the text part (no more buffering)
96+
if text_after_delimiter.strip():
97+
logger.info(f"💬 TTS streaming text immediately: {text_after_delimiter[:30]}{'...' if len(text_after_delimiter) > 30 else ''}")
98+
yield text_after_delimiter
99+
100+
elif len(first_chunk_buffer) >= emotion_check_limit:
101+
# Reached limit without finding delimiter - give up and stream everything
102+
logger.info("🔍 No delimiter found within limit, streaming everything with default emotion")
103+
104+
# Process with default emotion
105+
emotion, processed_text = self.emotion_manager.process_emotional_response(first_chunk_buffer)
106+
107+
emotion_extracted = True
108+
109+
# Yield the processed content immediately
110+
logger.info(f"💬 TTS fallback streaming: {processed_text[:30]}{'...' if len(processed_text) > 30 else ''}")
111+
yield processed_text
112+
113+
# If we haven't extracted emotion yet and haven't hit limit, continue buffering
114+
# (don't yield anything yet)
115+
116+
else:
117+
# Either emotion already extracted, or we're past the check limit
118+
# Stream everything immediately
119+
yield text_chunk
120+
121+
# Process the text stream and pass clean text to default TTS
122+
processed_text = process_text_stream()
123+
124+
# Use default TTS implementation with processed text
125+
async for audio_frame in Agent.default.tts_node(self, processed_text, model_settings):
126+
yield audio_frame
127+
128+
# Function tools are automatically discovered by LiveKit from the imported functions
129+
# No need to manually register them - the @function_tool decorators handle this
130+
131+
def get_emotion_manager(self):
132+
"""Get the emotion manager for external access"""
133+
return self.emotion_manager
134+
135+
def __repr__(self):
136+
return f"SimpleCoffeeAgent(tools=5, emotion={self.emotion_manager.get_current_emotion()}, registration=programmatic)"
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
#!/usr/bin/env python3
2+
"""Coffee Barista Voice Agent v2 - Clean modular implementation
3+
4+
This is a refactored version of the original livekit_voice_agent.py that:
5+
- Uses extracted services for wake word detection and order notifications
6+
- Uses a simple agent with function tools
7+
- Uses emotion-aware TTS processing
8+
- Maintains the same functionality with cleaner architecture
9+
"""
10+
11+
import asyncio
12+
import logging
13+
import os
14+
from datetime import datetime
15+
16+
from livekit import agents
17+
from livekit.agents import AgentSession, JobContext, WorkerOptions
18+
from livekit.plugins import openai, silero
19+
20+
# Import our extracted components
21+
from config.settings import REQUIRED_ENV_VARS, WEBSOCKET_HOST, WEBSOCKET_PORT
22+
from config.instructions import BARISTA_INSTRUCTIONS
23+
from agents.simple_coffee_agent import SimpleCoffeeAgent
24+
from services.emotion_service import EmotionStateManager
25+
from services.wake_word_service import WakeWordService
26+
from services.order_service import OrderNotificationService
27+
from utils.greeting_data import get_random_greeting
28+
from utils.announcement_data import format_virtual_request_announcement
29+
30+
# Configure logging
31+
logging.basicConfig(level=logging.INFO)
32+
logger = logging.getLogger(__name__)
33+
34+
35+
class CoffeeBaristaV2:
36+
"""Clean coffee barista implementation using extracted services
37+
38+
This version composes the extracted services to provide the same functionality
39+
as the original but with much cleaner architecture and separation of concerns.
40+
"""
41+
42+
def __init__(self):
43+
# Core components - emotion manager is handled by the agent now
44+
self.emotion_manager = EmotionStateManager()
45+
self.agent = SimpleCoffeeAgent(emotion_manager=self.emotion_manager)
46+
47+
# I/O Services
48+
self.wake_word_service = WakeWordService(on_wake_word_detected=self.on_wake_word_detected)
49+
self.order_service = OrderNotificationService(on_order_received=self.on_order_received)
50+
51+
# Session management
52+
self.current_session = None
53+
self.room = None
54+
55+
logger.info("CoffeeBaristaV2 initialized with modular architecture")
56+
57+
async def start(self, ctx: JobContext):
58+
"""Start the coffee barista with all services"""
59+
# Connect to the room
60+
await ctx.connect()
61+
self.room = ctx.room
62+
logger.info(f"Connected to room: {ctx.room.name}")
63+
64+
# Start I/O services
65+
await self.order_service.start()
66+
wake_word_started = await self.wake_word_service.start(ctx.room)
67+
68+
# Handle wake word vs always-on mode
69+
if not wake_word_started:
70+
logger.info("🔍 Starting in always-on mode (no wake word detection)")
71+
await self.start_conversation()
72+
else:
73+
logger.info("Started in wake word mode - say 'hey barista' to activate")
74+
75+
async def on_wake_word_detected(self, room):
76+
"""Handle wake word detection - start a conversation"""
77+
logger.info("🔍 Wake word detected - starting conversation")
78+
await self.start_conversation()
79+
80+
async def start_conversation(self):
81+
"""Start a conversation session with emotion-aware TTS"""
82+
try:
83+
# Create session with standard TTS (emotion processing happens in agent.tts_node)
84+
self.current_session = AgentSession(
85+
stt=openai.STT(model="whisper-1"),
86+
llm=openai.LLM(
87+
model="gpt-4o-mini",
88+
temperature=float(os.getenv("VOICE_AGENT_TEMPERATURE", "0.7"))
89+
),
90+
tts=openai.TTS(
91+
model="tts-1",
92+
voice=os.getenv("VOICE_AGENT_VOICE", "nova")
93+
),
94+
vad=silero.VAD.load(),
95+
)
96+
97+
# Start the session with our simple agent
98+
await self.current_session.start(
99+
room=self.room,
100+
agent=self.agent
101+
)
102+
103+
# Pause wake word detection during conversation
104+
if self.wake_word_service.is_active():
105+
self.wake_word_service.pause()
106+
107+
# Start with a random greeting
108+
greeting = get_random_greeting()
109+
emotion, text = self.emotion_manager.process_emotional_response(greeting)
110+
111+
# Use the session to say the greeting
112+
await self.current_session.say(text)
113+
114+
logger.info("🎉 Conversation started successfully")
115+
116+
except Exception as e:
117+
logger.error(f"Error starting conversation: {e}")
118+
await self.end_conversation()
119+
120+
async def on_order_received(self, order_info):
121+
"""Handle order notifications from WebSocket"""
122+
try:
123+
# Format the order announcement using our utility
124+
announcement = format_virtual_request_announcement({
125+
"type": order_info["type"],
126+
"content": order_info["content"]
127+
})
128+
129+
# Process emotion and announce if we have an active session
130+
if self.current_session:
131+
emotion, text = self.emotion_manager.process_emotional_response(announcement)
132+
await self.current_session.say(text)
133+
logger.info(f"📢 Announced order: {order_info['coffee_type']}")
134+
else:
135+
logger.info(f"📋 Order received but no active session: {order_info['coffee_type']}")
136+
137+
except Exception as e:
138+
logger.error(f"Error processing order notification: {e}")
139+
140+
async def end_conversation(self):
141+
"""End the current conversation and return to dormant state"""
142+
if self.current_session:
143+
try:
144+
await self.current_session.aclose()
145+
except Exception as e:
146+
logger.error(f"Error closing session: {e}")
147+
finally:
148+
self.current_session = None
149+
150+
# Resume wake word detection
151+
if self.wake_word_service.is_active():
152+
self.wake_word_service.resume()
153+
154+
# Reset emotion state for next conversation
155+
self.emotion_manager.reset_emotion_state()
156+
157+
logger.info("🔍 Conversation ended - returned to dormant state")
158+
159+
def stop(self):
160+
"""Stop all services and clean up resources"""
161+
logger.info("🛑 Stopping Coffee Barista v2...")
162+
163+
# Stop I/O services
164+
self.wake_word_service.stop()
165+
self.order_service.stop()
166+
167+
# Close any active session
168+
if self.current_session:
169+
# Note: In a real implementation, this would need proper async cleanup
170+
logger.info("Closing active session...")
171+
172+
logger.info("✅ Coffee Barista v2 stopped")
173+
174+
def __repr__(self):
175+
return f"CoffeeBaristaV2(session_active={self.current_session is not None}, emotion={self.emotion_manager.get_current_emotion()})"
176+
177+
178+
async def entrypoint(ctx: JobContext):
179+
"""Main entrypoint for the coffee barista agent v2"""
180+
barista = CoffeeBaristaV2()
181+
await barista.start(ctx)
182+
183+
184+
def main():
185+
"""Main function with environment validation and startup"""
186+
# Validate required environment variables
187+
missing_vars = [var for var in REQUIRED_ENV_VARS if not os.getenv(var)]
188+
189+
if missing_vars:
190+
logger.error(f"Missing required environment variables: {missing_vars}")
191+
logger.error("Please check your .env file and ensure OPENAI_API_KEY is set.")
192+
exit(1)
193+
194+
# Log configuration
195+
logger.info("☕ Starting Coffee Barista Voice Agent v2...")
196+
logger.info(f"Wake Word Detection: {'✅ Enabled' if os.getenv('PORCUPINE_ACCESS_KEY') else '❌ Disabled (always-on mode)'}")
197+
logger.info(f"WebSocket Server: ✅ Enabled on {WEBSOCKET_HOST}:{WEBSOCKET_PORT}")
198+
logger.info(f"OpenAI Model: gpt-4o-mini")
199+
logger.info(f"Voice: {os.getenv('VOICE_AGENT_VOICE', 'nova')}")
200+
logger.info(f"Temperature: {os.getenv('VOICE_AGENT_TEMPERATURE', '0.7')}")
201+
logger.info(f"Architecture: 🏗️ Modular (extracted services)")
202+
203+
logger.info("\n📋 Available CLI modes:")
204+
logger.info(" python coffee_barista_v2.py console - Terminal mode (local testing)")
205+
logger.info(" python coffee_barista_v2.py dev - Development mode (connect to LiveKit)")
206+
logger.info(" python coffee_barista_v2.py start - Production mode")
207+
208+
# Run the agent
209+
agents.cli.run_app(
210+
WorkerOptions(
211+
entrypoint_fnc=entrypoint,
212+
agent_name="coffee-barista-v2"
213+
)
214+
)
215+
216+
217+
if __name__ == "__main__":
218+
main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Services for Coffee Voice Agent"""

0 commit comments

Comments
 (0)